[PATCH 0/2] hw/block/nvme: oncs and write uncorrectable support

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

First, add support for toggling optional features through the new `oncs`
nvme device parameter.

Secondly, add support for the Write Uncorrectable command.

Gollu Appalanaidu (2):
  hw/block/nvme: add oncs device parameter
  hw/block/nvme: add write uncorrectable command

 docs/specs/nvme.txt   |   3 +
 hw/block/nvme-ns.h|   2 +
 hw/block/nvme.h   |   8 ++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 166 +++---
 hw/block/trace-events |   1 +
 6 files changed, 140 insertions(+), 42 deletions(-)

-- 
2.30.0




[PATCH 1/2] hw/block/nvme: add oncs device parameter

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Add the 'oncs' nvme device parameter to allow optional features to be
enabled/disabled explicitly. Since most of these are optional commands,
make the CSE log pages dynamic to account for the value of ONCS.

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h |   7 
 hw/block/nvme.c | 101 
 2 files changed, 74 insertions(+), 34 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index cb2b5175f1a1..98082b2dfba3 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -9,6 +9,7 @@
 
 #define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+#define NVME_MAX_COMMANDS 0x100
 
 typedef struct NvmeParams {
 char *serial;
@@ -22,6 +23,7 @@ typedef struct NvmeParams {
 bool use_intel_id;
 uint32_t zasl_bs;
 bool legacy_cmb;
+uint16_t oncs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -183,6 +185,11 @@ typedef struct NvmeCtrl {
 NvmeCQueue  admin_cq;
 NvmeIdCtrl  id_ctrl;
 NvmeFeatureVal  features;
+
+struct {
+uint32_t nvm[NVME_MAX_COMMANDS];
+uint32_t zoned[NVME_MAX_COMMANDS];
+} iocs;
 } NvmeCtrl;
 
 static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 93345bf3c1fc..e5f725d7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -71,6 +71,11 @@
  *   data size being in effect. By setting this property to 0, users can make
  *   ZASL to be equal to MDTS. This property only affects zoned namespaces.
  *
+ * - `oncs`
+ *   This field indicates the optional NVM commands and features supported
+ *   by the controller. To add support for an optional feature, set the
+ *   corresponding support bit.
+ *
  * nvme namespace device parameters
  * 
  * - `subsys`
@@ -165,7 +170,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
-static const uint32_t nvme_cse_acs[256] = {
+static const uint32_t nvme_cse_acs[NVME_MAX_COMMANDS] = {
 [NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
@@ -178,30 +183,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
 };
 
-static const uint32_t nvme_cse_iocs_none[256];
-
-static const uint32_t nvme_cse_iocs_nvm[256] = {
-[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
-[NVME_CMD_DSM]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COMPARE]  = NVME_CMD_EFF_CSUPP,
-};
-
-static const uint32_t nvme_cse_iocs_zoned[256] = {
-[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
-[NVME_CMD_DSM]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_COMPARE]  = NVME_CMD_EFF_CSUPP,
-[NVME_CMD_ZONE_APPEND]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_ZONE_MGMT_SEND]   = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
-[NVME_CMD_ZONE_MGMT_RECV]   = NVME_CMD_EFF_CSUPP,
-};
+static const uint32_t nvme_cse_iocs_none[NVME_MAX_COMMANDS];
 
 static void nvme_process_sq(void *opaque);
 
@@ -2884,17 +2866,17 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t 
csi, uint32_t buf_len,
 
 switch (NVME_CC_CSS(n->bar.cc)) {
 case NVME_CC_CSS_NVM:
-src_iocs = nvme_cse_iocs_nvm;
+src_iocs = n->iocs.nvm;
 /* fall through */
 case NVME_CC_CSS_ADMIN_ONLY:
 break;
 case NVME_CC_CSS_CSI:
 switch (csi) {
 case NVME_CSI_NVM:
-src_iocs = nvme_cse_iocs_nvm;
+src_iocs = n->iocs.nvm;
 break;
 case NVME_CSI_ZONED:
-src_iocs = nvme_cse_iocs_zoned;
+src_iocs = n->iocs.zoned;
 break;
 }
 }
@@ -3422,6 +3404,10 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(le16_to_cpu(n->id_ctrl.oncs) & NVME_ONCS_FEATURES) && sel) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
 if (!nvme_nsid_v

[PATCH 2/2] hw/block/nvme: add write uncorrectable command

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Add support for marking blocks invalid with the Write Uncorrectable
command. Block status is tracked in a (non-persistent) bitmap that is
checked on all reads and written to on all writes. This is potentially
expensive, so keep Write Uncorrectable disabled by default.

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---
 docs/specs/nvme.txt   |  3 ++
 hw/block/nvme-ns.h|  2 ++
 hw/block/nvme.h   |  1 +
 hw/block/nvme-ns.c|  2 ++
 hw/block/nvme.c   | 65 +--
 hw/block/trace-events |  1 +
 6 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
index 56d393884e7a..88f9cc278d4c 100644
--- a/docs/specs/nvme.txt
+++ b/docs/specs/nvme.txt
@@ -19,5 +19,8 @@ Known issues
 
 * The accounting numbers in the SMART/Health are reset across power cycles
 
+* Marking blocks invalid with the Write Uncorrectable command is not persisted
+  across power cycles.
+
 * Interrupt Coalescing is not supported and is disabled by default in violation
   of the specification.
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7af6884862b5..15fa422ded03 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -72,6 +72,8 @@ typedef struct NvmeNamespace {
 struct {
 uint32_t err_rec;
 } features;
+
+unsigned long *uncorrectable;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 98082b2dfba3..9b8f85b9cf16 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -68,6 +68,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
 case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
 case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
+case NVME_CMD_WRITE_UNCOR:  return "NVME_CMD_WRITE_UNCOR";
 case NVME_CMD_COMPARE:  return "NVME_NVM_CMD_COMPARE";
 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
 case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ade46e2f3739..742bbc4b4b62 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -72,6 +72,8 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 id_ns->mcl = cpu_to_le32(ns->params.mcl);
 id_ns->msrc = ns->params.msrc;
 
+ns->uncorrectable = bitmap_new(id_ns->nsze);
+
 return 0;
 }
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e5f725d7..56048046c193 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1112,6 +1112,20 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, 
uint64_t slba,
 return NVME_SUCCESS;
 }
 
+static inline uint16_t nvme_check_uncor(NvmeNamespace *ns, uint64_t slba,
+uint32_t nlb)
+{
+uint64_t elba = nlb + slba;
+
+if (ns->uncorrectable) {
+if (find_next_bit(ns->uncorrectable, elba, slba) < elba) {
+return NVME_UNRECOVERED_READ | NVME_DNR;
+}
+}
+
+return NVME_SUCCESS;
+}
+
 static void nvme_aio_err(NvmeRequest *req, int ret)
 {
 uint16_t status = NVME_SUCCESS;
@@ -1423,14 +1437,24 @@ static void nvme_rw_cb(void *opaque, int ret)
 BlockAcctCookie *acct = &req->acct;
 BlockAcctStats *stats = blk_get_stats(blk);
 
+bool is_write = nvme_is_write(req);
+
 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
-if (ns->params.zoned && nvme_is_write(req)) {
+if (ns->params.zoned && is_write) {
 nvme_finalize_zoned_write(ns, req);
 }
 
 if (!ret) {
 block_acct_done(stats, acct);
+
+if (is_write) {
+NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
+
+bitmap_clear(ns->uncorrectable, slba, nlb);
+}
 } else {
 block_acct_failed(stats, acct);
 nvme_aio_err(req, ret);
@@ -1521,13 +1545,13 @@ static void nvme_copy_cb(void *opaque, int ret)
 {
 NvmeRequest *req = opaque;
 NvmeNamespace *ns = req->ns;
+NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
+uint64_t sdlba = le64_to_cpu(copy->sdlba);
 struct nvme_copy_ctx *ctx = req->opaque;
 
 trace_pci_nvme_copy_cb(nvme_cid(req));
 
 if (ns->params.zoned) {
-NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
-uint64_t sdlba = le64_to_cpu(copy->sdlba);
 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
 
 __nvme_advance_zone_wp(ns, zone, ctx->nlb);
@@ -1535,6 +1559,7 @@ static void nvme_copy_cb(void *opaque, int ret)
 
 if (!ret) {
 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
+bitmap_clear(ns->uncorrectable, sdlba, ctx->nlb);
 } else {
 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
 nvme_aio_err(req, ret);
@@ -1953,6 +1978,12 @@ static uint16_t n

Re: [PATCH] hw/block/nvme: add broadcast nsid support flush command

2021-02-09 Thread Klaus Jensen
On Feb 10 12:32, Keith Busch wrote:
> On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote:
> > On Feb  9 03:59, Keith Busch wrote:
> > > This whole implementation would be much simpler with the synchronous
> > > blk_flush() routine instead of the AIO equivalent. This is not really a
> > > performant feature, so I don't think it's critical to get these
> > > operations happening in parallel. What do you think?
> > 
> > It would definitely be simpler, but I believe that if there is a lot to
> > flush, then we won't just block the nvme device. We are holding the Big
> > QEMU Lock and will block most other devices as well.
> 
> Hm, I feel like you may have told me this same explanation for a
> different patch. :) Okay, I'm convinced: this is the way.
> 

Is that an Acked-by? ;)

And yes, I might have used that argument for Copy, can't remember ;)


signature.asc
Description: PGP signature


Re: [PATCH] hw/block/nvme: improve invalid zasl value reporting

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 8:39 PM, Dmitry Fomichev wrote:
> On Mon, 2021-02-08 at 09:25 +0100, Klaus Jensen wrote:
>> From: Klaus Jensen 
>>
>> The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so
>> improve the user experience by adding an early parameter check in
>> nvme_check_constraints.
>>
>> When ZASL is still too small due to the host configuring the device for
>> an even larger page size, convert the trace point in nvme_start_ctrl to
>> an NVME_GUEST_ERR such that this is logged by QEMU instead of only
>> traced.
>>
>> Reported-by: "i...@dantalion.nl" 

Apparently the reporter signed 'Corne'.

>> Cc: Dmitry Fomichev 
>> Signed-off-by: Klaus Jensen 
>> ---
>>  hw/block/nvme.c | 12 ++--
>>  1 file changed, 10 insertions(+), 2 deletions(-)
>>
>> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
>> index c2f0c88fbf39..d96888cd2333 100644
>> --- a/hw/block/nvme.c
>> +++ b/hw/block/nvme.c
>> @@ -3983,8 +3983,10 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>>  n->zasl = n->params.mdts;
>>  } else {
>>  if (n->params.zasl_bs < n->page_size) {
>> -trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs,
>> -n->page_size);
>> +NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small,
>> +   "Zone Append Size Limit (ZASL) of %d bytes is 
>> too "
>> +   "small; must be at least %d bytes",
>> +   n->params.zasl_bs, n->page_size);
>>  return -1;
>>  }
>>  n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size);
>> @@ -4503,6 +4505,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
>> **errp)
>>  error_setg(errp, "zone append size limit has to be a power of 
>> 2");
>>  return;
>>  }
>> +
>> +if (n->params.zasl_bs < 4096) {
>> +error_setg(errp, "zone append size limit must be at least "
>> +   "4096 bytes");
>> +return;
>> +}
>>  }
>>  }
> 
> The guest error is less confusing than simply a trace. LGTM.

Trace events are meant for the developers when debugging, they
are usually stripped out in final build.

Errors are reported to the user / operator (i.e. incorrect
configuration).

Regards,

Phil.




Re: [PATCH] hw/block/nvme: add broadcast nsid support flush command

2021-02-09 Thread Keith Busch
On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote:
> On Feb  9 03:59, Keith Busch wrote:
> > This whole implementation would be much simpler with the synchronous
> > blk_flush() routine instead of the AIO equivalent. This is not really a
> > performant feature, so I don't think it's critical to get these
> > operations happening in parallel. What do you think?
> 
> It would definitely be simpler, but I believe that if there is a lot to
> flush, then we won't just block the nvme device. We are holding the Big
> QEMU Lock and will block most other devices as well.

Hm, I feel like you may have told me this same explanation for a
different patch. :) Okay, I'm convinced: this is the way.



Re: [PATCH v5 0/9] block: Add retry for werror=/rerror= mechanism

2021-02-09 Thread Jiahui Cen
Kindly ping.
Any comments and reviews are welcome :)

Thanks,
Jiahui

On 2021/2/5 18:13, Jiahui Cen wrote:
> A VM in the cloud environment may use a virtual disk as the backend storage,
> and there are usually filesystems on the virtual block device. When backend
> storage is temporarily down, any I/O issued to the virtual block device
> will cause an error. For example, an error occurring in an ext4 filesystem
> would make the filesystem readonly. In production environment, a cloud backend
> storage can be soon recovered. For example, an IP-SAN may be down due to
> network failure and will be online soon after network is recovered. However,
> the error in the filesystem may not be recovered unless a device reattach
> or system restart. Thus an I/O retry mechanism is in need to implement a
> self-healing system.
> 
> This patch series propose to extend the werror=/rerror= mechanism to add
> a 'retry' feature. It can automatically retry failed I/O requests on error
> without sending error back to guest, and guest can get back running smoothly
> when I/O is recovered.
> 
> v4->v5:
> * Add document for 'retry' in qapi.
> * Support werror=/rerror=retry for scsi-disk.
> * Pause retry when draining.
> 
> v3->v4:
> * Adapt to werror=/rerror= mechanism.
> 
> v2->v3:
> * Add a doc to describe I/O hang.
> 
> v1->v2:
> * Rebase to fix compile problems.
> * Fix incorrect remove of rehandle list.
> * Provide rehandle pause interface.
> 
> REF: https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg06560.html
> 
> Jiahui Cen (9):
>   qapi/block-core: Add retry option for error action
>   block-backend: Introduce retry timer
>   block-backend: Add device specific retry callback
>   block-backend: Enable retry action on errors
>   block-backend: Add timeout support for retry
>   block: Add error retry param setting
>   virtio_blk: Add support for retry on errors
>   scsi-bus: Refactor the code that retries requests
>   scsi-disk: Add support for retry on errors
> 
>  block/block-backend.c  | 68 
>  blockdev.c | 52 +++
>  hw/block/block.c   | 10 +++
>  hw/block/virtio-blk.c  | 21 +-
>  hw/scsi/scsi-bus.c | 16 +++--
>  hw/scsi/scsi-disk.c| 16 +
>  include/hw/block/block.h   |  7 +-
>  include/hw/scsi/scsi.h |  1 +
>  include/sysemu/block-backend.h | 10 +++
>  qapi/block-core.json   |  9 ++-
>  10 files changed, 199 insertions(+), 11 deletions(-)
> 



Re: [PATCH] hw/block/nvme: improve invalid zasl value reporting

2021-02-09 Thread Dmitry Fomichev
On Mon, 2021-02-08 at 09:25 +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> The Zone Append Size Limit (ZASL) must be at least 4096 bytes, so
> improve the user experience by adding an early parameter check in
> nvme_check_constraints.
> 
> When ZASL is still too small due to the host configuring the device for
> an even larger page size, convert the trace point in nvme_start_ctrl to
> an NVME_GUEST_ERR such that this is logged by QEMU instead of only
> traced.
> 
> Reported-by: "i...@dantalion.nl" 
> Cc: Dmitry Fomichev 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index c2f0c88fbf39..d96888cd2333 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -3983,8 +3983,10 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>  n->zasl = n->params.mdts;
>  } else {
>  if (n->params.zasl_bs < n->page_size) {
> -trace_pci_nvme_err_startfail_zasl_too_small(n->params.zasl_bs,
> -n->page_size);
> +NVME_GUEST_ERR(pci_nvme_err_startfail_zasl_too_small,
> +   "Zone Append Size Limit (ZASL) of %d bytes is too 
> "
> +   "small; must be at least %d bytes",
> +   n->params.zasl_bs, n->page_size);
>  return -1;
>  }
>  n->zasl = 31 - clz32(n->params.zasl_bs / n->page_size);
> @@ -4503,6 +4505,12 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
> **errp)
>  error_setg(errp, "zone append size limit has to be a power of 
> 2");
>  return;
>  }
> +
> +if (n->params.zasl_bs < 4096) {
> +error_setg(errp, "zone append size limit must be at least "
> +   "4096 bytes");
> +return;
> +}
>  }
>  }

The guest error is less confusing than simply a trace. LGTM.
Reviewed-by: Dmitry Fomichev 

>  
> 
> 
> 



Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

09.02.2021 21:41, Denis V. Lunev wrote:

On 2/9/21 9:36 PM, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 19:39, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 17:47, Max Reitz wrote:

On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 16:25, Max Reitz wrote:

On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

I know, I have several series waiting for a resend, but I had to
switch
to another task spawned from our customer's bug.

Original problem: we use O_DIRECT for all vm images in our
product, it's
the policy. The only exclusion is backup target qcow2 image for
compressed backup, because compressed backup is extremely slow with
O_DIRECT (due to unaligned writes). Customer complains that backup
produces a lot of pagecache.

So we can either implement some internal cache or use fadvise
somehow.
Backup has several async workers, which write simultaneously, so
in both
ways we have to track host cluster filling (before dropping the
cache
corresponding to the cluster).  So, if we have to track anyway,
let's
try to implement the cache.


I wanted to be excited here, because that sounds like it would be
very easy to implement caching.  Like, just keep the cluster at
free_byte_offset cached until the cluster it points to changes,
then flush the cluster.


The problem is that chunks are written asynchronously.. That's why
this all is not so easy.



But then I see like 900 new lines of code, and I’m much less
excited...


Idea is simple: cache small unaligned write and flush the cluster
when
filled.

Performance result is very good (results in a table is time of
compressed backup of 1000M disk filled with ones in seconds):


“Filled with ones” really is an edge case, though.


Yes, I think, all clusters are compressed to rather small chunks :)




---  ---  ---
   backup(old)  backup(new)
ssd:hdd(direct)  3e+02    4.4
  -99%
ssd:hdd(cached)  5.7  5.4
  -5%
---  ---  ---

So, we have benefit even for cached mode! And the fastest thing is
O_DIRECT with new implemented cache. So, I suggest to enable the new
cache by default (which is done by the series).


First, I’m not sure how O_DIRECT really is relevant, because I
don’t really see the point for writing compressed images.


compressed backup is a point


(Perhaps irrelevant, but just to be clear:) I meant the point of
using O_DIRECT, which one can decide to not use for backup targets
(as you have done already).


Second, I find it a bit cheating if you say there is a huge
improvement for the no-cache case, when actually, well, you just
added a cache.  So the no-cache case just became faster because
there is a cache now.


Still, performance comparison is relevant to show that O_DIRECT as
is unusable for compressed backup.


(Again, perhaps irrelevant, but:) Yes, but my first point was
exactly whether O_DIRECT is even relevant for writing compressed
images.


Well, I suppose I could follow that if O_DIRECT doesn’t make much
sense for compressed images, qemu’s format drivers are free to
introduce some caching (because technically the cache.direct
option only applies to the protocol driver) for collecting
compressed writes.


Yes I thought in this way, enabling the cache by default.


That conclusion makes both of my complaints kind of moot.

*shrug*

Third, what is the real-world impact on the page cache?  You
described that that’s the reason why you need the cache in qemu,
because otherwise the page cache is polluted too much.  How much
is the difference really?  (I don’t know how good the compression
ratio is for real-world images.)


Hm. I don't know the ratio.. Customer reported that most of RAM is
polluted by Qemu's cache, and we use O_DIRECT for everything except
for target of compressed backup.. Still the pollution may relate to
several backups and of course it is simple enough to drop the cache
after each backup. But I think that even one backup of 16T disk may
pollute RAM enough.


Oh, sorry, I just realized I had a brain fart there.  I was
referring to whether this series improves the page cache pollution.
But obviously it will if it allows you to re-enable O_DIRECT.


Related to that, I remember a long time ago we had some discussion
about letting qemu-img convert set a special cache mode for the
target image that would make Linux drop everything before the last
offset written (i.e., I suppose fadvise() with
POSIX_FADV_SEQUENTIAL).  You discard that idea based on the fact
that implementing a cache in qemu would be simple, but it isn’t,
really.  What would the impact of POSIX_FADV_SEQUENTIAL be?  (One
advantage of using that would be that we could reuse it for
non-compressed images that are written by backup or qemu-img
convert.)


The problem is that writes are async. And therefore, not sequential.


In theory, yes, but all compressed writes still goes through
qcow2_alloc_byt

Re: [PATCH] iotests: Fix unsupported_imgopts for refcount_bits

2021-02-09 Thread Eric Blake
On 2/9/21 12:27 PM, Max Reitz wrote:
> Many _unsupported_imgopts lines for refcount_bits values use something
> like "refcount_bits=1[^0-9]" to forbid everything but "refcount_bits=1"
> (e.g. "refcount_bits=16" is allowed).
> 
> That does not work when $IMGOPTS does not have any entry past the
> refcount_bits option, which now became apparent with the "check" script
> rewrite.
> 
> Use \b instead of [^0-9] to check for a word boundary, which is what we
> really want.

\b is a Linux-ism (that is, glibc supports it, but BSD libc does not).

https://mail-index.netbsd.org/tech-userlevel/2012/12/02/msg006954.html


> 
> Signed-off-by: Max Reitz 
> ---
> Reproducible with:
> $ ./check -qcow2 -o refcount_bits=1
> (The tests touched here should be skipped)
> 
> I don't know whether \b is portable.  I hope it is.
> (This is why I CC-ed you, Eric.)

No, it's not portable.  \> and [[:>:]] are other spellings for the same
task, equally non-portable.

> 
> Then again, it appears that nobody ever runs the iotests with
> refcount_bits=1 but me, and I do that on Linux.  So even if it isn't
> portable, it shouldn't be an issue in practice... O:)

What exactly is failing?  Is it merely a case of our python script
running the regex against "${unsupported_imgopts}" instead of
"${unsupported_imgsopts} " with an added trailing space to guarantee
that we have something to match against?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Denis V. Lunev
On 2/9/21 9:36 PM, Vladimir Sementsov-Ogievskiy wrote:
> 09.02.2021 19:39, Vladimir Sementsov-Ogievskiy wrote:
>> 09.02.2021 17:47, Max Reitz wrote:
>>> On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote:
 09.02.2021 16:25, Max Reitz wrote:
> On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:
>> Hi all!
>>
>> I know, I have several series waiting for a resend, but I had to
>> switch
>> to another task spawned from our customer's bug.
>>
>> Original problem: we use O_DIRECT for all vm images in our
>> product, it's
>> the policy. The only exclusion is backup target qcow2 image for
>> compressed backup, because compressed backup is extremely slow with
>> O_DIRECT (due to unaligned writes). Customer complains that backup
>> produces a lot of pagecache.
>>
>> So we can either implement some internal cache or use fadvise
>> somehow.
>> Backup has several async workers, which write simultaneously, so
>> in both
>> ways we have to track host cluster filling (before dropping the
>> cache
>> corresponding to the cluster).  So, if we have to track anyway,
>> let's
>> try to implement the cache.
>
> I wanted to be excited here, because that sounds like it would be
> very easy to implement caching.  Like, just keep the cluster at
> free_byte_offset cached until the cluster it points to changes,
> then flush the cluster.

 The problem is that chunks are written asynchronously.. That's why
 this all is not so easy.

>
> But then I see like 900 new lines of code, and I’m much less
> excited...
>
>> Idea is simple: cache small unaligned write and flush the cluster
>> when
>> filled.
>>
>> Performance result is very good (results in a table is time of
>> compressed backup of 1000M disk filled with ones in seconds):
>
> “Filled with ones” really is an edge case, though.

 Yes, I think, all clusters are compressed to rather small chunks :)

>
>> ---  ---  ---
>>   backup(old)  backup(new)
>> ssd:hdd(direct)  3e+02    4.4
>>  -99%
>> ssd:hdd(cached)  5.7  5.4
>>  -5%
>> ---  ---  ---
>>
>> So, we have benefit even for cached mode! And the fastest thing is
>> O_DIRECT with new implemented cache. So, I suggest to enable the new
>> cache by default (which is done by the series).
>
> First, I’m not sure how O_DIRECT really is relevant, because I
> don’t really see the point for writing compressed images.

 compressed backup is a point
>>>
>>> (Perhaps irrelevant, but just to be clear:) I meant the point of
>>> using O_DIRECT, which one can decide to not use for backup targets
>>> (as you have done already).
>>>
> Second, I find it a bit cheating if you say there is a huge
> improvement for the no-cache case, when actually, well, you just
> added a cache.  So the no-cache case just became faster because
> there is a cache now.

 Still, performance comparison is relevant to show that O_DIRECT as
 is unusable for compressed backup.
>>>
>>> (Again, perhaps irrelevant, but:) Yes, but my first point was
>>> exactly whether O_DIRECT is even relevant for writing compressed
>>> images.
>>>
> Well, I suppose I could follow that if O_DIRECT doesn’t make much
> sense for compressed images, qemu’s format drivers are free to
> introduce some caching (because technically the cache.direct
> option only applies to the protocol driver) for collecting
> compressed writes.

 Yes I thought in this way, enabling the cache by default.

> That conclusion makes both of my complaints kind of moot.
>
> *shrug*
>
> Third, what is the real-world impact on the page cache?  You
> described that that’s the reason why you need the cache in qemu,
> because otherwise the page cache is polluted too much.  How much
> is the difference really?  (I don’t know how good the compression
> ratio is for real-world images.)

 Hm. I don't know the ratio.. Customer reported that most of RAM is
 polluted by Qemu's cache, and we use O_DIRECT for everything except
 for target of compressed backup.. Still the pollution may relate to
 several backups and of course it is simple enough to drop the cache
 after each backup. But I think that even one backup of 16T disk may
 pollute RAM enough.
>>>
>>> Oh, sorry, I just realized I had a brain fart there.  I was
>>> referring to whether this series improves the page cache pollution. 
>>> But obviously it will if it allows you to re-enable O_DIRECT.
>>>
> Related to that, I remember a long time ago we had some discussion
> about letting qemu-img convert set a special cache mode for the
> target im

Re: [PATCH] iotests/210: Fix reference output

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

09.02.2021 21:19, Max Reitz wrote:

Commit 69b55e03f has changed an error message, adjust the reference
output to account for it.

Fixes: 69b55e03f7e65a36eb954d0b7d4698b258df2708
("block: refactor bdrv_check_request: add errp")
Signed-off-by: Max Reitz 


Reviewed-by: Vladimir Sementsov-Ogievskiy 


---
Fun fact: The branch name "fix-210-v1" was already taken for
8ba9c4d9b088d66aebfcb019f61ddc36fba2db88, which was only two months
ago.  Ah, well. :)


Me again :( Hmm. I should definitely start running iotests with -luks, not only 
my favorite -raw, -qcow2, -nbd. Sorry.


---
  tests/qemu-iotests/210.out | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/210.out b/tests/qemu-iotests/210.out
index dc1a3c9786..2e9fc596eb 100644
--- a/tests/qemu-iotests/210.out
+++ b/tests/qemu-iotests/210.out
@@ -182,7 +182,7 @@ Job failed: The requested file size is too large
  === Resize image with invalid sizes ===
  
  {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 9223372036854775296}}

-{"error": {"class": "GenericError", "desc": "Required too big image size, it must 
be not greater than 9223372035781033984"}}
+{"error": {"class": "GenericError", "desc": "offset(9223372036854775296) exceeds 
maximum(9223372035781033984)"}}
  {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 
9223372036854775808}}
  {"error": {"class": "GenericError", "desc": "Invalid parameter type for 'size', 
expected: integer"}}
  {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 
18446744073709551104}}




--
Best regards,
Vladimir



Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

09.02.2021 19:39, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 17:47, Max Reitz wrote:

On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 16:25, Max Reitz wrote:

On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

I know, I have several series waiting for a resend, but I had to switch
to another task spawned from our customer's bug.

Original problem: we use O_DIRECT for all vm images in our product, it's
the policy. The only exclusion is backup target qcow2 image for
compressed backup, because compressed backup is extremely slow with
O_DIRECT (due to unaligned writes). Customer complains that backup
produces a lot of pagecache.

So we can either implement some internal cache or use fadvise somehow.
Backup has several async workers, which write simultaneously, so in both
ways we have to track host cluster filling (before dropping the cache
corresponding to the cluster).  So, if we have to track anyway, let's
try to implement the cache.


I wanted to be excited here, because that sounds like it would be very easy to 
implement caching.  Like, just keep the cluster at free_byte_offset cached 
until the cluster it points to changes, then flush the cluster.


The problem is that chunks are written asynchronously.. That's why this all is 
not so easy.



But then I see like 900 new lines of code, and I’m much less excited...


Idea is simple: cache small unaligned write and flush the cluster when
filled.

Performance result is very good (results in a table is time of
compressed backup of 1000M disk filled with ones in seconds):


“Filled with ones” really is an edge case, though.


Yes, I think, all clusters are compressed to rather small chunks :)




---  ---  ---
  backup(old)  backup(new)
ssd:hdd(direct)  3e+02    4.4
 -99%
ssd:hdd(cached)  5.7  5.4
 -5%
---  ---  ---

So, we have benefit even for cached mode! And the fastest thing is
O_DIRECT with new implemented cache. So, I suggest to enable the new
cache by default (which is done by the series).


First, I’m not sure how O_DIRECT really is relevant, because I don’t really see 
the point for writing compressed images.


compressed backup is a point


(Perhaps irrelevant, but just to be clear:) I meant the point of using 
O_DIRECT, which one can decide to not use for backup targets (as you have done 
already).


Second, I find it a bit cheating if you say there is a huge improvement for the 
no-cache case, when actually, well, you just added a cache.  So the no-cache 
case just became faster because there is a cache now.


Still, performance comparison is relevant to show that O_DIRECT as-is is unusable 
for compressed backup.


(Again, perhaps irrelevant, but:) Yes, but my first point was exactly whether 
O_DIRECT is even relevant for writing compressed images.


Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for 
compressed images, qemu’s format drivers are free to introduce some caching 
(because technically the cache.direct option only applies to the protocol 
driver) for collecting compressed writes.


Yes I thought in this way, enabling the cache by default.


That conclusion makes both of my complaints kind of moot.

*shrug*

Third, what is the real-world impact on the page cache?  You described that 
that’s the reason why you need the cache in qemu, because otherwise the page 
cache is polluted too much.  How much is the difference really?  (I don’t know 
how good the compression ratio is for real-world images.)


Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by 
Qemu's cache, and we use O_DIRECT for everything except for target of 
compressed backup.. Still the pollution may relate to several backups and of 
course it is simple enough to drop the cache after each backup. But I think 
that even one backup of 16T disk may pollute RAM enough.


Oh, sorry, I just realized I had a brain fart there.  I was referring to 
whether this series improves the page cache pollution.  But obviously it will 
if it allows you to re-enable O_DIRECT.


Related to that, I remember a long time ago we had some discussion about 
letting qemu-img convert set a special cache mode for the target image that 
would make Linux drop everything before the last offset written (i.e., I 
suppose fadvise() with POSIX_FADV_SEQUENTIAL).  You discarded that idea based on 
the fact that implementing a cache in qemu would be simple, but it isn’t, 
really.  What would the impact of POSIX_FADV_SEQUENTIAL be?  (One advantage of 
using that would be that we could reuse it for non-compressed images that are 
written by backup or qemu-img convert.)


The problem is that writes are async. And therefore, not sequential.


In theory, yes, but all compressed writes still go through 
qcow2_alloc_bytes() right before submitting the write, so I wonder whether in 
pract

Re: [PATCH] iotests/210: Fix reference output

2021-02-09 Thread Eric Blake
On 2/9/21 12:19 PM, Max Reitz wrote:
> Commit 69b55e03f has changed an error message, adjust the reference
> output to account for it.
> 
> Fixes: 69b55e03f7e65a36eb954d0b7d4698b258df2708
>("block: refactor bdrv_check_request: add errp")
> Signed-off-by: Max Reitz 
> ---

Reviewed-by: Eric Blake 

I'm happy to queue this through my tree (since that's where the original
came through) if you don't beat me to it.


-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




[PATCH] iotests: Fix unsupported_imgopts for refcount_bits

2021-02-09 Thread Max Reitz
Many _unsupported_imgopts lines for refcount_bits values use something
like "refcount_bits=1[^0-9]" to forbid everything but "refcount_bits=1"
(e.g. "refcount_bits=16" is allowed).

That does not work when $IMGOPTS does not have any entry past the
refcount_bits option, which now became apparent with the "check" script
rewrite.

Use \b instead of [^0-9] to check for a word boundary, which is what we
really want.

Signed-off-by: Max Reitz 
---
Reproducible with:
$ ./check -qcow2 -o refcount_bits=1
(The tests touched here should be skipped)

I don't know whether \b is portable.  I hope it is.
(This is why I CC-ed you, Eric.)

Then again, it appears that nobody ever runs the iotests with
refcount_bits=1 but me, and I do that on Linux.  So even if it isn't
portable, it shouldn't be an issue in practice... O:)
---
 tests/qemu-iotests/007 | 2 +-
 tests/qemu-iotests/015 | 2 +-
 tests/qemu-iotests/029 | 2 +-
 tests/qemu-iotests/058 | 2 +-
 tests/qemu-iotests/062 | 2 +-
 tests/qemu-iotests/066 | 2 +-
 tests/qemu-iotests/068 | 2 +-
 tests/qemu-iotests/080 | 2 +-
 tests/qemu-iotests/103 | 2 +-
 tests/qemu-iotests/201 | 2 +-
 tests/qemu-iotests/214 | 2 +-
 tests/qemu-iotests/217 | 2 +-
 tests/qemu-iotests/267 | 2 +-
 tests/qemu-iotests/271 | 3 ++-
 tests/qemu-iotests/286 | 2 +-
 15 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/tests/qemu-iotests/007 b/tests/qemu-iotests/007
index 936d3f14fb..a014f50a6b 100755
--- a/tests/qemu-iotests/007
+++ b/tests/qemu-iotests/007
@@ -44,7 +44,7 @@ _supported_proto generic
 # refcount_bits must be at least 4 so we can create ten internal snapshots
 # (1 bit supports none, 2 bits support two, 4 bits support 14);
 # snapshot are generally impossible with external data files
-_unsupported_imgopts 'refcount_bits=\(1\|2\)[^0-9]' data_file
+_unsupported_imgopts 'refcount_bits=\(1\|2\)\b' data_file
 
 echo
 echo "creating image"
diff --git a/tests/qemu-iotests/015 b/tests/qemu-iotests/015
index 40c23235a6..24e28643e4 100755
--- a/tests/qemu-iotests/015
+++ b/tests/qemu-iotests/015
@@ -43,7 +43,7 @@ _supported_fmt qcow2
 _supported_proto generic
 # Internal snapshots are (currently) impossible with refcount_bits=1,
 # and generally impossible with external data files
-_unsupported_imgopts 'refcount_bits=1[^0-9]' data_file
+_unsupported_imgopts 'refcount_bits=1\b' data_file
 
 echo
 echo "creating image"
diff --git a/tests/qemu-iotests/029 b/tests/qemu-iotests/029
index bd71dd2f22..9b345060e5 100755
--- a/tests/qemu-iotests/029
+++ b/tests/qemu-iotests/029
@@ -44,7 +44,7 @@ _supported_fmt qcow2
 _supported_proto generic
 # Internal snapshots are (currently) impossible with refcount_bits=1,
 # and generally impossible with external data files
-_unsupported_imgopts 'refcount_bits=1[^0-9]' data_file
+_unsupported_imgopts 'refcount_bits=1\b' data_file
 
 offset_size=24
 offset_l1_size=36
diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058
index ce35ff4ee0..0b0303fcd5 100755
--- a/tests/qemu-iotests/058
+++ b/tests/qemu-iotests/058
@@ -59,7 +59,7 @@ _supported_os Linux
 _require_command QEMU_NBD
 # Internal snapshots are (currently) impossible with refcount_bits=1,
 # and generally impossible with external data files
-_unsupported_imgopts 'refcount_bits=1[^0-9]' data_file
+_unsupported_imgopts 'refcount_bits=1\b' data_file
 
 nbd_snapshot_img="nbd:unix:$nbd_unix_socket"
 
diff --git a/tests/qemu-iotests/062 b/tests/qemu-iotests/062
index 321252298d..d7307f24ac 100755
--- a/tests/qemu-iotests/062
+++ b/tests/qemu-iotests/062
@@ -42,7 +42,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 _supported_fmt qcow2
 _supported_proto generic
 # We need zero clusters and snapshots
-_unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file
+_unsupported_imgopts 'compat=0.10' 'refcount_bits=1\b' data_file
 
 IMG_SIZE=64M
 
diff --git a/tests/qemu-iotests/066 b/tests/qemu-iotests/066
index a780ed7ab5..ec9dab2025 100755
--- a/tests/qemu-iotests/066
+++ b/tests/qemu-iotests/066
@@ -43,7 +43,7 @@ _supported_proto generic
 # We need zero clusters and snapshots
 # (TODO: Consider splitting the snapshot part into a separate test
 #file, so this one runs with refcount_bits=1 and data_file)
-_unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file
+_unsupported_imgopts 'compat=0.10' 'refcount_bits=1\b' data_file
 
 # Intentionally create an unaligned image
 IMG_SIZE=$((64 * 1024 * 1024 + 512))
diff --git a/tests/qemu-iotests/068 b/tests/qemu-iotests/068
index 03e03508a6..39a04a6ab6 100755
--- a/tests/qemu-iotests/068
+++ b/tests/qemu-iotests/068
@@ -42,7 +42,7 @@ _supported_fmt qcow2
 _supported_proto generic
 # Internal snapshots are (currently) impossible with refcount_bits=1,
 # and generally impossible with external data files
-_unsupported_imgopts 'compat=0.10' 'refcount_bits=1[^0-9]' data_file
+_unsupported_imgopts 'compat=0.10' 'refcount_bits=1\b' data_file
 
 IMG_SIZE=128K
 
diff --git a/tests/qemu-iotests/080 b/tests/qemu-iotests/080
index 3306500683..a7cf

[PATCH] iotests/210: Fix reference output

2021-02-09 Thread Max Reitz
Commit 69b55e03f has changed an error message, adjust the reference
output to account for it.

Fixes: 69b55e03f7e65a36eb954d0b7d4698b258df2708
   ("block: refactor bdrv_check_request: add errp")
Signed-off-by: Max Reitz 
---
Fun fact: The branch name "fix-210-v1" was already taken for
8ba9c4d9b088d66aebfcb019f61ddc36fba2db88, which was only two months
ago.  Ah, well. :)
---
 tests/qemu-iotests/210.out | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/210.out b/tests/qemu-iotests/210.out
index dc1a3c9786..2e9fc596eb 100644
--- a/tests/qemu-iotests/210.out
+++ b/tests/qemu-iotests/210.out
@@ -182,7 +182,7 @@ Job failed: The requested file size is too large
 === Resize image with invalid sizes ===
 
 {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 
9223372036854775296}}
-{"error": {"class": "GenericError", "desc": "Required too big image size, it 
must be not greater than 9223372035781033984"}}
+{"error": {"class": "GenericError", "desc": "offset(9223372036854775296) 
exceeds maximum(9223372035781033984)"}}
 {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 
9223372036854775808}}
 {"error": {"class": "GenericError", "desc": "Invalid parameter type for 
'size', expected: integer"}}
 {"execute": "block_resize", "arguments": {"node-name": "node1", "size": 
18446744073709551104}}
-- 
2.29.2




Re: [PATCH v4 0/9] hw/sd: Support block read/write in SPI mode

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 3:32 PM, Bin Meng wrote:
> Hi Philippe,
> 
> On Thu, Feb 4, 2021 at 2:02 PM Bin Meng  wrote:
>>
>> On Thu, Jan 28, 2021 at 2:30 PM Bin Meng  wrote:
>>>
>>> From: Bin Meng 
>>>
>>> This includes the previously v3 series [1], and one single patch [2].
>>>
>>> Compared to v3, this fixed the following issue in patch [v3,6/6]:
>>> - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after
>>>   receiving the STOP_TRAN token per the spec
>>>
>>> All software tested so far (U-Boot/Linux/VxWorks) does work without
>>> the fix, but it is better to conform with the spec.
>>>
>>> In addition to [2], one more issue was exposed when testing with
>>> VxWorks driver related to STOP_TRANSMISSION (CMD12) response.
>>>
>>> [1] http://patchwork.ozlabs.org/project/qemu-devel/list/?series=226136
>>> [2] 
>>> http://patchwork.ozlabs.org/project/qemu-devel/patch/1611636214-52427-1-git-send-email-bmeng...@gmail.com/
>>>
>>> Changes in v4:
>>> - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after
>>>   receiving the STOP_TRAN token per the spec
>>> - new patch: fix STOP_TRANSMISSION (CMD12) response
>>> - new patch: handle the rest commands with R1b response type
>>>
>>
>> Ping?
> 
> Will a PR be sent soon to include this series so that the SiFive SPI
> series can follow?

I had it planned for yesterday but had problems with the mails from
the list, + the CVE (you fixed) took priority.

Missing review is patch #8 "Fix STOP_TRANSMISSION (CMD12) response"
for which I don't have test yet.



Re: [RFC PATCH] block/null: Use 'read-zeroes' mode by default

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 6:11 PM, Eric Blake wrote:
> On 2/9/21 11:01 AM, Philippe Mathieu-Daudé wrote:
>> The null-co driver is meant for (performance) testing.
>> By default, read operation does nothing, the provided buffer
>> is not filled with zero values and its content is unchanged.
>>
>> This can confuse security experts. For example, using the default
>> null-co driver, buf[] is uninitialized, the blk_pread() call
>> succeeds and we then access uninitialized memory:
>>
>>   static int guess_disk_lchs(BlockBackend *blk,
>>  int *pcylinders, int *pheads,
>>  int *psectors)
>>   {
>>   uint8_t buf[BDRV_SECTOR_SIZE];
>>   ...
>>
>>   if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) {
>>   return -1;
>>   }
>>   /* test msdos magic */
>>   if (buf[510] != 0x55 || buf[511] != 0xaa) {
>>   return -1;
>>   }
>>
>> We could audit all the uninitialized buffers and the
>> bdrv_co_preadv() handlers, but it is simpler to change the
>> default of this testing driver. Performance tests will have
>> to adapt and use 'null-co,read-zeroes=on'.
> 
> Wouldn't this rather be read-zeroes=off when doing performance testing?

Oops, yes ;)

> 
>>
>> Suggested-by: Max Reitz 
>> Signed-off-by: Philippe Mathieu-Daudé 
>> ---
>> RFC maybe a stricter approach is required?
> 
> Since the null driver is only for testing in the first place, opting in
> to speed over security seems like a reasonable tradeoff.  But I consider
> the patch incomplete without an audit of the iotests that will want to
> use explicit read-zeroes=off.

Correct. I don't know about each iotest but I can send a patch with
explicit option, so review would be trivial.

Thanks,

Phil.




Re: [RFC PATCH] block/null: Use 'read-zeroes' mode by default

2021-02-09 Thread Eric Blake
On 2/9/21 11:01 AM, Philippe Mathieu-Daudé wrote:
> The null-co driver is meant for (performance) testing.
> By default, read operation does nothing, the provided buffer
> is not filled with zero values and its content is unchanged.
> 
> This can confuse security experts. For example, using the default
> null-co driver, buf[] is uninitialized, the blk_pread() call
> succeeds and we then access uninitialized memory:
> 
>   static int guess_disk_lchs(BlockBackend *blk,
>  int *pcylinders, int *pheads,
>  int *psectors)
>   {
>   uint8_t buf[BDRV_SECTOR_SIZE];
>   ...
> 
>   if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) {
>   return -1;
>   }
>   /* test msdos magic */
>   if (buf[510] != 0x55 || buf[511] != 0xaa) {
>   return -1;
>   }
> 
> We could audit all the uninitialized buffers and the
> bdrv_co_preadv() handlers, but it is simpler to change the
> default of this testing driver. Performance tests will have
> to adapt and use 'null-co,read-zeroes=on'.

Wouldn't this rather be read-zeroes=off when doing performance testing?

> 
> Suggested-by: Max Reitz 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> RFC maybe a stricter approach is required?

Since the null driver is only for testing in the first place, opting in
to speed over security seems like a reasonable tradeoff.  But I consider
the patch incomplete without an audit of the iotests that will want to
use explicit read-zeroes=off.

> ---
>  block/null.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/null.c b/block/null.c
> index cc9b1d4ea72..f9658fd70ac 100644
> --- a/block/null.c
> +++ b/block/null.c
> @@ -93,7 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  error_setg(errp, "latency-ns is invalid");
>  ret = -EINVAL;
>  }
> -s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false);
> +s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, true);
>  qemu_opts_del(opts);
>  bs->supported_write_flags = BDRV_REQ_FUA;
>  return ret;
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [RFC PATCH] block/null: Use 'read-zeroes' mode by default

2021-02-09 Thread Max Reitz

On 09.02.21 18:01, Philippe Mathieu-Daudé wrote:

The null-co driver is meant for (performance) testing.
By default, read operation does nothing, the provided buffer
is not filled with zero values and its content is unchanged.

This can confuse security experts. For example, using the default
null-co driver, buf[] is uninitialized, the blk_pread() call
succeeds and we then access uninitialized memory:


I suppose in practice it’s going to be uninitialized guest memory most 
of the time, so it isn’t that bad, but yes.


Thanks!


   static int guess_disk_lchs(BlockBackend *blk,
  int *pcylinders, int *pheads,
  int *psectors)
   {
   uint8_t buf[BDRV_SECTOR_SIZE];
   ...

   if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) {
   return -1;
   }
   /* test msdos magic */
   if (buf[510] != 0x55 || buf[511] != 0xaa) {
   return -1;
   }

We could audit all the uninitialized buffers and the
bdrv_co_preadv() handlers, but it is simpler to change the
default of this testing driver. Performance tests will have
to adapt and use 'null-co,read-zeroes=on'.

Suggested-by: Max Reitz 
Signed-off-by: Philippe Mathieu-Daudé 
---
RFC maybe a stricter approach is required?


I think this is good.  If we do want a stricter approach, we might 
remove read-zeroes altogether (but I suppose that would require a 
deprecation period then) and add a new null-unsafe driver or something 
in its stead (that we can the conditionally compile out, or 
distributions can choose not to whitelist, or, or, or...).


If we just follow through with this patch, I don’t think we need a 
deprecation period, because this can well be considered a bug fix; and 
because I don’t know of any use for read-zeroes=false except for some 
very special performance tests.



---
  block/null.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/null.c b/block/null.c
index cc9b1d4ea72..f9658fd70ac 100644
--- a/block/null.c
+++ b/block/null.c
@@ -93,7 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict 
*options, int flags,
  error_setg(errp, "latency-ns is invalid");
  ret = -EINVAL;
  }
-s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false);
+s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, true);
  qemu_opts_del(opts);
  bs->supported_write_flags = BDRV_REQ_FUA;
  return ret;


The documentation in qapi/block-core.json has to be changed, too.

Are there any iotests (or other tests) that don’t set read-zeroes? 
Should they continue to use read-zeroes=false?


Max




[RFC PATCH] block/null: Use 'read-zeroes' mode by default

2021-02-09 Thread Philippe Mathieu-Daudé
The null-co driver is meant for (performance) testing.
By default, read operation does nothing, the provided buffer
is not filled with zero values and its content is unchanged.

This can confuse security experts. For example, using the default
null-co driver, buf[] is uninitialized, the blk_pread() call
succeeds and we then access uninitialized memory:

  static int guess_disk_lchs(BlockBackend *blk,
 int *pcylinders, int *pheads,
 int *psectors)
  {
  uint8_t buf[BDRV_SECTOR_SIZE];
  ...

  if (blk_pread(blk, 0, buf, BDRV_SECTOR_SIZE) < 0) {
  return -1;
  }
  /* test msdos magic */
  if (buf[510] != 0x55 || buf[511] != 0xaa) {
  return -1;
  }

We could audit all the uninitialized buffers and the
bdrv_co_preadv() handlers, but it is simpler to change the
default of this testing driver. Performance tests will have
to adapt and use 'null-co,read-zeroes=on'.

Suggested-by: Max Reitz 
Signed-off-by: Philippe Mathieu-Daudé 
---
RFC maybe a stricter approach is required?
---
 block/null.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/null.c b/block/null.c
index cc9b1d4ea72..f9658fd70ac 100644
--- a/block/null.c
+++ b/block/null.c
@@ -93,7 +93,7 @@ static int null_file_open(BlockDriverState *bs, QDict 
*options, int flags,
 error_setg(errp, "latency-ns is invalid");
 ret = -EINVAL;
 }
-s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false);
+s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, true);
 qemu_opts_del(opts);
 bs->supported_write_flags = BDRV_REQ_FUA;
 return ret;
-- 
2.26.2




Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Denis V. Lunev
On 2/9/21 5:47 PM, Max Reitz wrote:
> On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote:
>> 09.02.2021 16:25, Max Reitz wrote:
>>> On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:
 Hi all!

 I know, I have several series waiting for a resend, but I had to
 switch
 to another task spawned from our customer's bug.

 Original problem: we use O_DIRECT for all vm images in our product,
 it's
 the policy. The only exclusion is backup target qcow2 image for
 compressed backup, because compressed backup is extremely slow with
 O_DIRECT (due to unaligned writes). Customer complains that backup
 produces a lot of pagecache.

 So we can either implement some internal cache or use fadvise somehow.
 Backup has several async workes, which writes simultaneously, so in
 both
 ways we have to track host cluster filling (before dropping the cache
 corresponding to the cluster).  So, if we have to track anyway, let's
 try to implement the cache.
>>>
>>> I wanted to be excited here, because that sounds like it would be
>>> very easy to implement caching.  Like, just keep the cluster at
>>> free_byte_offset cached until the cluster it points to changes, then
>>> flush the cluster.
>>
>> The problem is that chunks are written asynchronously.. That's why
>> this all is not so easy.
>>
>>>
>>> But then I see like 900 new lines of code, and I’m much less excited...
>>>
 Idea is simple: cache small unaligned write and flush the cluster when
 filled.

 Performance result is very good (results in a table is time of
 compressed backup of 1000M disk filled with ones in seconds):
>>>
>>> “Filled with ones” really is an edge case, though.
>>
>> Yes, I think, all clusters are compressed to rather small chunks :)
>>
>>>
 ---  ---  ---
   backup(old)  backup(new)
 ssd:hdd(direct)  3e+02    4.4
  -99%
 ssd:hdd(cached)  5.7  5.4
  -5%
 ---  ---  ---

 So, we have benefit even for cached mode! And the fastest thing is
 O_DIRECT with new implemented cache. So, I suggest to enable the new
 cache by default (which is done by the series).
>>>
>>> First, I’m not sure how O_DIRECT really is relevant, because I don’t
>>> really see the point for writing compressed images.
>>
>> compressed backup is a point
>
> (Perhaps irrelevant, but just to be clear:) I meant the point of using
> O_DIRECT, which one can decide to not use for backup targets (as you
> have done already).
>
>>> Second, I find it a bit cheating if you say there is a huge
>>> improvement for the no-cache case, when actually, well, you just
>>> added a cache.  So the no-cache case just became faster because
>>> there is a cache now.
>>
>> Still, performance comparison is relevant to show that O_DIRECT as is
>> unusable for compressed backup.
>
> (Again, perhaps irrelevant, but:) Yes, but my first point was exactly
> whether O_DIRECT is even relevant for writing compressed images.
>
>>> Well, I suppose I could follow that if O_DIRECT doesn’t make much
>>> sense for compressed images, qemu’s format drivers are free to
>>> introduce some caching (because technically the cache.direct option
>>> only applies to the protocol driver) for collecting compressed writes.
>>
>> Yes I thought in this way, enabling the cache by default.
>>
>>> That conclusion makes both of my complaints kind of moot.
>>>
>>> *shrug*
>>>
>>> Third, what is the real-world impact on the page cache?  You
>>> described that that’s the reason why you need the cache in qemu,
>>> because otherwise the page cache is polluted too much.  How much is
>>> the difference really?  (I don’t know how good the compression ratio
>>> is for real-world images.)
>>
>> Hm. I don't know the ratio.. Customer reported that most of RAM is
>> polluted by Qemu's cache, and we use O_DIRECT for everything except
>> for target of compressed backup.. Still the pollution may relate to
>> several backups and of course it is simple enough to drop the cache
>> after each backup. But I think that even one backup of 16T disk may
>> pollute RAM enough.
>
> Oh, sorry, I just realized I had a brain fart there.  I was referring
> to whether this series improves the page cache pollution.  But
> obviously it will if it allows you to re-enable O_DIRECT.
>
>>> Related to that, I remember a long time ago we had some discussion
>>> about letting qemu-img convert set a special cache mode for the
>>> target image that would make Linux drop everything before the last
>>> offset written (i.e., I suppose fadvise() with
>>> POSIX_FADV_SEQUENTIAL).  You discard that idea based on the fact
>>> that implementing a cache in qemu would be simple, but it isn’t,
>>> really.  What would the impact of POSIX_FADV_SEQUENTIAL be?  (One
>>> advantage of using that would be that we could reuse

Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

09.02.2021 17:47, Max Reitz wrote:

On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 16:25, Max Reitz wrote:

On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

I know, I have several series waiting for a resend, but I had to switch
to another task spawned from our customer's bug.

Original problem: we use O_DIRECT for all vm images in our product, it's
the policy. The only exclusion is backup target qcow2 image for
compressed backup, because compressed backup is extremely slow with
O_DIRECT (due to unaligned writes). Customer complains that backup
produces a lot of pagecache.

So we can either implement some internal cache or use fadvise somehow.
Backup has several async workers, which write simultaneously, so in both
ways we have to track host cluster filling (before dropping the cache
corresponding to the cluster).  So, if we have to track anyway, let's
try to implement the cache.


I wanted to be excited here, because that sounds like it would be very easy to 
implement caching.  Like, just keep the cluster at free_byte_offset cached 
until the cluster it points to changes, then flush the cluster.


The problem is that chunks are written asynchronously.. That's why this all is 
not so easy.



But then I see like 900 new lines of code, and I’m much less excited...


Idea is simple: cache small unaligned write and flush the cluster when
filled.

Performance result is very good (results in a table is time of
compressed backup of 1000M disk filled with ones in seconds):


“Filled with ones” really is an edge case, though.


Yes, I think, all clusters are compressed to rather small chunks :)




---  ---  ---
  backup(old)  backup(new)
ssd:hdd(direct)  3e+02    4.4
 -99%
ssd:hdd(cached)  5.7  5.4
 -5%
---  ---  ---

So, we have benefit even for cached mode! And the fastest thing is
O_DIRECT with new implemented cache. So, I suggest to enable the new
cache by default (which is done by the series).


First, I’m not sure how O_DIRECT really is relevant, because I don’t really see 
the point for writing compressed images.


compressed backup is a point


(Perhaps irrelevant, but just to be clear:) I meant the point of using 
O_DIRECT, which one can decide to not use for backup targets (as you have done 
already).


Second, I find it a bit cheating if you say there is a huge improvement for the 
no-cache case, when actually, well, you just added a cache.  So the no-cache 
case just became faster because there is a cache now.


Still, performance comparison is relevant to show that O_DIRECT as is unusable 
for compressed backup.


(Again, perhaps irrelevant, but:) Yes, but my first point was exactly whether 
O_DIRECT is even relevant for writing compressed images.


Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for 
compressed images, qemu’s format drivers are free to introduce some caching 
(because technically the cache.direct option only applies to the protocol 
driver) for collecting compressed writes.


Yes I thought in this way, enabling the cache by default.


That conclusion makes both of my complaints kind of moot.

*shrug*

Third, what is the real-world impact on the page cache?  You described that 
that’s the reason why you need the cache in qemu, because otherwise the page 
cache is polluted too much.  How much is the difference really?  (I don’t know 
how good the compression ratio is for real-world images.)


Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by 
Qemu's cache, and we use O_DIRECT for everything except for target of 
compressed backup.. Still the pollution may relate to several backups and of 
course it is simple enough to drop the cache after each backup. But I think 
that even one backup of 16T disk may pollute RAM enough.


Oh, sorry, I just realized I had a brain fart there.  I was referring to 
whether this series improves the page cache pollution.  But obviously it will 
if it allows you to re-enable O_DIRECT.


Related to that, I remember a long time ago we had some discussion about 
letting qemu-img convert set a special cache mode for the target image that 
would make Linux drop everything before the last offset written (i.e., I 
suppose fadvise() with POSIX_FADV_SEQUENTIAL).  You discard that idea based on 
the fact that implementing a cache in qemu would be simple, but it isn’t, 
really.  What would the impact of POSIX_FADV_SEQUENTIAL be?  (One advantage of 
using that would be that we could reuse it for non-compressed images that are 
written by backup or qemu-img convert.)


The problem is that writes are async. And therefore, not sequential.


In theory, yes, but all compressed writes still goes through 
qcow2_alloc_bytes() right before submitting the write, so I wonder whether in 
practice the writes aren’t usually sufficiently sequential to make POSIX_FADV_SEQUENTIAL 

Re: [PULL v3 00/27] Block patches

2021-02-09 Thread Jag Raman


> On Feb 9, 2021, at 1:03 AM, Thomas Huth  wrote:
> 
> On 08/02/2021 21.21, Stefan Hajnoczi wrote:
>> On Mon, Feb 08, 2021 at 11:02:57AM +0100, Philippe Mathieu-Daudé wrote:
>>> On 2/8/21 10:27 AM, Stefan Hajnoczi wrote:
 On Sat, Feb 06, 2021 at 05:03:20PM +, Peter Maydell wrote:
> On Fri, 5 Feb 2021 at 22:53, Peter Maydell  
> wrote:
>> 
>> On Fri, 5 Feb 2021 at 16:45, Stefan Hajnoczi  wrote:
>>> 
>>> The following changes since commit 
>>> e2c5093c993ef646e4e28f7aa78429853bcc06ac:
>>> 
>>>   iotests: 30: drop from auto group (and effectively from make check) 
>>> (2021-02-05 15:16:13 +)
>>> 
>>> are available in the Git repository at:
>>> 
>>>   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
>>> 
>>> for you to fetch changes up to b07011f375bda3319cf72eee7cb18d310078387b:
>>> 
>>>   docs: fix Parallels Image "dirty bitmap" section (2021-02-05 16:36:36 
>>> +)
>>> 
>>> 
>>> Pull request
>>> 
>>> v3:
>>>  * Replace {0} array initialization with {} to make clang happy [Peter]
>>> 
>>> 
>> 
>> 
>> Fails 'make check' on s390x host:
> 
> I gave this a rerun to check it was reproducible (it is) and realised
> I missed what looks like an important line in the log. As usual,
> trying to disentangle which lines of a parallel make check correspond
> to the failure is pretty tricky, but the lines
>  Type 'remote-pcihost' is missing its parent 'pcie-host-bridge'
> 
> are probably the proximate causes of the assertion failures.
> 
> MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}
> QTEST_QEMU_IMG=./qemu-img
> G_TEST_DBUS_DAEMON=/home/ubuntu/qemu/tests/dbus-vmstate-daemon.sh
> QTEST_QEMU_BINARY=./qemu-system-rx tests/qtest/qos-test --tap -k
> PASS 45 qtest-rx/qmp-cmd-test /rx/qmp/query-memory-size-summary
> SKIP
> MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}
> QTEST_QEMU_IMG=./qemu-img
> G_TEST_DBUS_DAEMON=/home/ubuntu/qemu/tests/dbus-vmstate-daemon.sh
> QTEST_QEMU_BINARY=./qemu-system-s390x tests/qtest/pxe-test --tap -k
> PASS 46 qtest-rx/qmp-cmd-test /rx/qmp/query-memory-devices
> Type 'remote-pcihost' is missing its parent 'pcie-host-bridge'
> PASS 47 qtest-rx/qmp-cmd-test /rx/qmp/query-replay
> PASS 48 qtest-rx/qmp-cmd-test /rx/qmp/query-yank
> PASS 49 qtest-rx/qmp-cmd-test /rx/qmp/query-name
> PASS 50 qtest-rx/qmp-cmd-test /rx/qmp/query-iothreads
> PASS 51 qtest-rx/qmp-cmd-test /rx/qmp/query-fdsets
> PASS 52 qtest-rx/qmp-cmd-test /rx/qmp/query-command-line-options
> PASS 53 qtest-rx/qmp-cmd-test /rx/qmp/query-acpi-ospm-status
> PASS 54 qtest-rx/qmp-cmd-test /rx/qmp/object-add-failure-modes
> MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}
> QTEST_QEMU_IMG=./qemu-img
> G_TEST_DBUS_DAEMON=/home/ubuntu/qemu/tests/dbus-vmstate-daemon.sh
> QTEST_QEMU_BINARY=./qemu-system-s390x tests/qtest/test-netfilter --tap
> -k
> Type 'remote-pcihost' is missing its parent 'pcie-host-bridge'
> socket_accept failed: Resource temporarily unavailable
> socket_accept failed: Resource temporarily unavailable
> **
> ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake:
> assertion failed: (s->fd >= 0 && s->qmp_fd >= 0)
> **
> ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake:
> assertion failed: (s->fd >= 0 && s->qmp_fd >= 0)
> ../../tests/qtest/libqtest.c:181: kill_qemu() detected QEMU death from
> signal 6 (Aborted) (core dumped)
> ../../tests/qtest/libqtest.c:181: kill_qemu() detected QEMU death from
> signal 6 (Aborted) (core dumped)
> ERROR qtest-s390x/pxe-test - Bail out!
> ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake:
> assertion failed: (s->fd >= 0 && s->qmp_fd >= 0)
> ERROR qtest-s390x/test-netfilter - Bail out!
> ERROR:../../tests/qtest/libqtest.c:308:qtest_init_without_qmp_handshake:
> assertion failed: (s->fd >= 0 && s->qmp_fd >= 0)
> Makefile.mtest:3113: recipe for target 'run-test-387' failed
> make: *** [run-test-387] Error 1
> make: *** Waiting for unfinished jobs
> Makefile.mtest:3121: recipe for target 'run-test-388' failed
 
 Hi Elena and Jag,
 Please take a look at this QOM failure. I guess remote-pcihost is being
 built but pcie-host-bridge is missing from the s390x-softmmu target.
>>> 
>>> Fix suggested here:
>>> https://www.mail-archive.com/qemu-block@nongnu.org/msg80536.html
>>> 
>>> But beside the fix what would be better is to restrict this feature
>>> where it makes sense (we are having hard time building/testing all
>>> features, better enable new ones where they are used).

Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog

2021-02-09 Thread Eric Blake
On 2/9/21 10:08 AM, Richard W.M. Jones wrote:
> On Tue, Feb 09, 2021 at 09:27:58AM -0600, Eric Blake wrote:
>> Our default of a backlog of 1 connection is rather puny; it gets in
>> the way when we are explicitly allowing multiple clients (such as
>> qemu-nbd -e N [--shared], or nbd-server-start with its default
>> "max-connections":0 for unlimited), but is even a problem when we
>> stick to qemu-nbd's default of only 1 active client but use -t
>> [--persistent] where a second client can start using the server once
>> the first finishes.  While the effects are less noticeable on TCP
>> sockets (since the client can poll() to learn when the server is ready
>> again), it is definitely observable on Unix sockets, where on Unix, a

s/where on Unix/where on Linux/

>> client will fail with EAGAIN and no recourse but to sleep an arbitrary
>> amount of time before retrying if the server backlog is already full.
>>
>> Since QMP nbd-server-start is always persistent, it now always
>> requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request
>> SOMAXCONN if persistent, otherwise its backlog should be based on the
>> expected number of clients.
>>
>> See https://bugzilla.redhat.com/1925045 for a demonstration of where
>> our low backlog prevents libnbd from connecting as many parallel
>> clients as it wants.
>>
>> Reported-by: Richard W.M. Jones 
>> Signed-off-by: Eric Blake 
>> CC: qemu-sta...@nongnu.org
>> ---
>>  blockdev-nbd.c |  7 ++-
>>  qemu-nbd.c | 10 +-
>>  2 files changed, 15 insertions(+), 2 deletions(-)
>>

> 
> Works fine here, so:
> 
> Tested-by: Richard W.M. Jones 
> 

Thanks for testing.

> Rich.
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog

2021-02-09 Thread Daniel P . Berrangé
On Tue, Feb 09, 2021 at 09:27:58AM -0600, Eric Blake wrote:
> Our default of a backlog of 1 connection is rather puny; it gets in
> the way when we are explicitly allowing multiple clients (such as
> qemu-nbd -e N [--shared], or nbd-server-start with its default
> "max-connections":0 for unlimited), but is even a problem when we
> stick to qemu-nbd's default of only 1 active client but use -t
> [--persistent] where a second client can start using the server once
> the first finishes.  While the effects are less noticeable on TCP
> sockets (since the client can poll() to learn when the server is ready
> again), it is definitely observable on Unix sockets, where on Unix, a
> client will fail with EAGAIN and no recourse but to sleep an arbitrary
> amount of time before retrying if the server backlog is already full.
> 
> Since QMP nbd-server-start is always persistent, it now always
> requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request
> SOMAXCONN if persistent, otherwise its backlog should be based on the
> expected number of clients.
> 
> See https://bugzilla.redhat.com/1925045 for a demonstration of where
> our low backlog prevents libnbd from connecting as many parallel
> clients as it wants.
> 
> Reported-by: Richard W.M. Jones 
> Signed-off-by: Eric Blake 
> CC: qemu-sta...@nongnu.org
> ---
>  blockdev-nbd.c |  7 ++-
>  qemu-nbd.c | 10 +-
>  2 files changed, 15 insertions(+), 2 deletions(-)

Reviewed-by: Daniel P. Berrangé 


Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v3 2/2] qemu-nbd: Permit --shared=0 for unlimited clients

2021-02-09 Thread Daniel P . Berrangé
On Tue, Feb 09, 2021 at 09:27:59AM -0600, Eric Blake wrote:
> This gives us better feature parity with QMP nbd-server-start, where
> max-connections defaults to 0 for unlimited.
> 
> Signed-off-by: Eric Blake 
> ---
>  docs/tools/qemu-nbd.rst | 4 ++--
>  qemu-nbd.c  | 7 +++
>  2 files changed, 5 insertions(+), 6 deletions(-)

Reviewed-by: Daniel P. Berrangé 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog

2021-02-09 Thread Richard W.M. Jones
On Tue, Feb 09, 2021 at 09:27:58AM -0600, Eric Blake wrote:
> Our default of a backlog of 1 connection is rather puny; it gets in
> the way when we are explicitly allowing multiple clients (such as
> qemu-nbd -e N [--shared], or nbd-server-start with its default
> "max-connections":0 for unlimited), but is even a problem when we
> stick to qemu-nbd's default of only 1 active client but use -t
> [--persistent] where a second client can start using the server once
> the first finishes.  While the effects are less noticeable on TCP
> sockets (since the client can poll() to learn when the server is ready
> again), it is definitely observable on Unix sockets, where on Unix, a
> client will fail with EAGAIN and no recourse but to sleep an arbitrary
> amount of time before retrying if the server backlog is already full.
> 
> Since QMP nbd-server-start is always persistent, it now always
> requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request
> SOMAXCONN if persistent, otherwise its backlog should be based on the
> expected number of clients.
> 
> See https://bugzilla.redhat.com/1925045 for a demonstration of where
> our low backlog prevents libnbd from connecting as many parallel
> clients as it wants.
> 
> Reported-by: Richard W.M. Jones 
> Signed-off-by: Eric Blake 
> CC: qemu-sta...@nongnu.org
> ---
>  blockdev-nbd.c |  7 ++-
>  qemu-nbd.c | 10 +-
>  2 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/blockdev-nbd.c b/blockdev-nbd.c
> index d8443d235b73..b264620b98d8 100644
> --- a/blockdev-nbd.c
> +++ b/blockdev-nbd.c
> @@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char 
> *tls_creds,
>  qio_net_listener_set_name(nbd_server->listener,
>"nbd-listener");
> 
> -if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) 
> {
> +/*
> + * Because this server is persistent, a backlog of SOMAXCONN is
> + * better than trying to size it to max_connections.
> + */
> +if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN,
> +   errp) < 0) {
>  goto error;
>  }
> 
> diff --git a/qemu-nbd.c b/qemu-nbd.c
> index 608c63e82a25..1a340ea4858d 100644
> --- a/qemu-nbd.c
> +++ b/qemu-nbd.c
> @@ -964,8 +964,16 @@ int main(int argc, char **argv)
> 
>  server = qio_net_listener_new();
>  if (socket_activation == 0) {
> +int backlog;
> +
> +if (persistent) {
> +backlog = SOMAXCONN;
> +} else {
> +backlog = MIN(shared, SOMAXCONN);
> +}
>  saddr = nbd_build_socket_address(sockpath, bindto, port);
> -if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) {
> +if (qio_net_listener_open_sync(server, saddr, backlog,
> +   &local_err) < 0) {
>  object_unref(OBJECT(server));
>  error_report_err(local_err);
>  exit(EXIT_FAILURE);

Works fine here, so:

Tested-by: Richard W.M. Jones 

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and
build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW




[PATCH v3 2/2] qemu-nbd: Permit --shared=0 for unlimited clients

2021-02-09 Thread Eric Blake
This gives us better feature parity with QMP nbd-server-start, where
max-connections defaults to 0 for unlimited.

Signed-off-by: Eric Blake 
---
 docs/tools/qemu-nbd.rst | 4 ++--
 qemu-nbd.c  | 7 +++
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst
index fe41336dc550..ee862fa0bc02 100644
--- a/docs/tools/qemu-nbd.rst
+++ b/docs/tools/qemu-nbd.rst
@@ -136,8 +136,8 @@ driver options if ``--image-opts`` is specified.
 .. option:: -e, --shared=NUM

   Allow up to *NUM* clients to share the device (default
-  ``1``). Safe for readers, but for now, consistency is not
-  guaranteed between multiple writers.
+  ``1``), 0 for unlimited. Safe for readers, but for now,
+  consistency is not guaranteed between multiple writers.

 .. option:: -t, --persistent

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 1a340ea4858d..5416509ece18 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -328,7 +328,7 @@ static void *nbd_client_thread(void *arg)

 static int nbd_can_accept(void)
 {
-return state == RUNNING && nb_fds < shared;
+return state == RUNNING && (shared == 0 || nb_fds < shared);
 }

 static void nbd_update_server_watch(void);
@@ -706,8 +706,8 @@ int main(int argc, char **argv)
 device = optarg;
 break;
 case 'e':
 if (qemu_strtoi(optarg, NULL, 0, &shared) < 0 ||
-shared < 1) {
+shared < 0) {
 error_report("Invalid shared device number '%s'", optarg);
 exit(EXIT_FAILURE);
 }
@@ -966,7 +965,7 @@ int main(int argc, char **argv)
 if (socket_activation == 0) {
 int backlog;

-if (persistent) {
+if (persistent || shared == 0) {
 backlog = SOMAXCONN;
 } else {
 backlog = MIN(shared, SOMAXCONN);
-- 
2.30.0




[PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog

2021-02-09 Thread Eric Blake
Our default of a backlog of 1 connection is rather puny; it gets in
the way when we are explicitly allowing multiple clients (such as
qemu-nbd -e N [--shared], or nbd-server-start with its default
"max-connections":0 for unlimited), but is even a problem when we
stick to qemu-nbd's default of only 1 active client but use -t
[--persistent] where a second client can start using the server once
the first finishes.  While the effects are less noticeable on TCP
sockets (since the client can poll() to learn when the server is ready
again), it is definitely observable on Unix sockets, where on Unix, a
client will fail with EAGAIN and no recourse but to sleep an arbitrary
amount of time before retrying if the server backlog is already full.

Since QMP nbd-server-start is always persistent, it now always
requests a backlog of SOMAXCONN; meanwhile, qemu-nbd will request
SOMAXCONN if persistent, otherwise its backlog should be based on the
expected number of clients.

See https://bugzilla.redhat.com/1925045 for a demonstration of where
our low backlog prevents libnbd from connecting as many parallel
clients as it wants.

Reported-by: Richard W.M. Jones 
Signed-off-by: Eric Blake 
CC: qemu-sta...@nongnu.org
---
 blockdev-nbd.c |  7 ++-
 qemu-nbd.c | 10 +-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/blockdev-nbd.c b/blockdev-nbd.c
index d8443d235b73..b264620b98d8 100644
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char 
*tls_creds,
 qio_net_listener_set_name(nbd_server->listener,
   "nbd-listener");

-if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) {
+/*
+ * Because this server is persistent, a backlog of SOMAXCONN is
+ * better than trying to size it to max_connections.
+ */
+if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN,
+   errp) < 0) {
 goto error;
 }

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 608c63e82a25..1a340ea4858d 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -964,8 +964,16 @@ int main(int argc, char **argv)

 server = qio_net_listener_new();
 if (socket_activation == 0) {
+int backlog;
+
+if (persistent) {
+backlog = SOMAXCONN;
+} else {
+backlog = MIN(shared, SOMAXCONN);
+}
 saddr = nbd_build_socket_address(sockpath, bindto, port);
-if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) {
+if (qio_net_listener_open_sync(server, saddr, backlog,
+   &local_err) < 0) {
 object_unref(OBJECT(server));
 error_report_err(local_err);
 exit(EXIT_FAILURE);
-- 
2.30.0




[PATCH v3 0/2] NBD socket backlog

2021-02-09 Thread Eric Blake
in v2:
- also adjust backlog of QMP nbd-server-start [Dan]
- tweak qemu-nbd backlog to -e when not persistent [Nir]
- allow qemu-nbd -e0 for symmetry with QMP [new patch 2]

Eric Blake (2):
  qemu-nbd: Use SOMAXCONN for socket listen() backlog
  qemu-nbd: Permit --shared=0 for unlimited clients

 docs/tools/qemu-nbd.rst |  4 ++--
 blockdev-nbd.c  |  7 ++-
 qemu-nbd.c  | 15 +++
 3 files changed, 19 insertions(+), 7 deletions(-)

-- 
2.30.0




Re: [PULL 00/56] emulated nvme patches

2021-02-09 Thread Peter Maydell
On Tue, 9 Feb 2021 at 07:31, Klaus Jensen  wrote:
>
> From: Klaus Jensen 
>
> The following changes since commit 4f799257b323e1238a900fd0c71c2057863e0308:
>
>   Merge remote-tracking branch 'remotes/armbru/tags/pull-qapi-2021-02-08' 
> into staging (2021-02-08 16:12:21 +)
>
> are available in the Git repository at:
>
>   git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request
>
> for you to fetch changes up to 3e22762edc74be3e1ecafc361351a9640d114978:
>
>   hw/block/nvme: refactor the logic for zone write checks (2021-02-08 
> 21:15:54 +0100)
>
> 
> Emulated NVMe device updates
>
>   * deallocate or unwritten logical block error feature (me)
>   * dataset management command (me)
>   * compare command (Gollu Appalanaidu)
>   * namespace types (Niklas Cassel)
>   * zoned namespaces (Dmitry Fomichev)
>   * smart critical warning toggle (Zhenwei Pi)
>   * allow cmb and pmr to coexist (Andrzej Jakowski, me)
>   * pmr rds/wds support (Naveen Nagar)
>   * cmb v1.4 logic (Padmakar Kalghatgi)
>
> And a lot of smaller fixes from Gollu Appalanaidu, Minwoo Im and me.
>
> 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/6.0
for any user-visible changes.

-- PMM



Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Max Reitz

On 09.02.21 15:10, Vladimir Sementsov-Ogievskiy wrote:

09.02.2021 16:25, Max Reitz wrote:

On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

I know, I have several series waiting for a resend, but I had to switch
to another task spawned from our customer's bug.

Original problem: we use O_DIRECT for all vm images in our product, it's
the policy. The only exclusion is backup target qcow2 image for
compressed backup, because compressed backup is extremely slow with
O_DIRECT (due to unaligned writes). Customer complains that backup
produces a lot of pagecache.

So we can either implement some internal cache or use fadvise somehow.
Backup has several async workers, which write simultaneously, so in both
ways we have to track host cluster filling (before dropping the cache
corresponding to the cluster).  So, if we have to track anyway, let's
try to implement the cache.


I wanted to be excited here, because that sounds like it would be very 
easy to implement caching.  Like, just keep the cluster at 
free_byte_offset cached until the cluster it points to changes, then 
flush the cluster.


The problem is that chunks are written asynchronously.. That's why this 
all is not so easy.




But then I see like 900 new lines of code, and I’m much less excited...


Idea is simple: cache small unaligned write and flush the cluster when
filled.

Performance result is very good (results in a table is time of
compressed backup of 1000M disk filled with ones in seconds):


“Filled with ones” really is an edge case, though.


Yes, I think, all clusters are compressed to rather small chunks :)




---  ---  ---
  backup(old)  backup(new)
ssd:hdd(direct)  3e+02        4.4  (-99%)
ssd:hdd(cached)  5.7          5.4  (-5%)
---  ---  ---

So, we have benefit even for cached mode! And the fastest thing is
O_DIRECT with new implemented cache. So, I suggest to enable the new
cache by default (which is done by the series).


First, I’m not sure how O_DIRECT really is relevant, because I don’t 
really see the point for writing compressed images.


compressed backup is a point


(Perhaps irrelevant, but just to be clear:) I meant the point of using 
O_DIRECT, which one can decide to not use for backup targets (as you 
have done already).


Second, I find it a bit cheating if you say there is a huge 
improvement for the no-cache case, when actually, well, you just added 
a cache.  So the no-cache case just became faster because there is a 
cache now.


Still, performance comparison is relevant to show that O_DIRECT as is 
unusable for compressed backup.


(Again, perhaps irrelevant, but:) Yes, but my first point was exactly 
whether O_DIRECT is even relevant for writing compressed images.


Well, I suppose I could follow that if O_DIRECT doesn’t make much 
sense for compressed images, qemu’s format drivers are free to 
introduce some caching (because technically the cache.direct option 
only applies to the protocol driver) for collecting compressed writes.


Yes I thought in this way, enabling the cache by default.


That conclusion makes both of my complaints kind of moot.

*shrug*

Third, what is the real-world impact on the page cache?  You described 
that that’s the reason why you need the cache in qemu, because 
otherwise the page cache is polluted too much.  How much is the 
difference really?  (I don’t know how good the compression ratio is 
for real-world images.)


Hm. I don't know the ratio.. Customer reported that most of RAM is 
polluted by Qemu's cache, and we use O_DIRECT for everything except for 
target of compressed backup.. Still the pollution may relate to several 
backups and of course it is simple enough to drop the cache after each 
backup. But I think that even one backup of 16T disk may pollute RAM 
enough.


Oh, sorry, I just realized I had a brain fart there.  I was referring to 
whether this series improves the page cache pollution.  But obviously it 
will if it allows you to re-enable O_DIRECT.


Related to that, I remember a long time ago we had some discussion 
about letting qemu-img convert set a special cache mode for the target 
image that would make Linux drop everything before the last offset 
written (i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL).  You 
discarded that idea based on the fact that implementing a cache in qemu 
would be simple, but it isn’t, really.  What would the impact of 
POSIX_FADV_SEQUENTIAL be?  (One advantage of using that would be that 
we could reuse it for non-compressed images that are written by backup 
or qemu-img convert.)


The problem is that writes are async. And therefore, not sequential.


In theory, yes, but all compressed writes still goes through 
qcow2_alloc_bytes() right before submitting the write, so I wonder 
whether in practice the writes aren’t usually sufficiently sequential to 
make POSIX_FADV_SEQUENTIAL 

Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress

2021-02-09 Thread Alexander Bulekov
On 210209 1745, Bin Meng wrote:
> Oops, hitting "send" by mistake ...
> 
> On Tue, Feb 9, 2021 at 5:42 PM Bin Meng  wrote:
> >
> > Hi Philippe,
> >
> > On Tue, Feb 9, 2021 at 5:38 PM Philippe Mathieu-Daudé  
> > wrote:
> > >
> > > On 2/9/21 9:28 AM, Bin Meng wrote:
> > > > Hi Philippe,
> > > >
> > > > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé  
> > > > wrote:
> > > >>
> > > >> Per the "SD Host Controller Simplified Specification Version 2.00"
> > > >> spec. 'Table 2-4 : Block Size Register':
> > > >>
> > > >>   Transfer Block Size [...] can be accessed only if no
> > > >>   transaction is executing (i.e., after a transaction has stopped).
> > > >>   Read operations during transfers may return an invalid value,
> > > >>   and write operations shall be ignored.
> > > >>
> > > >> Transactions will update 'data_count', so do not modify 'blksize'
> > > >> and 'blkcnt' when 'data_count' is used. This fixes:
> > > >>
> > > >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \
> > > >>-nographic -serial none -M pc-q35-5.0 \
> > > >>-device sdhci-pci,sd-spec-version=3 \
> > > >>-device sd-card,drive=mydrive \
> > > >>-drive 
> > > >> if=sd,index=0,file=null-co://,format=raw,id=mydrive
> > > >>   outl 0xcf8 0x80001810
> > > >>   outl 0xcfc 0xe1068000
> > > >>   outl 0xcf8 0x80001814
> > > >
> > > > Is this command needed?
> > >
> > > My guess is this makes the northbridge somehow map the device PCI space.
> > >
> > > Probably not needed in machines where SDHCI is MMIO mapped.
> >
> > I think this is not needed. Writing only the CFG_ADDR
> 
> I think this is not needed. Writing only the CFG_ADDR without writing
> CFG_DATA does not take any effect.
> 

Ran it through scripts/oss-fuzz/minimize_qtest_trace.py , though that's
probably not very useful now:

cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \
 -nographic -serial none -M pc-q35-5.0 \
 -device sdhci-pci,sd-spec-version=3 \
 -device sd-card,drive=mydrive \
 -drive if=sd,index=0,file=null-co://,format=raw,id=mydrive
outl 0xcf8 0x80001810
outl 0xcfc 0xe1068000
outl 0xcf8 0x80001804
outw 0xcfc 0x7
write 0xe106802c 0x1 0x0f
write 0xe1068004 0x1 0x20
write 0xe1068005 0x1 0x01
write 0xe1068007 0x1 0x01
write 0xe106800c 0x1 0x33
write 0xe106800e 0x1 0x20
write 0xe106800f 0x1 0x0
write 0xe106800c 0x1 0x0
write 0xe106802a 0x1 0x11
write 0xe1068003 0x1 0x0
write 0xe1068005 0x1 0x00
write 0xe106800c 0x1 0x22
write 0xe106802a 0x1 0x12
write 0xe1068003 0x1 0x10
EOF

> >
> > >
> > > >
> > > >>   outl 0xcf8 0x80001804
> > > >>   outw 0xcfc 0x7
> > > >>   outl 0xcf8 0x8000fa20
> > > >
> > > > and this one?
> > >
> > > Ditto.
> > >
> > > >
> > > >>   write 0xe106802c 0x1 0x0f
> > > >>   write 0xe1068004 0xc 0x2801d10101fbff28a384
> > > >
> > > > Are these fuzzy data?
> > >
> > > Yes, I didn't try to understand what this does, as often
> > > non-sense operations. But this is what would craft a malicious
> > > attacker.
> > >
> > > >
> > > >>   write 0xe106800c 0x1f 
> > > >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f
> > > >>   write 0xe1068003 0x28 
> > > >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576
> > > >>   write 0xe1068003 0x1 0xfe
> > > >>   EOF
> > > >>   =
> > > >>   ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address 
> > > >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0
> > > >>   WRITE of size 4 at 0x6153bb00 thread T0
> > > >>   #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b)
> > > >>   #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5
> > > >>   #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1
> > > >>   #3 0x55ab483aeb4b in flatview_read_continue 
> > > >> softmmu/physmem.c:2839:13
> > > >>   #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12
> > > >>   #5 0x55ab483b028e in address_space_read_full 
> > > >> softmmu/physmem.c:2890:18
> > > >>   #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16
> > > >>   #7 0x55ab479374a2 in dma_memory_rw_relaxed 
> > > >> include/sysemu/dma.h:88:12
> > > >>   #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12
> > > >>   #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12
> > > >>   #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks 
> > > >> hw/sd/sdhci.c:639:13
> > > >>   #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17
> > > >>   #12 0x55ab483f8db8 in memory_region_write_accessor 
> > > >> softmmu/memory.c:491:5
> > > >>   #13 0x55ab483f868a in access_with_adjusted_size 
> > > >> softmmu/memory.c:552:18
> > > >>   #14 0x55ab483f6da5 in memory_region_dispatch_write 
> > > >> softmmu/memory.c:1501:16
> > > >>   #15 0x55ab483c3b11 in flatview_write_continue 
> > >

Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI

2021-02-09 Thread Thomas Huth

On 09/02/2021 14.50, Daniel P. Berrangé wrote:

The Travis CI system QEMU has been using has removed the unlimited free
usage model, replacing it with a one-time only grant of CI minutes that
is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI
credits, leaving them unable to test with Travis. This is not a
sustainable situation, so we have no choice but to discontinue use of
Travis. GitLab CI is now the primary target, with Cirrus CI filling
in some platform gaps where needed.


I've currently got a series in flight that moves some of the remaining jobs 
to gitlab-CI:


https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg01924.html

Could you please hold this patch 'til my series got merged first?

Also I think we could still wait some more weeks with the final removal of 
the travis-CI either 'til travis-ci.org got shut down completely (and thus 
we cannot use it for QEMU at all anymore), or until we finally got the s390x 
and aarch64 runners up and running in the gitlab-CI.


 Thomas




Re: [PATCH v4 0/9] hw/sd: Support block read/write in SPI mode

2021-02-09 Thread Bin Meng
Hi Philippe,

On Thu, Feb 4, 2021 at 2:02 PM Bin Meng  wrote:
>
> On Thu, Jan 28, 2021 at 2:30 PM Bin Meng  wrote:
> >
> > From: Bin Meng 
> >
> > This includes the previously v3 series [1], and one single patch [2].
> >
> > Compared to v3, this fixed the following issue in patch [v3,6/6]:
> > - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after
> >   receiving the STOP_TRAN token per the spec
> >
> > All software tested so far (U-Boot/Linux/VxWorks) do work without
> > > the fix, but it is better to conform with the spec.
> >
> > In addition to [2], one more issue was exposed when testing with
> > VxWorks driver related to STOP_TRANSMISSION (CMD12) response.
> >
> > [1] http://patchwork.ozlabs.org/project/qemu-devel/list/?series=226136
> > [2] 
> > http://patchwork.ozlabs.org/project/qemu-devel/patch/1611636214-52427-1-git-send-email-bmeng...@gmail.com/
> >
> > Changes in v4:
> > - Keep the card state to SSI_SD_CMD instead of SSI_SD_RESPONSE after
> >   receiving the STOP_TRAN token per the spec
> > - new patch: fix STOP_TRANSMISSION (CMD12) response
> > - new patch: handle the rest commands with R1b response type
> >
>
> Ping?

Will a PR be sent soon to include this series so that the SiFive SPI
series can follow?

Regards,
Bin



Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

09.02.2021 16:25, Max Reitz wrote:

On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

I know, I have several series waiting for a resend, but I had to switch
to another task spawned from our customer's bug.

Original problem: we use O_DIRECT for all vm images in our product, it's
the policy. The only exclusion is backup target qcow2 image for
compressed backup, because compressed backup is extremely slow with
O_DIRECT (due to unaligned writes). Customer complains that backup
produces a lot of pagecache.

So we can either implement some internal cache or use fadvise somehow.
Backup has several async workers, which write simultaneously, so in both
ways we have to track host cluster filling (before dropping the cache
corresponding to the cluster).  So, if we have to track anyway, let's
try to implement the cache.


I wanted to be excited here, because that sounds like it would be very easy to 
implement caching.  Like, just keep the cluster at free_byte_offset cached 
until the cluster it points to changes, then flush the cluster.


The problem is that chunks are written asynchronously.. That's why this all is 
not so easy.



But then I see like 900 new lines of code, and I’m much less excited...


Idea is simple: cache small unaligned write and flush the cluster when
filled.

Performance result is very good (results in a table is time of
compressed backup of 1000M disk filled with ones in seconds):


“Filled with ones” really is an edge case, though.


Yes, I think, all clusters are compressed to rather small chunks :)




---  ---  ---
  backup(old)  backup(new)
ssd:hdd(direct)  3e+02        4.4  (-99%)
ssd:hdd(cached)  5.7          5.4  (-5%)
---  ---  ---

So, we have benefit even for cached mode! And the fastest thing is
O_DIRECT with new implemented cache. So, I suggest to enable the new
cache by default (which is done by the series).


First, I’m not sure how O_DIRECT really is relevant, because I don’t really see 
the point for writing compressed images.


compressed backup is a point



Second, I find it a bit cheating if you say there is a huge improvement for the 
no-cache case, when actually, well, you just added a cache.  So the no-cache 
case just became faster because there is a cache now.


Still, performance comparison is relevant to show that O_DIRECT as is unusable 
for compressed backup.



Well, I suppose I could follow that if O_DIRECT doesn’t make much sense for 
compressed images, qemu’s format drivers are free to introduce some caching 
(because technically the cache.direct option only applies to the protocol 
driver) for collecting compressed writes.


Yes I thought in this way, enabling the cache by default.


That conclusion makes both of my complaints kind of moot.

*shrug*

Third, what is the real-world impact on the page cache?  You described that 
that’s the reason why you need the cache in qemu, because otherwise the page 
cache is polluted too much.  How much is the difference really?  (I don’t know 
how good the compression ratio is for real-world images.)


Hm. I don't know the ratio.. Customer reported that most of RAM is polluted by 
Qemu's cache, and we use O_DIRECT for everything except for target of 
compressed backup.. Still the pollution may relate to several backups and of 
course it is simple enough to drop the cache after each backup. But I think 
that even one backup of 16T disk may pollute RAM enough.



Related to that, I remember a long time ago we had some discussion about 
letting qemu-img convert set a special cache mode for the target image that 
would make Linux drop everything before the last offset written (i.e., I 
suppose fadvise() with POSIX_FADV_SEQUENTIAL).  You discarded that idea based on 
the fact that implementing a cache in qemu would be simple, but it isn’t, 
really.  What would the impact of POSIX_FADV_SEQUENTIAL be?  (One advantage of 
using that would be that we could reuse it for non-compressed images that are 
written by backup or qemu-img convert.)


The problem is that the writes are async, and therefore not sequential. So I have 
to track the writes and wait until the whole cluster is filled. It's simple to use 
fadvise as an option in my cache: instead of caching the data and writing it out when 
the cluster is filled, we can instead mark the cluster POSIX_FADV_DONTNEED.



(I don’t remember why that qemu-img discussion died back then.)


Fourth, regarding the code, would it be simpler if it were a pure write cache?  
I.e., on read, everything is flushed, so we don’t have to deal with that.  I 
don’t think there are many valid cases where a compressed image is both written 
to and read from at the same time.  (Just asking, because I’d really want this 
code to be simpler.  I can imagine that reading from the cache is the least bit 
of complexity, but perhaps...)



Hm. I really didn't want to support reads, and do it

Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI

2021-02-09 Thread Daniel P . Berrangé
On Tue, Feb 09, 2021 at 02:58:46PM +0100, Philippe Mathieu-Daudé wrote:
> On 2/9/21 2:50 PM, Daniel P. Berrangé wrote:
> > The Travis CI system QEMU has been using has removed the unlimited free
> > usage model, replacing it with a one-time only grant of CI minutes that
> > is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI
> > credits, leaving them unable to test with Travis. This is not a
> > sustainable situation, so we have no choice but to discontinue use of
> > Travis. GitLab CI is now the primary target, with Cirrus CI filling
> > in some platform gaps where needed.
> > 
> > Signed-off-by: Daniel P. Berrangé 
> > ---
> >  .travis.yml| 439 -
> >  MAINTAINERS|   3 -
> >  configure  |   1 -
> >  contrib/gitdm/filetypes.txt|   2 +-
> >  scripts/travis/coverage-summary.sh |  27 --
> >  tests/docker/docker.py |   2 +-
> >  tests/qemu-iotests/079 |   2 +-
> >  tests/test-util-filemonitor.c  |  11 -
> >  8 files changed, 3 insertions(+), 484 deletions(-)
> >  delete mode 100644 .travis.yml
> >  delete mode 100755 scripts/travis/coverage-summary.sh
> ...
> 
> > diff --git a/configure b/configure
> > index 7c496d81fc..058a7c7967 100755
> > --- a/configure
> > +++ b/configure
> > @@ -4872,7 +4872,6 @@ fi
> >  
> >  # See if __attribute__((alias)) is supported.
> >  # This false for Xcode 9, but has been remedied for Xcode 10.
> 
> Not related to this patch, but I don't think Xcode 9 is supported
> anymore.
> 
> > -# Unfortunately, travis uses Xcode 9 by default.
> >  
> >  attralias=no
> >  cat > $TMPC << EOF
> 
> > diff --git a/scripts/travis/coverage-summary.sh 
> > b/scripts/travis/coverage-summary.sh
> > deleted file mode 100755
> > index d7086cf9ca..00
> > --- a/scripts/travis/coverage-summary.sh
> > +++ /dev/null
> > @@ -1,27 +0,0 @@
> > -#!/bin/sh
> > -#
> > -# Author: Alex Bennée 
> > -#
> > -# Summerise the state of code coverage with gcovr and tweak the output
> > -# to be more sane on Travis hosts. As we expect to be executed on a
> > -# throw away CI instance we do spam temp files all over the shop. You
> > -# most likely don't want to execute this script but just call gcovr
> > -# directly. See also "make coverage-report"
> > -#
> > -# This code is licensed under the GPL version 2 or later.  See
> > -# the COPYING file in the top-level directory.
> > -
> > -# first generate the coverage report
> > -gcovr -p -o raw-report.txt
> > -
> > -# strip the full-path and line markers
> > -sed s@$PWD\/@@ raw-report.txt | sed s/[0-9]\*[,-]//g > simplified.txt
> > -
> > -# reflow lines that got split
> > -awk '/.[ch]$/ { printf("%s", $0); next } 1' simplified.txt > rejoined.txt
> > -
> > -# columnify
> > -column -t rejoined.txt > final.txt
> > -
> > -# and dump, stripping out 0% coverage
> > -grep -v "0%" final.txt
> 
> This script can be run on other CI.
> 
> Keeping scripts/travis/coverage-summary.sh (moved to
> scripts/ci/coverage-summary.sh):

I notice that the "gcovr" program used here should be able to output
an XML document in a format that is supported by GitLab, which can
then pretty-display the results.

If we do that, perhaps we won't need this coverage-summary script
for post-processing the text output format?

I guess we need to make sure gcovr is actually installed in all
of our dockerfiles used by GitLab.

> Reviewed-by: Philippe Mathieu-Daudé 
> 

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 2:50 PM, Daniel P. Berrangé wrote:
> The Travis CI system QEMU has been using has removed the unlimited free
> usage model, replacing it with a one-time only grant of CI minutes that
> is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI
> credits, leaving them unable to test with Travis. This is not a
> sustainable situation, so we have no choice but to discontinue use of
> Travis. GitLab CI is now the primary target, with Cirrus CI filling
> in some platform gaps where needed.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  .travis.yml| 439 -
>  MAINTAINERS|   3 -
>  configure  |   1 -
>  contrib/gitdm/filetypes.txt|   2 +-
>  scripts/travis/coverage-summary.sh |  27 --
>  tests/docker/docker.py |   2 +-
>  tests/qemu-iotests/079 |   2 +-
>  tests/test-util-filemonitor.c  |  11 -
>  8 files changed, 3 insertions(+), 484 deletions(-)
>  delete mode 100644 .travis.yml
>  delete mode 100755 scripts/travis/coverage-summary.sh
...

> diff --git a/configure b/configure
> index 7c496d81fc..058a7c7967 100755
> --- a/configure
> +++ b/configure
> @@ -4872,7 +4872,6 @@ fi
>  
>  # See if __attribute__((alias)) is supported.
>  # This false for Xcode 9, but has been remedied for Xcode 10.

Not related to this patch, but I don't think Xcode 9 is supported
anymore.

> -# Unfortunately, travis uses Xcode 9 by default.
>  
>  attralias=no
>  cat > $TMPC << EOF

> diff --git a/scripts/travis/coverage-summary.sh 
> b/scripts/travis/coverage-summary.sh
> deleted file mode 100755
> index d7086cf9ca..00
> --- a/scripts/travis/coverage-summary.sh
> +++ /dev/null
> @@ -1,27 +0,0 @@
> -#!/bin/sh
> -#
> -# Author: Alex Bennée 
> -#
> -# Summerise the state of code coverage with gcovr and tweak the output
> -# to be more sane on Travis hosts. As we expect to be executed on a
> -# throw away CI instance we do spam temp files all over the shop. You
> -# most likely don't want to execute this script but just call gcovr
> -# directly. See also "make coverage-report"
> -#
> -# This code is licensed under the GPL version 2 or later.  See
> -# the COPYING file in the top-level directory.
> -
> -# first generate the coverage report
> -gcovr -p -o raw-report.txt
> -
> -# strip the full-path and line markers
> -sed s@$PWD\/@@ raw-report.txt | sed s/[0-9]\*[,-]//g > simplified.txt
> -
> -# reflow lines that got split
> -awk '/.[ch]$/ { printf("%s", $0); next } 1' simplified.txt > rejoined.txt
> -
> -# columnify
> -column -t rejoined.txt > final.txt
> -
> -# and dump, stripping out 0% coverage
> -grep -v "0%" final.txt

This script can be run on other CI.

Keeping scripts/travis/coverage-summary.sh (moved to
scripts/ci/coverage-summary.sh):
Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 2/2] travis: remove travis configuration and all references to Travis CI

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 3:03 PM, Daniel P. Berrangé wrote:
> On Tue, Feb 09, 2021 at 02:58:46PM +0100, Philippe Mathieu-Daudé wrote:
>> On 2/9/21 2:50 PM, Daniel P. Berrangé wrote:
>>> The Travis CI system QEMU has been using has removed the unlimited free
>>> usage model, replacing it with a one-time only grant of CI minutes that
>>> is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI
>>> credits, leaving them unable to test with Travis. This is not a
>>> sustainable situation, so we have no choice but to discontinue use of
>>> Travis. GitLab CI is now the primary target, with Cirrus CI filling
>>> in some platform gaps where needed.
>>>
>>> Signed-off-by: Daniel P. Berrangé 
>>> ---
>>>  .travis.yml| 439 -
>>>  MAINTAINERS|   3 -
>>>  configure  |   1 -
>>>  contrib/gitdm/filetypes.txt|   2 +-
>>>  scripts/travis/coverage-summary.sh |  27 --
>>>  tests/docker/docker.py |   2 +-
>>>  tests/qemu-iotests/079 |   2 +-
>>>  tests/test-util-filemonitor.c  |  11 -
>>>  8 files changed, 3 insertions(+), 484 deletions(-)
>>>  delete mode 100644 .travis.yml
>>>  delete mode 100755 scripts/travis/coverage-summary.sh
>> ...
>>
>>> diff --git a/configure b/configure
>>> index 7c496d81fc..058a7c7967 100755
>>> --- a/configure
>>> +++ b/configure
>>> @@ -4872,7 +4872,6 @@ fi
>>>  
>>>  # See if __attribute__((alias)) is supported.
>>>  # This false for Xcode 9, but has been remedied for Xcode 10.
>>
>> Not related to this patch, but I don't think Xcode 9 is supported
>> anymore.
>>
>>> -# Unfortunately, travis uses Xcode 9 by default.
>>>  
>>>  attralias=no
>>>  cat > $TMPC << EOF
>>
>>> diff --git a/scripts/travis/coverage-summary.sh 
>>> b/scripts/travis/coverage-summary.sh
>>> deleted file mode 100755
>>> index d7086cf9ca..00
>>> --- a/scripts/travis/coverage-summary.sh
>>> +++ /dev/null
>>> @@ -1,27 +0,0 @@
>>> -#!/bin/sh
>>> -#
>>> -# Author: Alex Bennée 
>>> -#
>>> -# Summerise the state of code coverage with gcovr and tweak the output
>>> -# to be more sane on Travis hosts. As we expect to be executed on a
>>> -# throw away CI instance we do spam temp files all over the shop. You
>>> -# most likely don't want to execute this script but just call gcovr
>>> -# directly. See also "make coverage-report"
>>> -#
>>> -# This code is licensed under the GPL version 2 or later.  See
>>> -# the COPYING file in the top-level directory.
>>> -
>>> -# first generate the coverage report
>>> -gcovr -p -o raw-report.txt
>>> -
>>> -# strip the full-path and line markers
>>> -sed s@$PWD\/@@ raw-report.txt | sed s/[0-9]\*[,-]//g > simplified.txt
>>> -
>>> -# reflow lines that got split
>>> -awk '/.[ch]$/ { printf("%s", $0); next } 1' simplified.txt > rejoined.txt
>>> -
>>> -# columnify
>>> -column -t rejoined.txt > final.txt
>>> -
>>> -# and dump, stripping out 0% coverage
>>> -grep -v "0%" final.txt
>>
>> This script can be run on other CI.
>>
>> Keeping scripts/travis/coverage-summary.sh (moved to
>> scripts/ci/coverage-summary.sh):
> 
> I notice that the "gcovr" program used here should be able to output
> an XML document in a format that is supported by GitLab, which can
> then pretty-display the results.

Good idea.

> If we do that, perhaps we won't need this coverage-summary script
> for post-processing the text output format?

This indeed requires further testing. I'd worry about that later.

I'll let Alex see how he wants to deal with that, we can still
add the script back later.

> I guess we need to make sure gcovr is actually installed in all
> of our dockerfiles used by GitLab.
> 
>> Reviewed-by: Philippe Mathieu-Daudé 
>>
> 
> Regards,
> Daniel
> 




[PATCH 2/2] travis: remove travis configuration and all references to Travis CI

2021-02-09 Thread Daniel P . Berrangé
The Travis CI system QEMU has been using has removed the unlimited free
usage model, replacing it with a one-time only grant of CI minutes that
is not renewed. The QEMU CI jobs quickly exhaust maintainer's free CI
credits, leaving them unable to test with Travis. This is not a
sustainable situation, so we have no choice but to discontinue use of
Travis. GitLab CI is now the primary target, with Cirrus CI filling
in some platform gaps where needed.

Signed-off-by: Daniel P. Berrangé 
---
 .travis.yml| 439 -
 MAINTAINERS|   3 -
 configure  |   1 -
 contrib/gitdm/filetypes.txt|   2 +-
 scripts/travis/coverage-summary.sh |  27 --
 tests/docker/docker.py |   2 +-
 tests/qemu-iotests/079 |   2 +-
 tests/test-util-filemonitor.c  |  11 -
 8 files changed, 3 insertions(+), 484 deletions(-)
 delete mode 100644 .travis.yml
 delete mode 100755 scripts/travis/coverage-summary.sh

diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 5f1dea873e..00
--- a/.travis.yml
+++ /dev/null
@@ -1,439 +0,0 @@
-# The current Travis default is a VM based 16.04 Xenial on GCE
-# Additional builds with specific requirements for a full VM need to
-# be added as additional matrix: entries later on
-os: linux
-dist: focal
-language: c
-compiler:
-  - gcc
-cache:
-  # There is one cache per branch and compiler version.
-  # characteristics of each job are used to identify the cache:
-  # - OS name (currently only linux)
-  # - OS distribution (for Linux, bionic or focal)
-  # - Names and values of visible environment variables set in .travis.yml or 
Settings panel
-  timeout: 1200
-  ccache: true
-  pip: true
-  directories:
-  - $HOME/avocado/data/cache
-
-
-addons:
-  apt:
-packages:
-  # Build dependencies
-  - libaio-dev
-  - libattr1-dev
-  - libbrlapi-dev
-  - libcap-ng-dev
-  - libgcc-7-dev
-  - libgnutls28-dev
-  - libgtk-3-dev
-  - libiscsi-dev
-  - liblttng-ust-dev
-  - libncurses5-dev
-  - libnfs-dev
-  - libnss3-dev
-  - libpixman-1-dev
-  - libpng-dev
-  - librados-dev
-  - libsdl2-dev
-  - libsdl2-image-dev
-  - libseccomp-dev
-  - libspice-protocol-dev
-  - libspice-server-dev
-  - libssh-dev
-  - liburcu-dev
-  - libusb-1.0-0-dev
-  - libvdeplug-dev
-  - libvte-2.91-dev
-  - libzstd-dev
-  - ninja-build
-  - sparse
-  - uuid-dev
-  - gcovr
-  # Tests dependencies
-  - genisoimage
-
-
-# The channel name "irc.oftc.net#qemu" is encrypted against qemu/qemu
-# to prevent IRC notifications from forks. This was created using:
-# $ travis encrypt -r "qemu/qemu" "irc.oftc.net#qemu"
-notifications:
-  irc:
-channels:
-  - secure: 
"F7GDRgjuOo5IUyRLqSkmDL7kvdU4UcH3Lm/W2db2JnDHTGCqgEdaYEYKciyCLZ57vOTsTsOgesN8iUT7hNHBd1KWKjZe9KDTZWppWRYVwAwQMzVeSOsbbU4tRoJ6Pp+3qhH1Z0eGYR9ZgKYAoTumDFgSAYRp4IscKS8jkoedOqM="
-on_success: change
-on_failure: always
-
-
-env:
-  global:
-- SRC_DIR=".."
-- BUILD_DIR="build"
-- BASE_CONFIG="--disable-docs --disable-tools"
-- TEST_BUILD_CMD=""
-- TEST_CMD="make check V=1"
-# This is broadly a list of "mainline" softmmu targets which have support 
across the major distros
-- 
MAIN_SOFTMMU_TARGETS="aarch64-softmmu,mips64-softmmu,ppc64-softmmu,riscv64-softmmu,s390x-softmmu,x86_64-softmmu"
-- CCACHE_SLOPPINESS="include_file_ctime,include_file_mtime"
-- CCACHE_MAXSIZE=1G
-- G_MESSAGES_DEBUG=error
-
-
-git:
-  # we want to do this ourselves
-  submodules: false
-
-# Common first phase for all steps
-before_install:
-  - if command -v ccache ; then ccache --zero-stats ; fi
-  - export JOBS=$(($(getconf _NPROCESSORS_ONLN) + 1))
-  - echo "=== Using ${JOBS} simultaneous jobs ==="
-
-# Configure step - may be overridden
-before_script:
-  - mkdir -p ${BUILD_DIR} && cd ${BUILD_DIR}
-  - ${SRC_DIR}/configure ${BASE_CONFIG} ${CONFIG} || { cat config.log 
meson-logs/meson-log.txt && exit 1; }
-
-# Main build & test - rarely overridden - controlled by TEST_CMD
-script:
-  - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$?
-  - |
-if [ "$BUILD_RC" -eq 0 ] && [ -n "$TEST_BUILD_CMD" ]; then
-${TEST_BUILD_CMD} || BUILD_RC=$?
-else
-$(exit $BUILD_RC);
-fi
-  - |
-if [ "$BUILD_RC" -eq 0 ] ; then
-${TEST_CMD} ;
-else
-$(exit $BUILD_RC);
-fi
-after_script:
-  - df -h
-  - if command -v ccache ; then ccache --show-stats ; fi
-
-
-jobs:
-  include:
-# --enable-debug implies --enable-debug-tcg, also runs quite a bit slower
-- name: "GCC debug (main-softmmu)"
-  env:
-- CONFIG="--enable-debug --target-list=${MAIN_SOFTMMU_TARGETS}"
-- CACHE_NAME="${TRAVIS_BRANCH}-linux-gcc-debug"
-
-
-# TCG debug can be run just on its own and is mostly agnostic to 
user/softmmu distinctions
-- name: "GCC debug (user)"
- 

[PATCH 1/2] tests/docker: remove travis container

2021-02-09 Thread Daniel P . Berrangé
The travis container that we have no longer matches what travis
currently uses. As all x86 jobs are being moved to GitLab CI too,
there is no compelling reason to update the travis container. It
is simpler to just remove it.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Wainer dos Santos Moschetta 
Signed-off-by: Daniel P. Berrangé 
---
 docs/devel/testing.rst | 14 
 tests/docker/Makefile.include  | 11 ++
 tests/docker/dockerfiles/travis.docker | 17 --
 tests/docker/travis| 22 
 tests/docker/travis.py | 47 --
 5 files changed, 2 insertions(+), 109 deletions(-)
 delete mode 100644 tests/docker/dockerfiles/travis.docker
 delete mode 100755 tests/docker/travis
 delete mode 100755 tests/docker/travis.py

diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst
index 209f9d8172..00ce16de48 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -357,20 +357,6 @@ source and build it.
 
 The full list of tests is printed in the ``make docker`` help.
 
-Tools
--
-
-There are executables that are created to run in a specific Docker environment.
-This makes it easy to write scripts that have heavy or special dependencies,
-but are still very easy to use.
-
-Currently the only tool is ``travis``, which mimics the Travis-CI tests in a
-container. It runs in the ``travis`` image:
-
-.. code::
-
-  make docker-travis@travis
-
 Debugging a Docker test failure
 ---
 
diff --git a/tests/docker/Makefile.include b/tests/docker/Makefile.include
index 93b29ad823..7cab761bf5 100644
--- a/tests/docker/Makefile.include
+++ b/tests/docker/Makefile.include
@@ -21,8 +21,6 @@ DOCKER_REGISTRY := $(if 
$(REGISTRY),$(REGISTRY),registry.gitlab.com/qemu-project
 DOCKER_TESTS := $(notdir $(shell \
find $(SRC_PATH)/tests/docker/ -name 'test-*' -type f))
 
-DOCKER_TOOLS := travis
-
 ENGINE := auto
 
 DOCKER_SCRIPT=$(SRC_PATH)/tests/docker/docker.py --engine $(ENGINE)
@@ -126,7 +124,7 @@ ifneq ($(HOST_ARCH),x86_64)
 DOCKER_PARTIAL_IMAGES += debian-mips-cross debian-mipsel-cross 
debian-mips64el-cross
 DOCKER_PARTIAL_IMAGES += debian-ppc64el-cross
 DOCKER_PARTIAL_IMAGES += debian-s390x-cross
-DOCKER_PARTIAL_IMAGES += fedora travis
+DOCKER_PARTIAL_IMAGES += fedora
 endif
 
 docker-image-debian-alpha-cross: docker-image-debian10
@@ -147,8 +145,6 @@ docker-image-debian-s390x-cross: docker-image-debian10
 docker-image-debian-sh4-cross: docker-image-debian10
 docker-image-debian-sparc64-cross: docker-image-debian10
 
-docker-image-travis: NOUSER=1
-
 # Specialist build images, sometimes very limited tools
 docker-image-debian-tricore-cross: docker-image-debian10
 docker-image-debian-all-test-cross: docker-image-debian10
@@ -174,7 +170,7 @@ DOCKER_PARTIAL_IMAGES += fedora-i386-cross fedora-cris-cross
 
 # Expand all the pre-requistes for each docker image and test combination
 $(foreach i,$(filter-out $(DOCKER_PARTIAL_IMAGES),$(DOCKER_IMAGES)), \
-   $(foreach t,$(DOCKER_TESTS) $(DOCKER_TOOLS), \
+   $(foreach t,$(DOCKER_TESTS), \
$(eval .PHONY: docker-$t@$i) \
$(eval docker-$t@$i: docker-image-$i docker-run-$t@$i) \
) \
@@ -212,9 +208,6 @@ endif
@echo 'Available tests:'
@echo '$(DOCKER_TESTS)'
@echo
-   @echo 'Available tools:'
-   @echo '$(DOCKER_TOOLS)'
-   @echo
@echo 'Special variables:'
@echo 'TARGET_LIST=a,b,cOverride target list in builds.'
@echo 'EXTRA_CONFIGURE_OPTS="..."'
diff --git a/tests/docker/dockerfiles/travis.docker 
b/tests/docker/dockerfiles/travis.docker
deleted file mode 100644
index cd1435a7e9..00
--- a/tests/docker/dockerfiles/travis.docker
+++ /dev/null
@@ -1,17 +0,0 @@
-#
-# Travis Image - this is broadly the same image that we run our CI
-# tests on.
-#
-FROM travisci/ci-sardonyx:packer-1552557266-f909ac5
-ENV DEBIAN_FRONTEND noninteractive
-ENV LANG en_US.UTF-8
-ENV LC_ALL en_US.UTF-8
-RUN sed -i "s/# deb-src/deb-src/" /etc/apt/sources.list
-RUN apt-get update
-RUN apt-get -y build-dep qemu
-RUN apt-get -y install device-tree-compiler python3 python3-yaml dh-autoreconf 
gdb strace lsof net-tools gcovr ninja-build
-# Travis tools require PhantomJS / Neo4j / Maven accessible
-# in their PATH (QEMU build won't access them).
-ENV PATH 
/usr/local/phantomjs/bin:/usr/local/phantomjs:/usr/local/neo4j-3.2.7/bin:/usr/local/maven-3.5.2/bin:/usr/local/cmake-3.9.2/bin:/usr/local/clang-5.0.0/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
-ENV FEATURES clang pyyaml docs
-USER travis
diff --git a/tests/docker/travis b/tests/docker/travis
deleted file mode 100755
index 47c03677d6..00
--- a/tests/docker/travis
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash -e
-#
-# Mimic a travis testing matrix
-#
-# Copyright (c) 2016 Red Hat Inc.
-#
-# Authors:
-#  Fam Zheng 
-#
-# This work is licensed under the terms of t

[PATCH 0/2] travis: remove all use of Travis CI

2021-02-09 Thread Daniel P . Berrangé
It is not sustainable to keep supporting Travis CI when our maintainers
have exhausted their free CI credit allowance and it isn't easily
renewable for most of them.

While there are still some unique scenarios covered by Travis, this is
not useful when maintainers can't run the pipelines.  If people see
scenarios that are desirable for GitLab CI they can be added as jobs
when desired.

Daniel P. Berrangé (2):
  tests/docker: remove travis container
  travis: remove travis configuration and all references to Travis CI

 .travis.yml| 439 -
 MAINTAINERS|   3 -
 configure  |   1 -
 contrib/gitdm/filetypes.txt|   2 +-
 docs/devel/testing.rst |  14 -
 scripts/travis/coverage-summary.sh |  27 --
 tests/docker/Makefile.include  |  11 +-
 tests/docker/docker.py |   2 +-
 tests/docker/dockerfiles/travis.docker |  17 -
 tests/docker/travis|  22 --
 tests/docker/travis.py |  47 ---
 tests/qemu-iotests/079 |   2 +-
 tests/test-util-filemonitor.c  |  11 -
 13 files changed, 5 insertions(+), 593 deletions(-)
 delete mode 100644 .travis.yml
 delete mode 100755 scripts/travis/coverage-summary.sh
 delete mode 100644 tests/docker/dockerfiles/travis.docker
 delete mode 100755 tests/docker/travis
 delete mode 100755 tests/docker/travis.py

-- 
2.29.2





Re: [PATCH 0/7] qcow2: compressed write cache

2021-02-09 Thread Max Reitz

On 29.01.21 17:50, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

I know, I have several series waiting for a resend, but I had to switch
to another task spawned from our customer's bug.

Original problem: we use O_DIRECT for all vm images in our product, it's
the policy. The only exclusion is backup target qcow2 image for
compressed backup, because compressed backup is extremely slow with
O_DIRECT (due to unaligned writes). Customer complains that backup
produces a lot of pagecache.

So we can either implement some internal cache or use fadvise somehow.
Backup has several async workes, which writes simultaneously, so in both
ways we have to track host cluster filling (before dropping the cache
corresponding to the cluster).  So, if we have to track anyway, let's
try to implement the cache.


I wanted to be excited here, because that sounds like it would be very 
easy to implement caching.  Like, just keep the cluster at 
free_byte_offset cached until the cluster it points to changes, then 
flush the cluster.


But then I see like 900 new lines of code, and I’m much less excited...


Idea is simple: cache small unaligned write and flush the cluster when
filled.

Performance result is very good (results in a table is time of
compressed backup of 1000M disk filled with ones in seconds):


“Filled with ones” really is an edge case, though.


---  ---  ---
  backup(old)  backup(new)
ssd:hdd(direct)  3e+024.4
 -99%
ssd:hdd(cached)  5.7  5.4
 -5%
---  ---  ---

So, we have benefit even for cached mode! And the fastest thing is
O_DIRECT with new implemented cache. So, I suggest to enable the new
cache by default (which is done by the series).


First, I’m not sure how O_DIRECT really is relevant, because I don’t 
really see the point for writing compressed images.


Second, I find it a bit cheating if you say there is a huge improvement 
for the no-cache case, when actually, well, you just added a cache.  So 
the no-cache case just became faster because there is a cache now.


Well, I suppose I could follow that if O_DIRECT doesn’t make much sense 
for compressed images, qemu’s format drivers are free to introduce some 
caching (because technically the cache.direct option only applies to the 
protocol driver) for collecting compressed writes.  That conclusion 
makes both of my complaints kind of moot.


*shrug*

Third, what is the real-world impact on the page cache?  You described 
that that’s the reason why you need the cache in qemu, because otherwise 
the page cache is polluted too much.  How much is the difference really? 
 (I don’t know how good the compression ratio is for real-world images.)


Related to that, I remember a long time ago we had some discussion about 
letting qemu-img convert set a special cache mode for the target image 
that would make Linux drop everything before the last offset written 
(i.e., I suppose fadvise() with POSIX_FADV_SEQUENTIAL).  You discarded 
that idea based on the fact that implementing a cache in qemu would be 
simple, but it isn’t, really.  What would the impact of 
POSIX_FADV_SEQUENTIAL be?  (One advantage of using that would be that we 
could reuse it for non-compressed images that are written by backup or 
qemu-img convert.)


(I don’t remember why that qemu-img discussion died back then.)


Fourth, regarding the code, would it be simpler if it were a pure write 
cache?  I.e., on read, everything is flushed, so we don’t have to deal 
with that.  I don’t think there are many valid cases where a compressed 
image is both written to and read from at the same time.  (Just asking, 
because I’d really want this code to be simpler.  I can imagine that 
reading from the cache is the least bit of complexity, but perhaps...)


Max




[PATCH 1/2] hw/nvme: move nvme emulation out of hw/block

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

With the introduction of the nvme-subsystem device we are really
cluttering up the hw/block directory.

As suggested by Philippe previously, move the nvme emulation to
hw/nvme.

Suggested-by: Philippe Mathieu-Daudé 
Signed-off-by: Klaus Jensen 
---
 meson.build   |   1 +
 hw/block/nvme-ns.h| 193 -
 hw/block/nvme-subsys.h|  32 
 hw/{block => nvme}/nvme.h | 198 +-
 hw/nvme/trace.h   |   1 +
 hw/{block/nvme.c => nvme/ctrl.c}  |   1 -
 hw/{block/nvme-ns.c => nvme/ns.c} |   1 -
 hw/{block/nvme-subsys.c => nvme/subsys.c} |   2 +-
 MAINTAINERS   |   2 +-
 hw/Kconfig|   1 +
 hw/block/Kconfig  |   5 -
 hw/block/meson.build  |   1 -
 hw/block/trace-events | 180 
 hw/meson.build|   1 +
 hw/nvme/Kconfig   |   4 +
 hw/nvme/meson.build   |   1 +
 hw/nvme/trace-events  | 178 +++
 17 files changed, 385 insertions(+), 417 deletions(-)
 delete mode 100644 hw/block/nvme-ns.h
 delete mode 100644 hw/block/nvme-subsys.h
 rename hw/{block => nvme}/nvme.h (55%)
 create mode 100644 hw/nvme/trace.h
 rename hw/{block/nvme.c => nvme/ctrl.c} (99%)
 rename hw/{block/nvme-ns.c => nvme/ns.c} (99%)
 rename hw/{block/nvme-subsys.c => nvme/subsys.c} (98%)
 create mode 100644 hw/nvme/Kconfig
 create mode 100644 hw/nvme/meson.build
 create mode 100644 hw/nvme/trace-events

diff --git a/meson.build b/meson.build
index e3386196ba41..255f54918786 100644
--- a/meson.build
+++ b/meson.build
@@ -1433,6 +1433,7 @@ if have_system
 'hw/misc',
 'hw/misc/macio',
 'hw/net',
+'hw/nvme',
 'hw/nvram',
 'hw/pci',
 'hw/pci-host',
diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
deleted file mode 100644
index 7af6884862b5..
--- a/hw/block/nvme-ns.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * QEMU NVM Express Virtual Namespace
- *
- * Copyright (c) 2019 CNEX Labs
- * Copyright (c) 2020 Samsung Electronics
- *
- * Authors:
- *  Klaus Jensen  
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See the
- * COPYING file in the top-level directory.
- *
- */
-
-#ifndef NVME_NS_H
-#define NVME_NS_H
-
-#define TYPE_NVME_NS "nvme-ns"
-#define NVME_NS(obj) \
-OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
-
-typedef struct NvmeZone {
-NvmeZoneDescr   d;
-uint64_tw_ptr;
-QTAILQ_ENTRY(NvmeZone) entry;
-} NvmeZone;
-
-typedef struct NvmeNamespaceParams {
-uint32_t nsid;
-QemuUUID uuid;
-
-uint16_t mssrl;
-uint32_t mcl;
-uint8_t  msrc;
-
-bool zoned;
-bool cross_zone_read;
-uint64_t zone_size_bs;
-uint64_t zone_cap_bs;
-uint32_t max_active_zones;
-uint32_t max_open_zones;
-uint32_t zd_extension_size;
-} NvmeNamespaceParams;
-
-typedef struct NvmeNamespace {
-DeviceState  parent_obj;
-BlockConfblkconf;
-int32_t  bootindex;
-int64_t  size;
-NvmeIdNs id_ns;
-const uint32_t *iocs;
-uint8_t  csi;
-
-NvmeSubsystem   *subsys;
-
-NvmeIdNsZoned   *id_ns_zoned;
-NvmeZone*zone_array;
-QTAILQ_HEAD(, NvmeZone) exp_open_zones;
-QTAILQ_HEAD(, NvmeZone) imp_open_zones;
-QTAILQ_HEAD(, NvmeZone) closed_zones;
-QTAILQ_HEAD(, NvmeZone) full_zones;
-uint32_tnum_zones;
-uint64_tzone_size;
-uint64_tzone_capacity;
-uint32_tzone_size_log2;
-uint8_t *zd_extensions;
-int32_t nr_open_zones;
-int32_t nr_active_zones;
-
-NvmeNamespaceParams params;
-
-struct {
-uint32_t err_rec;
-} features;
-} NvmeNamespace;
-
-static inline uint32_t nvme_nsid(NvmeNamespace *ns)
-{
-if (ns) {
-return ns->params.nsid;
-}
-
-return -1;
-}
-
-static inline bool nvme_ns_shared(NvmeNamespace *ns)
-{
-return !!ns->subsys;
-}
-
-static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
-{
-NvmeIdNs *id_ns = &ns->id_ns;
-return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
-}
-
-static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
-{
-return nvme_ns_lbaf(ns)->ds;
-}
-
-/* calculate the number of LBAs that the namespace can accomodate */
-static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
-{
-return ns->size >> nvme_ns_lbads(ns);
-}
-
-/* convert an LBA to the equivalent in bytes */
-static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
-{
-return lba << nvme_ns_lbads(ns);
-}
-
-typedef struct NvmeCtrl NvmeCtrl;
-
-static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
-{
-return zone->d.zs >> 4;
-}
-
-static inline void nvme_set_zone_state(NvmeZone *zone, NvmeZoneState state)
-{
-zo

[PATCH 2/2] hw/nvme: move device-scoped functions

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Move a bunch of functions that are internal to a device out of the
shared header.

Signed-off-by: Klaus Jensen 
---
 hw/nvme/nvme.h | 110 +
 hw/nvme/ctrl.c |  90 +++-
 hw/nvme/ns.c   |   7 +++-
 3 files changed, 97 insertions(+), 110 deletions(-)

diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 452a64499b1b..929c6c553ca2 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -96,36 +96,13 @@ static inline uint32_t nvme_nsid(NvmeNamespace *ns)
 return -1;
 }
 
-static inline bool nvme_ns_shared(NvmeNamespace *ns)
-{
-return !!ns->subsys;
-}
-
-static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
-{
-NvmeIdNs *id_ns = &ns->id_ns;
-return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
-}
-
 static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
 {
-return nvme_ns_lbaf(ns)->ds;
-}
+NvmeLBAF lbaf = ns->id_ns.lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)];
 
-/* calculate the number of LBAs that the namespace can accomodate */
-static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
-{
-return ns->size >> nvme_ns_lbads(ns);
+return lbaf.ds;
 }
 
-/* convert an LBA to the equivalent in bytes */
-static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
-{
-return lba << nvme_ns_lbads(ns);
-}
-
-typedef struct NvmeCtrl NvmeCtrl;
-
 static inline NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
 {
 return zone->d.zs >> 4;
@@ -136,31 +113,6 @@ static inline void nvme_set_zone_state(NvmeZone *zone, 
NvmeZoneState state)
 zone->d.zs = state << 4;
 }
 
-static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
-{
-return zone->d.zslba + ns->zone_size;
-}
-
-static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
-{
-return zone->d.zslba + zone->d.zcap;
-}
-
-static inline bool nvme_wp_is_valid(NvmeZone *zone)
-{
-uint8_t st = nvme_get_zone_state(zone);
-
-return st != NVME_ZONE_STATE_FULL &&
-   st != NVME_ZONE_STATE_READ_ONLY &&
-   st != NVME_ZONE_STATE_OFFLINE;
-}
-
-static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
- uint32_t zone_idx)
-{
-return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
-}
-
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
@@ -203,7 +155,6 @@ void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
 void nvme_ns_cleanup(NvmeNamespace *ns);
 
-
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
@@ -237,40 +188,6 @@ typedef struct NvmeRequest {
 QTAILQ_ENTRY(NvmeRequest)entry;
 } NvmeRequest;
 
-static inline const char *nvme_adm_opc_str(uint8_t opc)
-{
-switch (opc) {
-case NVME_ADM_CMD_DELETE_SQ:return "NVME_ADM_CMD_DELETE_SQ";
-case NVME_ADM_CMD_CREATE_SQ:return "NVME_ADM_CMD_CREATE_SQ";
-case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE";
-case NVME_ADM_CMD_DELETE_CQ:return "NVME_ADM_CMD_DELETE_CQ";
-case NVME_ADM_CMD_CREATE_CQ:return "NVME_ADM_CMD_CREATE_CQ";
-case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY";
-case NVME_ADM_CMD_ABORT:return "NVME_ADM_CMD_ABORT";
-case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES";
-case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
-case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
-default:return "NVME_ADM_CMD_UNKNOWN";
-}
-}
-
-static inline const char *nvme_io_opc_str(uint8_t opc)
-{
-switch (opc) {
-case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
-case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
-case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
-case NVME_CMD_COMPARE:  return "NVME_NVM_CMD_COMPARE";
-case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
-case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
-case NVME_CMD_COPY: return "NVME_NVM_CMD_COPY";
-case NVME_CMD_ZONE_MGMT_SEND:   return "NVME_ZONED_CMD_MGMT_SEND";
-case NVME_CMD_ZONE_MGMT_RECV:   return "NVME_ZONED_CMD_MGMT_RECV";
-case NVME_CMD_ZONE_APPEND:  return "NVME_ZONED_CMD_ZONE_APPEND";
-default:return "NVME_NVM_CMD_UNKNOWN";
-}
-}
-
 typedef struct NvmeSQueue {
 struct NvmeCtrl *ctrl;
 uint16_tsqid;
@@ -379,29 +296,6 @@ typedef struct NvmeCtrl {
 NvmeFeatureVal  features;
 } NvmeCtrl;
 
-static inline NvmeNamespace *nvme_ns(NvmeCtrl *n, uint32_t nsid)
-{
-if (!nsid || nsid > n->num_namespaces) {
-return NULL;
-}
-
-return n->namespaces[nsid - 1];
-}
-
-static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
-{
-NvmeSQueue *sq = req->sq;
-NvmeCtrl *n = s

Re: [PATCH v2] hw/block/nvme: use locally assigned QEMU IEEE OUI

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 11:45 AM, Klaus Jensen wrote:
> From: Gollu Appalanaidu 
> 
> Commit 6eb7a071292a ("hw/block/nvme: change controller pci id") changed
> the controller to use a Red Hat assigned PCI Device and Vendor ID, but
> did not change the IEEE OUI away from the Intel IEEE OUI.
> 
> Fix that and use the locally assigned QEMU IEEE OUI instead if the
> `use-intel-id` parameter is not explicitly set. Also reverse the Intel
> IEEE OUI bytes.
> 
> Signed-off-by: Gollu Appalanaidu 
> Signed-off-by: Klaus Jensen 
> ---
> 
> v2: drop telemetry and add a check on the use_intel_id parameter.
> 
>  hw/block/nvme.c | 14 +++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index c2f0c88fbf39..870e9d8e1c17 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -4685,9 +4685,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  id->cntlid = cpu_to_le16(n->cntlid);
>  
>  id->rab = 6;
> -id->ieee[0] = 0x00;
> -id->ieee[1] = 0x02;
> -id->ieee[2] = 0xb3;
> +
> +if (n->params.use_intel_id) {
> +id->ieee[0] = 0xb3;
> +id->ieee[1] = 0x02;
> +id->ieee[2] = 0x00;
> +} else {
> +id->ieee[0] = 0x00;
> +id->ieee[1] = 0x54;
> +id->ieee[2] = 0x52;
> +}

Correct.

Reviewed-by: Philippe Mathieu-Daudé 

Ideally we should have definitions and use them here and in
qemu_macaddr_default_if_unset() instead of this magic values.

> +
>  id->mdts = n->params.mdts;
>  id->ver = cpu_to_le32(NVME_SPEC_VER);
>  id->oacs = cpu_to_le16(0);
> 




[PATCH 0/2] hw/nvme: move nvme emulation out of hw/block

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

With the introduction of the nvme-subsystem device we are really
cluttering up the hw/block directory.

As suggested by Philippe previously, move the nvme emulation to hw/nvme.

Klaus Jensen (2):
  hw/nvme: move nvme emulation out of hw/block
  hw/nvme: move device-scoped functions

 meson.build   |   1 +
 hw/block/nvme-ns.h| 193 
 hw/block/nvme-subsys.h|  32 
 hw/{block => nvme}/nvme.h | 206 +++---
 hw/nvme/trace.h   |   1 +
 hw/{block/nvme.c => nvme/ctrl.c}  |  91 +-
 hw/{block/nvme-ns.c => nvme/ns.c} |   8 +-
 hw/{block/nvme-subsys.c => nvme/subsys.c} |   2 +-
 MAINTAINERS   |   2 +-
 hw/Kconfig|   1 +
 hw/block/Kconfig  |   5 -
 hw/block/meson.build  |   1 -
 hw/block/trace-events | 180 ---
 hw/meson.build|   1 +
 hw/nvme/Kconfig   |   4 +
 hw/nvme/meson.build   |   1 +
 hw/nvme/trace-events  | 178 +++
 17 files changed, 431 insertions(+), 476 deletions(-)
 delete mode 100644 hw/block/nvme-ns.h
 delete mode 100644 hw/block/nvme-subsys.h
 rename hw/{block => nvme}/nvme.h (51%)
 create mode 100644 hw/nvme/trace.h
 rename hw/{block/nvme.c => nvme/ctrl.c} (97%)
 rename hw/{block/nvme-ns.c => nvme/ns.c} (98%)
 rename hw/{block/nvme-subsys.c => nvme/subsys.c} (98%)
 create mode 100644 hw/nvme/Kconfig
 create mode 100644 hw/nvme/meson.build
 create mode 100644 hw/nvme/trace-events

-- 
2.30.0




Re: [PATCH] hw/sd: sdhci: Do not transfer any data when command fails

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 11:54 AM, Bin Meng wrote:
> At the end of sdhci_send_command(), it starts a data transfer if
> the command register indicates a data is associated. However the
> data transfer should only be initiated when the command execution
> has succeeded.
> 
> Cc: qemu-sta...@nongnu.org
> Fixes: CVE-2020-17380
> Fixes: CVE-2020-25085
> Reported-by: Alexander Bulekov 
> Reported-by: Sergej Schumilo (Ruhr-University Bochum)
> Reported-by: Cornelius Aschermann (Ruhr-University Bochum)
> Reported-by: Simon Wörner (Ruhr-University Bochum)
> Buglink: https://bugs.launchpad.net/qemu/+bug/1892960
> Signed-off-by: Bin Meng 
> ---
> 
>  hw/sd/sdhci.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)

Tested-by: Philippe Mathieu-Daudé 



[PATCH] hw/sd: sdhci: Do not transfer any data when command fails

2021-02-09 Thread Bin Meng
At the end of sdhci_send_command(), it starts a data transfer if
the command register indicates a data is associated. However the
data transfer should only be initiated when the command execution
has succeeded.

Cc: qemu-sta...@nongnu.org
Fixes: CVE-2020-17380
Fixes: CVE-2020-25085
Reported-by: Alexander Bulekov 
Reported-by: Sergej Schumilo (Ruhr-University Bochum)
Reported-by: Cornelius Aschermann (Ruhr-University Bochum)
Reported-by: Simon Wörner (Ruhr-University Bochum)
Buglink: https://bugs.launchpad.net/qemu/+bug/1892960
Signed-off-by: Bin Meng 
---

 hw/sd/sdhci.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 8ffa539..0450110 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -326,6 +326,7 @@ static void sdhci_send_command(SDHCIState *s)
 SDRequest request;
 uint8_t response[16];
 int rlen;
+bool cmd_failure = false;
 
 s->errintsts = 0;
 s->acmd12errsts = 0;
@@ -349,6 +350,7 @@ static void sdhci_send_command(SDHCIState *s)
 trace_sdhci_response16(s->rspreg[3], s->rspreg[2],
s->rspreg[1], s->rspreg[0]);
 } else {
+cmd_failure = true;
 trace_sdhci_error("timeout waiting for command response");
 if (s->errintstsen & SDHC_EISEN_CMDTIMEOUT) {
 s->errintsts |= SDHC_EIS_CMDTIMEOUT;
@@ -369,7 +371,7 @@ static void sdhci_send_command(SDHCIState *s)
 
 sdhci_update_irq(s);
 
-if (s->blksize && (s->cmdreg & SDHC_CMD_DATA_PRESENT)) {
+if (!cmd_failure && s->blksize && (s->cmdreg & SDHC_CMD_DATA_PRESENT)) {
 s->data_count = 0;
 sdhci_data_transfer(s);
 }
-- 
2.7.4




[PATCH v2] hw/block/nvme: use locally assigned QEMU IEEE OUI

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Commit 6eb7a071292a ("hw/block/nvme: change controller pci id") changed
the controller to use a Red Hat assigned PCI Device and Vendor ID, but
did not change the IEEE OUI away from the Intel IEEE OUI.

Fix that and use the locally assigned QEMU IEEE OUI instead if the
`use-intel-id` parameter is not explicitly set. Also reverse the Intel
IEEE OUI bytes.

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---

v2: drop telemetry and add a check on the use_intel_id parameter.

 hw/block/nvme.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c2f0c88fbf39..870e9d8e1c17 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4685,9 +4685,17 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cntlid = cpu_to_le16(n->cntlid);
 
 id->rab = 6;
-id->ieee[0] = 0x00;
-id->ieee[1] = 0x02;
-id->ieee[2] = 0xb3;
+
+if (n->params.use_intel_id) {
+id->ieee[0] = 0xb3;
+id->ieee[1] = 0x02;
+id->ieee[2] = 0x00;
+} else {
+id->ieee[0] = 0x00;
+id->ieee[1] = 0x54;
+id->ieee[2] = 0x52;
+}
+
 id->mdts = n->params.mdts;
 id->ver = cpu_to_le32(NVME_SPEC_VER);
 id->oacs = cpu_to_le16(0);
-- 
2.30.0




Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress

2021-02-09 Thread Bin Meng
Oops, hitting "send" by mistake ...

On Tue, Feb 9, 2021 at 5:42 PM Bin Meng  wrote:
>
> Hi Philippe,
>
> On Tue, Feb 9, 2021 at 5:38 PM Philippe Mathieu-Daudé  wrote:
> >
> > On 2/9/21 9:28 AM, Bin Meng wrote:
> > > Hi Philippe,
> > >
> > > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé  
> > > wrote:
> > >>
> > >> Per the "SD Host Controller Simplified Specification Version 2.00"
> > >> spec. 'Table 2-4 : Block Size Register':
> > >>
> > >>   Transfer Block Size [...] can be accessed only if no
> > >>   transaction is executing (i.e., after a transaction has stopped).
> > >>   Read operations during transfers may return an invalid value,
> > >>   and write operations shall be ignored.
> > >>
> > >> Transactions will update 'data_count', so do not modify 'blksize'
> > >> and 'blkcnt' when 'data_count' is used. This fixes:
> > >>
> > >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \
> > >>-nographic -serial none -M pc-q35-5.0 \
> > >>-device sdhci-pci,sd-spec-version=3 \
> > >>-device sd-card,drive=mydrive \
> > >>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive
> > >>   outl 0xcf8 0x80001810
> > >>   outl 0xcfc 0xe1068000
> > >>   outl 0xcf8 0x80001814
> > >
> > > Is this command needed?
> >
> > My guess is this makes the northbridge somehow map the device PCI space.
> >
> > Probably not needed in machines where SDHCI is MMIO mapped.
>
> I think this is not needed. Writing only the CFG_ADDR

I think this is not needed. Writing only the CFG_ADDR without writing
CFG_DATA does not take effect.

>
> >
> > >
> > >>   outl 0xcf8 0x80001804
> > >>   outw 0xcfc 0x7
> > >>   outl 0xcf8 0x8000fa20
> > >
> > > and this one?
> >
> > Ditto.
> >
> > >
> > >>   write 0xe106802c 0x1 0x0f
> > >>   write 0xe1068004 0xc 0x2801d10101fbff28a384
> > >
> > > Are these fuzzy data?
> >
> > Yes, I didn't try to understand what this does, as often
> > non-sense operations. But this is what would craft a malicious
> > attacker.
> >
> > >
> > >>   write 0xe106800c 0x1f 
> > >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f
> > >>   write 0xe1068003 0x28 
> > >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576
> > >>   write 0xe1068003 0x1 0xfe
> > >>   EOF
> > >>   =
> > >>   ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address 
> > >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0
> > >>   WRITE of size 4 at 0x6153bb00 thread T0
> > >>   #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b)
> > >>   #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5
> > >>   #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1
> > >>   #3 0x55ab483aeb4b in flatview_read_continue 
> > >> softmmu/physmem.c:2839:13
> > >>   #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12
> > >>   #5 0x55ab483b028e in address_space_read_full 
> > >> softmmu/physmem.c:2890:18
> > >>   #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16
> > >>   #7 0x55ab479374a2 in dma_memory_rw_relaxed 
> > >> include/sysemu/dma.h:88:12
> > >>   #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12
> > >>   #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12
> > >>   #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks 
> > >> hw/sd/sdhci.c:639:13
> > >>   #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17
> > >>   #12 0x55ab483f8db8 in memory_region_write_accessor 
> > >> softmmu/memory.c:491:5
> > >>   #13 0x55ab483f868a in access_with_adjusted_size 
> > >> softmmu/memory.c:552:18
> > >>   #14 0x55ab483f6da5 in memory_region_dispatch_write 
> > >> softmmu/memory.c:1501:16
> > >>   #15 0x55ab483c3b11 in flatview_write_continue 
> > >> softmmu/physmem.c:2774:23
> > >>   #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14
> > >>   #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18
> > >>   #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9
> > >>
> > >>   0x6153bb00 is located 0 bytes to the right of 512-byte region 
> > >> [0x6153b900,0x6153bb00)
> > >>   allocated by thread T0 here:
> > >>   #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7)
> > >>   #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0)
> > >>   #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5
> > >>   #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9
> > >>   #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13
> > >>
> > >>   SUMMARY: AddressSanitizer: heap-buffer-overflow 
> > >> (qemu-system-i386+0x1cea56b) in __asan_memcpy
> > >>   Shadow bytes around the buggy address:
> > >> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
> > >> 0x0c2a7720: 00 00 00 00 00 00 

Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress

2021-02-09 Thread Bin Meng
Hi Philippe,

On Tue, Feb 9, 2021 at 5:38 PM Philippe Mathieu-Daudé  wrote:
>
> On 2/9/21 9:28 AM, Bin Meng wrote:
> > Hi Philippe,
> >
> > On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé  
> > wrote:
> >>
> >> Per the "SD Host Controller Simplified Specification Version 2.00"
> >> spec. 'Table 2-4 : Block Size Register':
> >>
> >>   Transfer Block Size [...] can be accessed only if no
> >>   transaction is executing (i.e., after a transaction has stopped).
> >>   Read operations during transfers may return an invalid value,
> >>   and write operations shall be ignored.
> >>
> >> Transactions will update 'data_count', so do not modify 'blksize'
> >> and 'blkcnt' when 'data_count' is used. This fixes:
> >>
> >> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \
> >>-nographic -serial none -M pc-q35-5.0 \
> >>-device sdhci-pci,sd-spec-version=3 \
> >>-device sd-card,drive=mydrive \
> >>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive
> >>   outl 0xcf8 0x80001810
> >>   outl 0xcfc 0xe1068000
> >>   outl 0xcf8 0x80001814
> >
> > Is this command needed?
>
> My guess is this makes the northbridge somehow map the device PCI space.
>
> Probably not needed in machines where SDHCI is MMIO mapped.

I think this is not needed. Writing only the CFG_ADDR

>
> >
> >>   outl 0xcf8 0x80001804
> >>   outw 0xcfc 0x7
> >>   outl 0xcf8 0x8000fa20
> >
> > and this one?
>
> Ditto.
>
> >
> >>   write 0xe106802c 0x1 0x0f
> >>   write 0xe1068004 0xc 0x2801d10101fbff28a384
> >
> > Are these fuzzy data?
>
> Yes, I didn't try to understand what this does, as often
> non-sense operations. But this is what would craft a malicious
> attacker.
>
> >
> >>   write 0xe106800c 0x1f 
> >> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f
> >>   write 0xe1068003 0x28 
> >> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576
> >>   write 0xe1068003 0x1 0xfe
> >>   EOF
> >>   =
> >>   ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address 
> >> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0
> >>   WRITE of size 4 at 0x6153bb00 thread T0
> >>   #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b)
> >>   #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5
> >>   #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1
> >>   #3 0x55ab483aeb4b in flatview_read_continue softmmu/physmem.c:2839:13
> >>   #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12
> >>   #5 0x55ab483b028e in address_space_read_full 
> >> softmmu/physmem.c:2890:18
> >>   #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16
> >>   #7 0x55ab479374a2 in dma_memory_rw_relaxed include/sysemu/dma.h:88:12
> >>   #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12
> >>   #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12
> >>   #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks 
> >> hw/sd/sdhci.c:639:13
> >>   #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17
> >>   #12 0x55ab483f8db8 in memory_region_write_accessor 
> >> softmmu/memory.c:491:5
> >>   #13 0x55ab483f868a in access_with_adjusted_size 
> >> softmmu/memory.c:552:18
> >>   #14 0x55ab483f6da5 in memory_region_dispatch_write 
> >> softmmu/memory.c:1501:16
> >>   #15 0x55ab483c3b11 in flatview_write_continue 
> >> softmmu/physmem.c:2774:23
> >>   #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14
> >>   #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18
> >>   #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9
> >>
> >>   0x6153bb00 is located 0 bytes to the right of 512-byte region 
> >> [0x6153b900,0x6153bb00)
> >>   allocated by thread T0 here:
> >>   #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7)
> >>   #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0)
> >>   #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5
> >>   #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9
> >>   #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13
> >>
> >>   SUMMARY: AddressSanitizer: heap-buffer-overflow 
> >> (qemu-system-i386+0x1cea56b) in __asan_memcpy
> >>   Shadow bytes around the buggy address:
> >> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
> >> 0x0c2a7720: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >> 0x0c2a7730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >> 0x0c2a7740: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >> 0x0c2a7750: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>   =>0x0c2a7760:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
> >> 0x0c2a7770: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
> >> 0x0c2a7780: fd 

Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress

2021-02-09 Thread Philippe Mathieu-Daudé
On 2/9/21 9:28 AM, Bin Meng wrote:
> Hi Philippe,
> 
> On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé  wrote:
>>
>> Per the "SD Host Controller Simplified Specification Version 2.00"
>> spec. 'Table 2-4 : Block Size Register':
>>
>>   Transfer Block Size [...] can be accessed only if no
>>   transaction is executing (i.e., after a transaction has stopped).
>>   Read operations during transfers may return an invalid value,
>>   and write operations shall be ignored.
>>
>> Transactions will update 'data_count', so do not modify 'blksize'
>> and 'blkcnt' when 'data_count' is used. This fixes:
>>
>> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \
>>-nographic -serial none -M pc-q35-5.0 \
>>-device sdhci-pci,sd-spec-version=3 \
>>-device sd-card,drive=mydrive \
>>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive
>>   outl 0xcf8 0x80001810
>>   outl 0xcfc 0xe1068000
>>   outl 0xcf8 0x80001814
> 
> Is this command needed?

My guess is this makes the northbridge somehow map the device PCI space.

Probably not needed in machines where SDHCI is MMIO mapped.

> 
>>   outl 0xcf8 0x80001804
>>   outw 0xcfc 0x7
>>   outl 0xcf8 0x8000fa20
> 
> and this one?

Ditto.

> 
>>   write 0xe106802c 0x1 0x0f
>>   write 0xe1068004 0xc 0x2801d10101fbff28a384
> 
> Are these fuzzy data?

Yes, I didn't try to understand what this does, as often
non-sense operations. But this is what would craft a malicious
attacker.

> 
>>   write 0xe106800c 0x1f 
>> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f
>>   write 0xe1068003 0x28 
>> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576
>>   write 0xe1068003 0x1 0xfe
>>   EOF
>>   =
>>   ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address 
>> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0
>>   WRITE of size 4 at 0x6153bb00 thread T0
>>   #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b)
>>   #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5
>>   #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1
>>   #3 0x55ab483aeb4b in flatview_read_continue softmmu/physmem.c:2839:13
>>   #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12
>>   #5 0x55ab483b028e in address_space_read_full softmmu/physmem.c:2890:18
>>   #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16
>>   #7 0x55ab479374a2 in dma_memory_rw_relaxed include/sysemu/dma.h:88:12
>>   #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12
>>   #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12
>>   #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks 
>> hw/sd/sdhci.c:639:13
>>   #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17
>>   #12 0x55ab483f8db8 in memory_region_write_accessor 
>> softmmu/memory.c:491:5
>>   #13 0x55ab483f868a in access_with_adjusted_size softmmu/memory.c:552:18
>>   #14 0x55ab483f6da5 in memory_region_dispatch_write 
>> softmmu/memory.c:1501:16
>>   #15 0x55ab483c3b11 in flatview_write_continue softmmu/physmem.c:2774:23
>>   #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14
>>   #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18
>>   #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9
>>
>>   0x6153bb00 is located 0 bytes to the right of 512-byte region 
>> [0x6153b900,0x6153bb00)
>>   allocated by thread T0 here:
>>   #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7)
>>   #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0)
>>   #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5
>>   #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9
>>   #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13
>>
>>   SUMMARY: AddressSanitizer: heap-buffer-overflow 
>> (qemu-system-i386+0x1cea56b) in __asan_memcpy
>>   Shadow bytes around the buggy address:
>> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>> 0x0c2a7720: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0x0c2a7730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0x0c2a7740: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 0x0c2a7750: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>   =>0x0c2a7760:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>> 0x0c2a7770: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
>> 0x0c2a7780: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
>> 0x0c2a7790: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
>> 0x0c2a77a0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
>> 0x0c2a77b0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>>   Shadow byte legend (one shadow byte represents 8 application bytes):
>> Addressable:

Re: [PATCH 0/9] hw/block: m25p80: Fix the mess of dummy bytes needed for fast read commands

2021-02-09 Thread Francisco Iglesias
Hello Edgar,

On [2021 Feb 08] Mon 16:30:00, Edgar E. Iglesias wrote:
>On Mon, Feb 8, 2021 at 3:42 PM Bin Meng  wrote:
> 
>  On Thu, Jan 21, 2021 at 10:18 PM Francisco Iglesias
>   wrote:
>  >
>  > Hi Bin,
>  >
>  > On [2021 Jan 21] Thu 16:59:51, Bin Meng wrote:
>  > > Hi Francisco,
>  > >
>  > > On Thu, Jan 21, 2021 at 4:50 PM Francisco Iglesias
>  > >  wrote:
>  > > >
>  > > > Dear Bin,
>  > > >
>  > > > On [2021 Jan 20] Wed 22:20:25, Bin Meng wrote:
>  > > > > Hi Francisco,
>  > > > >
>  > > > > On Tue, Jan 19, 2021 at 9:01 PM Francisco Iglesias
>  > > > >  wrote:
>  > > > > >
>  > > > > > Hi Bin,
>  > > > > >
>  > > > > > On [2021 Jan 18] Mon 20:32:19, Bin Meng wrote:
>  > > > > > > Hi Francisco,
>  > > > > > >
>  > > > > > > On Mon, Jan 18, 2021 at 6:06 PM Francisco Iglesias
>  > > > > > >  wrote:
>  > > > > > > >
>  > > > > > > > Hi Bin,
>  > > > > > > >
>  > > > > > > > On [2021 Jan 15] Fri 22:38:18, Bin Meng wrote:
>  > > > > > > > > Hi Francisco,
>  > > > > > > > >
>  > > > > > > > > On Fri, Jan 15, 2021 at 8:26 PM Francisco Iglesias
>  > > > > > > > >  wrote:
>  > > > > > > > > >
>  > > > > > > > > > Hi Bin,
>  > > > > > > > > >
>  > > > > > > > > > On [2021 Jan 15] Fri 10:07:52, Bin Meng wrote:
>  > > > > > > > > > > Hi Francisco,
>  > > > > > > > > > >
>  > > > > > > > > > > On Fri, Jan 15, 2021 at 2:13 AM Francisco Iglesias
>  > > > > > > > > > >  wrote:
>  > > > > > > > > > > >
>  > > > > > > > > > > > Hi Bin,
>  > > > > > > > > > > >
>  > > > > > > > > > > > On [2021 Jan 14] Thu 23:08:53, Bin Meng wrote:
>  > > > > > > > > > > > > From: Bin Meng 
>  > > > > > > > > > > > >
>  > > > > > > > > > > > > The m25p80 model uses s->needed_bytes to
>  indicate how many follow-up
>  > > > > > > > > > > > > bytes are expected to be received after it
>  receives a command. For
>  > > > > > > > > > > > > example, depending on the address mode, either
>  3-byte address or
>  > > > > > > > > > > > > 4-byte address is needed.
>  > > > > > > > > > > > >
>  > > > > > > > > > > > > For fast read family commands, some dummy cycles
>  are required after
>  > > > > > > > > > > > > sending the address bytes, and the dummy cycles
>  need to be counted
>  > > > > > > > > > > > > in s->needed_bytes. This is where the mess
>  began.
>  > > > > > > > > > > > >
>  > > > > > > > > > > > > As the variable name (needed_bytes) indicates,
>  the unit is in byte.
>  > > > > > > > > > > > > It is not in bit, or cycle. However for some
>  reason the model has
>  > > > > > > > > > > > > been using the number of dummy cycles for
>  s->needed_bytes. The right
>  > > > > > > > > > > > > approach is to convert the number of dummy
>  cycles to bytes based on
>  > > > > > > > > > > > > the SPI protocol, for example, 6 dummy cycles
>  for the Fast Read Quad
>  > > > > > > > > > > > > I/O (EBh) should be converted to 3 bytes per the
>  formula (6 * 4 / 8).
>  > > > > > > > > > > >
>  > > > > > > > > > > > While not being the original implementor I must
>  assume that above solution was
>  > > > > > > > > > > > considered but not chosen by the developers due to
>  it is inaccuracy (it
>  > > > > > > > > > > > wouldn't be possible to model exacly 6 dummy
>  cycles, only a multiple of 8,
>  > > > > > > > > > > > meaning that if the controller is wrongly
>  programmed to generate 7 the error
>  > > > > > > > > > > > wouldn't be caught and the controller will still
>  be considered "correct"). Now
>  > > > > > > > > > > > that we have this detail in the implementation I'm
>  in favor of keeping it, this
>  > > > > > > > > > > > also because the detail is already in use for
>  catching exactly above error.
>  > > > > > > > > > > >
>  > > > > > > > > > >
>  > > > > > > > > > > I found no clue from the commit message that my
>  proposed solution here
>  > > > > > > > > > > was ever considered, otherwise all SPI controller
>  models supporting
>  > > > > > > > > > > software generation should have been found out
>  seriously broken long
>  > > > > > > > > > > time ago!
>  > > > > > > > > >
>  > > > > > > > > >
>  > > > > > > > > > The controllers you are referring to might lack
>  support for commands requiring
>  > > > > > > > > > dummy clock cycles but I really hope they work with
>  the other commands? If so I
>  > > > > > > > >
>  > > > > > > > > I am not sure why you view dummy clock cycles as
>  something special
>  > > > > > > > > that needs some special support from the SPI controller.
>  For the case
>  > > > > > > > > 1 controller, it's nothing special from the controller
>  perspective,
>   

Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress

2021-02-09 Thread Mauro Matteo Cascella
On Mon, Feb 8, 2021 at 9:26 PM Philippe Mathieu-Daudé  wrote:
>
> On Mon, Feb 8, 2021 at 8:59 PM Mauro Matteo Cascella
>  wrote:
> > On Mon, Feb 8, 2021 at 8:35 PM Philippe Mathieu-Daudé  
> > wrote:
> > >
> > > Per the "SD Host Controller Simplified Specification Version 2.00"
> > > spec. 'Table 2-4 : Block Size Register':
> > >
> > >   Transfer Block Size [...] can be accessed only if no
> > >   transaction is executing (i.e., after a transaction has stopped).
> > >   Read operations during transfers may return an invalid value,
> > >   and write operations shall be ignored.
> > >
> ...
> > >
> > > Fixes: CVE-2020-17380
> > > Fixes: CVE-2020-25085
> > > Signed-off-by: Philippe Mathieu-Daudé 
> > > ---
> > > Cc: Mauro Matteo Cascella 
> > > Cc: Alexander Bulekov 
> > > Cc: Alistair Francis 
> > > Cc: Prasad J Pandit 
> > > Cc: Bandan Das 
> > >
> > > RFC because missing Reported-by tags, launchpad/bugzilla links and
> > > qtest reproducer. Sending for review meanwhile.
> ...
> > For the above CVEs:
> > Tested-by: Mauro Matteo Cascella 
>
> Thanks Mauro for testing. Do you know what tags I should add for the credits?
>
> Phil.
>

I think the credit should go to Alexander for reporting [1] as well as
people from Ruhr-University Bochum for CVE-2020-25085 (I don't know
about their emails, though):

Reported-by: Alexander Bulekov 
Reported-by: Sergej Schumilo (Ruhr-University Bochum)
Reported-by: Cornelius Aschermann (Ruhr-University Bochum)
Reported-by: Simon Wörner (Ruhr-University Bochum)

[1] https://bugs.launchpad.net/qemu/+bug/1892960



--
Mauro Matteo Cascella
Red Hat Product Security
PGP-Key ID: BB3410B0




Re: [PATCH] hw/sd/sdhci: Do not modify BlockSizeRegister if transaction in progress

2021-02-09 Thread Bin Meng
Hi Philippe,

On Tue, Feb 9, 2021 at 3:34 AM Philippe Mathieu-Daudé  wrote:
>
> Per the "SD Host Controller Simplified Specification Version 2.00"
> spec. 'Table 2-4 : Block Size Register':
>
>   Transfer Block Size [...] can be accessed only if no
>   transaction is executing (i.e., after a transaction has stopped).
>   Read operations during transfers may return an invalid value,
>   and write operations shall be ignored.
>
> Transactions will update 'data_count', so do not modify 'blksize'
> and 'blkcnt' when 'data_count' is used. This fixes:
>
> $ cat << EOF | qemu-system-x86_64 -qtest stdio -monitor none \
>-nographic -serial none -M pc-q35-5.0 \
>-device sdhci-pci,sd-spec-version=3 \
>-device sd-card,drive=mydrive \
>-drive if=sd,index=0,file=null-co://,format=raw,id=mydrive
>   outl 0xcf8 0x80001810
>   outl 0xcfc 0xe1068000
>   outl 0xcf8 0x80001814

Is this command needed?

>   outl 0xcf8 0x80001804
>   outw 0xcfc 0x7
>   outl 0xcf8 0x8000fa20

and this one?

>   write 0xe106802c 0x1 0x0f
>   write 0xe1068004 0xc 0x2801d10101fbff28a384

Are these fuzzy data?

>   write 0xe106800c 0x1f 
> 0x9dacbbcad9e8f7061524334251606f7e8d9cabbac9d8e7f60514233241505f
>   write 0xe1068003 0x28 
> 0x80d000251480d000252280d000253080d000253e80d000254c80d000255a80d000256880d0002576
>   write 0xe1068003 0x1 0xfe
>   EOF
>   =
>   ==2686219==ERROR: AddressSanitizer: heap-buffer-overflow on address 
> 0x6153bb00 at pc 0x55ab469f456c bp 0x7ffee71be330 sp 0x7ffee71bdae0
>   WRITE of size 4 at 0x6153bb00 thread T0
>   #0 0x55ab469f456b in __asan_memcpy (qemu-system-i386+0x1cea56b)
>   #1 0x55ab483dc396 in stl_he_p include/qemu/bswap.h:353:5
>   #2 0x55ab483af5e4 in stn_he_p include/qemu/bswap.h:546:1
>   #3 0x55ab483aeb4b in flatview_read_continue softmmu/physmem.c:2839:13
>   #4 0x55ab483b0705 in flatview_read softmmu/physmem.c:2877:12
>   #5 0x55ab483b028e in address_space_read_full softmmu/physmem.c:2890:18
>   #6 0x55ab483b1294 in address_space_rw softmmu/physmem.c:2918:16
>   #7 0x55ab479374a2 in dma_memory_rw_relaxed include/sysemu/dma.h:88:12
>   #8 0x55ab47936f50 in dma_memory_rw include/sysemu/dma.h:127:12
>   #9 0x55ab4793665f in dma_memory_read include/sysemu/dma.h:145:12
>   #10 0x55ab4792f176 in sdhci_sdma_transfer_multi_blocks 
> hw/sd/sdhci.c:639:13
>   #11 0x55ab4793dc9d in sdhci_write hw/sd/sdhci.c:1129:17
>   #12 0x55ab483f8db8 in memory_region_write_accessor 
> softmmu/memory.c:491:5
>   #13 0x55ab483f868a in access_with_adjusted_size softmmu/memory.c:552:18
>   #14 0x55ab483f6da5 in memory_region_dispatch_write 
> softmmu/memory.c:1501:16
>   #15 0x55ab483c3b11 in flatview_write_continue softmmu/physmem.c:2774:23
>   #16 0x55ab483b0eb6 in flatview_write softmmu/physmem.c:2814:14
>   #17 0x55ab483b0a3e in address_space_write softmmu/physmem.c:2906:18
>   #18 0x55ab48465c56 in qtest_process_command softmmu/qtest.c:654:9
>
>   0x6153bb00 is located 0 bytes to the right of 512-byte region 
> [0x6153b900,0x6153bb00)
>   allocated by thread T0 here:
>   #0 0x55ab469f58a7 in calloc (qemu-system-i386+0x1ceb8a7)
>   #1 0x7f21d678f9b0 in g_malloc0 (/lib64/libglib-2.0.so.0+0x589b0)
>   #2 0x55ab479530ed in sdhci_pci_realize hw/sd/sdhci-pci.c:36:5
>   #3 0x55ab476f102a in pci_qdev_realize hw/pci/pci.c:2108:9
>   #4 0x55ab48baaad2 in device_set_realized hw/core/qdev.c:761:13
>
>   SUMMARY: AddressSanitizer: heap-buffer-overflow 
> (qemu-system-i386+0x1cea56b) in __asan_memcpy
>   Shadow bytes around the buggy address:
> 0x0c2a7710: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
> 0x0c2a7720: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0x0c2a7730: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0x0c2a7740: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 0x0c2a7750: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>   =>0x0c2a7760:[fa]fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
> 0x0c2a7770: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
> 0x0c2a7780: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
> 0x0c2a7790: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
> 0x0c2a77a0: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
> 0x0c2a77b0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
>   Shadow byte legend (one shadow byte represents 8 application bytes):
> Addressable:   00
> Heap left redzone:   fa
> Freed heap region:   fd
>   ==2686219==ABORTING
>
> Fixes: CVE-2020-17380
> Fixes: CVE-2020-25085
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
> Cc: Mauro Matteo Cascella 
> Cc: Alexander Bulekov 
> Cc: Alistair Francis 
> Cc: Prasad J Pandit 
> Cc: Bandan Das 
>
> RFC because missing Reported-by tags, launchpad/bugzilla links and
> qtest reproducer. Sending for review meanwhile.

Re: [RFC PATCH v2 3/4] block: Support multiple reopening with x-blockdev-reopen

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

08.02.2021 21:44, Alberto Garcia wrote:

Signed-off-by: Alberto Garcia 
---
  qapi/block-core.json   |  2 +-
  include/block/block.h  |  1 +
  block.c| 16 +--
  blockdev.c | 85 +-
  tests/qemu-iotests/155 |  9 ++--
  tests/qemu-iotests/165 |  4 +-
  tests/qemu-iotests/245 | 27 +++-
  tests/qemu-iotests/248 |  2 +-
  tests/qemu-iotests/248.out |  2 +-
  tests/qemu-iotests/298 |  4 +-
  10 files changed, 89 insertions(+), 63 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index c0e7c23331..b9fcf20a81 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4177,7 +4177,7 @@
  # Since: 4.0
  ##
  { 'command': 'x-blockdev-reopen',
-  'data': 'BlockdevOptions', 'boxed': true }
+  'data': { 'options': ['BlockdevOptions'] } }


Do we also want to drop x- prefix?

  
  ##

  # @blockdev-del:
diff --git a/include/block/block.h b/include/block/block.h
index 6dd687a69e..fe4a220da9 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -372,6 +372,7 @@ BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, 
const char *node_name,
  BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
  BlockDriverState *bs, QDict *options,
  bool keep_old_opts);
+void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue);
  int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
  int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
Error **errp);
diff --git a/block.c b/block.c
index 19b62da4af..b4fef2308f 100644
--- a/block.c
+++ b/block.c
@@ -3933,6 +3933,17 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue 
*bs_queue,
 NULL, 0, keep_old_opts);
  }
  
+void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue)

+{
+if (bs_queue) {
+BlockReopenQueueEntry *bs_entry, *next;
+QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
+g_free(bs_entry);
+}
+g_free(bs_queue);
+}
+}
+
  /*
   * Reopen multiple BlockDriverStates atomically & transactionally.
   *
@@ -4024,10 +4035,7 @@ abort:
  }
  
  cleanup:

-QTAILQ_FOREACH_SAFE(bs_entry, bs_queue, entry, next) {
-g_free(bs_entry);
-}
-g_free(bs_queue);
+bdrv_reopen_queue_free(bs_queue);


this may be a separate patch

  
  return ret;

  }
diff --git a/blockdev.c b/blockdev.c
index 098a05709d..6b688c0f73 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3528,38 +3528,16 @@ fail:
  visit_free(v);
  }
  
-void qmp_x_blockdev_reopen(BlockdevOptions *options, Error **errp)

+void qmp_x_blockdev_reopen(BlockdevOptionsList *reopen_list, Error **errp)
  {
-BlockDriverState *bs;
-QObject *obj;
-Visitor *v = qobject_output_visitor_new(&obj);
-BlockReopenQueue *queue;
-QDict *qdict;
-
-/* Check for the selected node name */
-if (!options->has_node_name) {
-error_setg(errp, "Node name not specified");
-goto fail;
-}
-
-bs = bdrv_find_node(options->node_name);
-if (!bs) {
-error_setg(errp, "Cannot find node named '%s'", options->node_name);
-goto fail;
-}
-
-/* Put all options in a QDict and flatten it */
-visit_type_BlockdevOptions(v, NULL, &options, &error_abort);
-visit_complete(v, &obj);
-qdict = qobject_to(QDict, obj);
-
-qdict_flatten(qdict);
-
-/* Perform the reopen operation */
+BlockReopenQueue *queue = NULL;
+GSList *aio_ctxs = NULL;
+GSList *visitors = NULL;
+GSList *drained = NULL;
  BdrvNextIterator it;
-GSList *aio_ctxs = NULL, *ctx;
  BlockDriverState *it_bs;
  
+/* Acquire all AIO contexts */

  for (it_bs = bdrv_first(&it); it_bs; it_bs = bdrv_next(&it)) {
  AioContext *aio_context = bdrv_get_aio_context(it_bs);
  
@@ -3569,19 +3547,50 @@ void qmp_x_blockdev_reopen(BlockdevOptions *options, Error **errp)

  }
  }
  
-bdrv_subtree_drained_begin(bs);

-queue = bdrv_reopen_queue(NULL, bs, qdict, false);
+/* Add each one of the BDS that we want to reopen to the queue */
+for (; reopen_list != NULL; reopen_list = reopen_list->next) {
+BlockdevOptions *options = reopen_list->value;
+QDict *qdict;
+Visitor *v;
+BlockDriverState *bs;
+QObject *obj;
+
+/* Check for the selected node name */
+if (!options->has_node_name) {
+error_setg(errp, "Node name not specified");
+goto fail;
+}
+
+bs = bdrv_find_node(options->node_name);
+if (!bs) {
+error_setg(errp, "Cannot find node named '%s'", 
options->node_name);
+goto fail;
+}
+
+v = qobject_output_visitor_new(&obj);
+visitors = g_slist_prepend(visitors, v);
+
+/* Put all options in a QDict and flatten it */
+ 

[PULL 52/56] hw/block/nvme: fix set feature save field check

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Currently, no features are saveable, so the current check is not wrong,
but add a check against the feature capabilities to make sure this will
not regress if saveable features are added later.

Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index b3d072c8b2bb..c99a3fbf3461 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3324,7 +3324,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 
 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
 
-if (save) {
+if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
 }
 
-- 
2.30.0




[PULL 56/56] hw/block/nvme: refactor the logic for zone write checks

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Refactor the zone write check logic such that the most "meaningful"
error is returned first. That is, first, if the zone is not writable,
return an appropriate status code for that. Then, make sure we are
actually writing at the write pointer and finally check that we do not
cross the zone write boundary. This aligns with the "priority" of status
codes for zone read checks.

Also add a couple of additional descriptive trace events and remove an
always true assert.

Cc: Dmitry Fomichev 
Tested-by: Niklas Cassel 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 49 ---
 hw/block/trace-events |  5 +
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index cedb4ad9ffd3..5ce21b7100b3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1161,56 +1161,53 @@ static inline NvmeZone 
*nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
 
 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
 {
-uint16_t status;
+uint64_t zslba = zone->d.zslba;
 
 switch (nvme_get_zone_state(zone)) {
 case NVME_ZONE_STATE_EMPTY:
 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
 case NVME_ZONE_STATE_CLOSED:
-status = NVME_SUCCESS;
-break;
+return NVME_SUCCESS;
 case NVME_ZONE_STATE_FULL:
-status = NVME_ZONE_FULL;
-break;
+trace_pci_nvme_err_zone_is_full(zslba);
+return NVME_ZONE_FULL;
 case NVME_ZONE_STATE_OFFLINE:
-status = NVME_ZONE_OFFLINE;
-break;
+trace_pci_nvme_err_zone_is_offline(zslba);
+return NVME_ZONE_OFFLINE;
 case NVME_ZONE_STATE_READ_ONLY:
-status = NVME_ZONE_READ_ONLY;
-break;
+trace_pci_nvme_err_zone_is_read_only(zslba);
+return NVME_ZONE_READ_ONLY;
 default:
 assert(false);
 }
 
-return status;
+return NVME_INTERNAL_DEV_ERROR;
 }
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
   NvmeZone *zone, uint64_t slba,
   uint32_t nlb)
 {
+uint64_t zcap = nvme_zone_wr_boundary(zone);
 uint16_t status;
 
-if (unlikely((slba + nlb) > nvme_zone_wr_boundary(zone))) {
-status = NVME_ZONE_BOUNDARY_ERROR;
-} else {
-status = nvme_check_zone_state_for_write(zone);
-}
-
+status = nvme_check_zone_state_for_write(zone);
 if (status) {
-trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
-} else {
-assert(nvme_wp_is_valid(zone));
-
-if (unlikely(slba != zone->w_ptr)) {
-trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
-   zone->w_ptr);
-status = NVME_ZONE_INVALID_WRITE;
-}
+return status;
 }
 
-return status;
+if (unlikely(slba != zone->w_ptr)) {
+trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
+return NVME_ZONE_INVALID_WRITE;
+}
+
+if (unlikely((slba + nlb) > zcap)) {
+trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
+return NVME_ZONE_BOUNDARY_ERROR;
+}
+
+return NVME_SUCCESS;
 }
 
 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 87ab6c509045..d32475c3989e 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -129,6 +129,11 @@ pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t 
slba, uint64_t zslba) "
 pci_nvme_err_invalid_zone_state_transition(uint8_t action, uint64_t slba, 
uint8_t attrs) "action=0x%"PRIx8", slba=%"PRIu64", attrs=0x%"PRIx32""
 pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) 
"writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64""
 pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at 
slba=%"PRIu64", but zone=%"PRIu64""
+pci_nvme_err_zone_is_full(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_is_read_only(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_is_offline(uint64_t zslba) "zslba 0x%"PRIx64""
+pci_nvme_err_zone_boundary(uint64_t slba, uint32_t nlb, uint64_t zcap) "lba 
0x%"PRIx64" nlb %"PRIu32" zcap 0x%"PRIx64""
+pci_nvme_err_zone_invalid_write(uint64_t slba, uint64_t wp) "lba 0x%"PRIx64" 
wp 0x%"PRIx64""
 pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) 
"slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint16_t status) 
"slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16""
 pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t zasl) 
"slba=%"PRIu64", nlb=%"PRIu32", zasl=%"PRIu8""
-- 
2.30.0




[PULL 55/56] hw/block/nvme: fix zone boundary check for append

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

When a zone append is processed the controller checks the validity of
the write before assigning the LBA to the append command. This causes
the boundary check to be wrong.

Fix this by checking the write *after* assigning the LBA. Remove the
append special case from the nvme_check_zone_write and open code it in
nvme_do_write, assigning the slba when basic sanity checks have been
performed. Then check the validity of the resulting write like any other
write command.

In the process, also fix a missing endianness conversion for the zone
append ALBA.

Reported-by: Niklas Cassel 
Cc: Dmitry Fomichev 
Tested-by: Niklas Cassel 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 46 --
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e562d7467b3b..cedb4ad9ffd3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1188,7 +1188,7 @@ static uint16_t nvme_check_zone_state_for_write(NvmeZone 
*zone)
 
 static uint16_t nvme_check_zone_write(NvmeCtrl *n, NvmeNamespace *ns,
   NvmeZone *zone, uint64_t slba,
-  uint32_t nlb, bool append)
+  uint32_t nlb)
 {
 uint16_t status;
 
@@ -1202,16 +1202,8 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
 } else {
 assert(nvme_wp_is_valid(zone));
-if (append) {
-if (unlikely(slba != zone->d.zslba)) {
-trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-status = NVME_INVALID_FIELD;
-}
-if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
-trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-status = NVME_INVALID_FIELD;
-}
-} else if (unlikely(slba != zone->w_ptr)) {
+
+if (unlikely(slba != zone->w_ptr)) {
 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
zone->w_ptr);
 status = NVME_ZONE_INVALID_WRITE;
@@ -1349,10 +1341,9 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, 
NvmeRequest *req,
 }
 }
 
-static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
- uint32_t nlb)
+static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
+ uint32_t nlb)
 {
-uint64_t result = zone->w_ptr;
 uint8_t zs;
 
 zone->w_ptr += nlb;
@@ -1368,8 +1359,6 @@ static uint64_t nvme_advance_zone_wp(NvmeNamespace *ns, 
NvmeZone *zone,
 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
 }
 }
-
-return result;
 }
 
 static inline bool nvme_is_write(NvmeRequest *req)
@@ -1747,7 +1736,24 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 if (ns->params.zoned) {
 zone = nvme_get_zone_by_slba(ns, slba);
 
-status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
+if (append) {
+if (unlikely(slba != zone->d.zslba)) {
+trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
+status = NVME_INVALID_FIELD;
+goto invalid;
+}
+
+if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
+trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
+status = NVME_INVALID_FIELD;
+goto invalid;
+}
+
+slba = zone->w_ptr;
+res->slba = cpu_to_le64(slba);
+}
+
+status = nvme_check_zone_write(n, ns, zone, slba, nlb);
 if (status) {
 goto invalid;
 }
@@ -1757,11 +1763,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 goto invalid;
 }
 
-if (append) {
-slba = zone->w_ptr;
-}
-
-res->slba = nvme_advance_zone_wp(ns, zone, nlb);
+nvme_advance_zone_wp(ns, zone, nlb);
 }
 
 data_offset = nvme_l2b(ns, slba);
-- 
2.30.0




[PULL 51/56] hw/block/nvme: fix set feature for error recovery

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Only enable DULBE if the namespace supports it.

Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 40784bd908fb..b3d072c8b2bb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3396,7 +3396,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 }
 
 assert(ns);
-ns->features.err_rec = dw11;
+if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat))  {
+ns->features.err_rec = dw11;
+}
 break;
 case NVME_VOLATILE_WRITE_CACHE:
 for (i = 1; i <= n->num_namespaces; i++) {
-- 
2.30.0




[PULL 54/56] hw/block/nvme: fix wrong parameter name 'cross_read'

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

The actual parameter name is 'cross_read' rather than 'cross_zone_read'.

Signed-off-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2335739bdb17..e562d7467b3b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -81,7 +81,7 @@
  * The default value means there is no limit to the number of
  * concurrently open zones.
  *
- * zoned.cross_zone_read=
+ * zoned.cross_read=
  * Setting this property to true enables Read Across Zone Boundaries.
  */
 
-- 
2.30.0




[PULL 50/56] hw/block/nvme: error if drive less than a zone size

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

If a user assigns a backing device with less capacity than the size of a
single zone, the namespace capacity will be reported as zero and the
kernel will silently fail to allocate the namespace.

This patch errors out in case that the backing device cannot accommodate
at least a single zone.

Signed-off-by: Minwoo Im 
[k.jensen: small fixup in the error and commit message]
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 3f52acb89c95..dfed71a950fa 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -134,6 +134,13 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 ns->num_zones = ns->size / lbasz / ns->zone_size;
 
 /* Do a few more sanity checks of ZNS properties */
+if (!ns->num_zones) {
+error_setg(errp,
+   "insufficient drive capacity, must be at least the size "
+   "of one zone (%"PRIu64"B)", zone_size);
+return -1;
+}
+
 if (ns->params.max_open_zones > ns->num_zones) {
 error_setg(errp,
"max_open_zones value %u exceeds the number of zones %u",
-- 
2.30.0




[PULL 49/56] hw/block/nvme: lift cmb restrictions

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

The controller now implements v1.4 and we can lift the restrictions on
CMB Data Pointer and Command Independent Locations Support (CDPCILS) and
CMB Data Pointer Mixed Locations Support (CDPMLS) since the device
really does not care about mixed host/cmb pointers in those cases.

Reviewed-by: Keith Busch 
Reviewed-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 33 ++---
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c4c968f5951e..40784bd908fb 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -509,7 +509,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 trans_len = MIN(len, trans_len);
 int num_prps = (len >> n->page_bits) + 1;
 uint16_t status;
-bool prp_list_in_cmb = false;
 int ret;
 
 QEMUSGList *qsg = &req->qsg;
@@ -535,10 +534,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 uint32_t nents, prp_trans;
 int i = 0;
 
-if (nvme_addr_is_cmb(n, prp2)) {
-prp_list_in_cmb = true;
-}
-
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
@@ -555,10 +550,6 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
 }
 
-if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) {
-return NVME_INVALID_USE_OF_CMB | NVME_DNR;
-}
-
 i = 0;
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
@@ -692,7 +683,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 uint64_t nsgld;
 uint32_t seg_len;
 uint16_t status;
-bool sgl_in_cmb = false;
 hwaddr addr;
 int ret;
 
@@ -714,18 +704,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 goto out;
 }
 
-/*
- * If the segment is located in the CMB, the submission queue of the
- * request must also reside there.
- */
-if (nvme_addr_is_cmb(n, addr)) {
-if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
-return NVME_INVALID_USE_OF_CMB | NVME_DNR;
-}
-
-sgl_in_cmb = true;
-}
-
 for (;;) {
 switch (NVME_SGL_TYPE(sgld->type)) {
 case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -814,15 +792,6 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 if (status) {
 goto unmap;
 }
-
-/*
- * If the next segment is in the CMB, make sure that the sgl was
- * already located there.
- */
-if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
-status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
-goto unmap;
-}
 }
 
 out:
@@ -3777,6 +3746,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 
 static void nvme_cmb_enable_regs(NvmeCtrl *n)
 {
+NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
+NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
 
 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
-- 
2.30.0




Re: [RFC PATCH v2 1/4] block: Allow changing bs->file on reopen

2021-02-09 Thread Vladimir Sementsov-Ogievskiy

08.02.2021 21:44, Alberto Garcia wrote:

When the x-blockdev-reopen was added it allowed reconfiguring the
graph by replacing backing files, but changing the 'file' option was
forbidden. Because of this restriction some operations are not
possible, notably inserting and removing block filters.

This patch adds support for replacing the 'file' option. This is
similar to replacing the backing file and the user is likewise
responsible for the correctness of the resulting graph, otherwise this
can lead to data corruption.

Signed-off-by: Alberto Garcia 
---
  include/block/block.h  |  1 +
  block.c| 65 ++
  tests/qemu-iotests/245 |  7 +++--
  3 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 82271d9ccd..6dd687a69e 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -196,6 +196,7 @@ typedef struct BDRVReopenState {
  bool backing_missing;
  bool replace_backing_bs;  /* new_backing_bs is ignored if this is false */
  BlockDriverState *old_backing_bs; /* keep pointer for permissions update 
*/
+BlockDriverState *old_file_bs;/* keep pointer for permissions update */
  uint64_t perm, shared_perm;
  QDict *options;
  QDict *explicit_options;
diff --git a/block.c b/block.c
index 576b145cbf..19b62da4af 100644
--- a/block.c
+++ b/block.c
@@ -3978,6 +3978,10 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, 
Error **errp)
  refresh_list = bdrv_topological_dfs(refresh_list, found,
  state->old_backing_bs);
  }
+if (state->old_file_bs) {
+refresh_list = bdrv_topological_dfs(refresh_list, found,
+state->old_file_bs);
+}
  }
  
  ret = bdrv_list_refresh_perms(refresh_list, bs_queue, &tran, errp);

@@ -4196,6 +4200,61 @@ static int bdrv_reopen_parse_backing(BDRVReopenState 
*reopen_state,
  return 0;
  }
  
+static int bdrv_reopen_parse_file(BDRVReopenState *reopen_state,

+  GSList **tran,
+  Error **errp)
+{
+BlockDriverState *bs = reopen_state->bs;
+BlockDriverState *new_file_bs;
+QObject *value;
+const char *str;
+
+value = qdict_get(reopen_state->options, "file");
+if (value == NULL) {
+return 0;
+}
+
+/* The 'file' option only allows strings */
+assert(qobject_type(value) == QTYPE_QSTRING);
+
+str = qobject_get_try_str(value);
+new_file_bs = bdrv_lookup_bs(NULL, str, errp);
+if (new_file_bs == NULL) {
+return -EINVAL;
+} else if (bdrv_recurse_has_child(new_file_bs, bs)) {
+error_setg(errp, "Making '%s' a file of '%s' "
+   "would create a cycle", str, bs->node_name);
+return -EINVAL;
+}
+
+assert(bs->file && bs->file->bs);
+
+/* If 'file' points to the current child then there's nothing to do */
+if (bs->file->bs == new_file_bs) {
+return 0;
+}
+
+if (bs->file->frozen) {
+error_setg(errp, "Cannot change the 'file' link of '%s' "
+   "from '%s' to '%s'", bs->node_name,
+   bs->file->bs->node_name, new_file_bs->node_name);
+return -EPERM;
+}
+
+/* Check AioContext compatibility */
+if (!bdrv_reopen_can_attach(bs, bs->file, new_file_bs, errp)) {
+return -EINVAL;
+}
+
+/* Store the old file bs because we'll need to refresh its permissions */
+reopen_state->old_file_bs = bs->file->bs;
+
+/* And finally replace the child */
+bdrv_replace_child(bs->file, new_file_bs, tran);
+
+return 0;
+}


The function mostly do the same that bdrv_reopen_parse_backing().. I don't 
think that they
should really differ. Probably it should be one function.
At least, they should work absolutely the same way for backing-child
based and file-child based filters. And you lose bdrv_is_backing_chain_frozen() 
check


+
  /*
   * Prepares a BlockDriverState for reopen. All changes are staged in the
   * 'opaque' field of the BDRVReopenState, which is used and allocated by
@@ -4347,6 +4406,12 @@ static int bdrv_reopen_prepare(BDRVReopenState 
*reopen_state,
  }
  qdict_del(reopen_state->options, "backing");
  
+ret = bdrv_reopen_parse_file(reopen_state, set_backings_tran, errp);

+if (ret < 0) {
+goto error;
+}
+qdict_del(reopen_state->options, "file");
+
  /* Options that are not handled are only okay if they are unchanged
   * compared to the old state. It is expected that some options are only
   * used for the initial open, but not reopen (e.g. filename) */
diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index e60c8326d3..f9d68b3958 100755
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -145,8 +145,8 @@ class TestBlockdevReopen(iotests.QMPTestCase):
  self.reopen(

[PULL 47/56] hw/block/nvme: move cmb logic to v1.4

2021-02-09 Thread Klaus Jensen
From: Padmakar Kalghatgi 

Implement v1.4 logic for configuring the Controller Memory Buffer. By
default, the v1.4 scheme will be used (CMB must be explicitly enabled by
the host), so drivers that only support v1.3 will not be able to use the
CMB anymore.

To retain the v1.3 behavior, set the boolean 'legacy-cmb' nvme device
parameter.

Reviewed-by: Keith Busch 
Reviewed-by: Minwoo Im 
Signed-off-by: Padmakar Kalghatgi 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h   |  10 +++-
 include/block/nvme.h  | 107 +-
 hw/block/nvme.c   | 101 +--
 hw/block/trace-events |   2 +
 4 files changed, 182 insertions(+), 38 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b7702e937e56..dee6092bd45f 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -20,6 +20,7 @@ typedef struct NvmeParams {
 uint8_t  mdts;
 bool use_intel_id;
 uint32_t zasl_bs;
+bool legacy_cmb;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -127,7 +128,6 @@ typedef struct NvmeCtrl {
 PCIDeviceparent_obj;
 MemoryRegion bar0;
 MemoryRegion iomem;
-MemoryRegion ctrl_mem;
 NvmeBar  bar;
 NvmeParams   params;
 NvmeBus  bus;
@@ -143,7 +143,6 @@ typedef struct NvmeCtrl {
 uint32_tnum_namespaces;
 uint32_tmax_q_ents;
 uint8_t outstanding_aers;
-uint8_t *cmbuf;
 uint32_tirq_status;
 uint64_thost_timestamp; /* Timestamp sent by the host 
*/
 uint64_ttimestamp_set_qemu_clock_ms;/* QEMU clock time */
@@ -151,6 +150,13 @@ typedef struct NvmeCtrl {
 uint16_ttemperature;
 uint8_t smart_critical_warning;
 
+struct {
+MemoryRegion mem;
+uint8_t  *buf;
+bool cmse;
+hwaddr   cba;
+} cmb;
+
 struct {
 HostMemoryBackend *dev;
 bool  cmse;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 008108bd1af8..2e85b97a6c4e 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -15,14 +15,19 @@ typedef struct QEMU_PACKED NvmeBar {
 uint64_tacq;
 uint32_tcmbloc;
 uint32_tcmbsz;
-uint8_t padding[3520]; /* not used by QEMU */
+uint32_tbpinfo;
+uint32_tbprsel;
+uint64_tbpmbl;
+uint64_tcmbmsc;
+uint32_tcmbsts;
+uint8_t rsvd92[3492];
 uint32_tpmrcap;
 uint32_tpmrctl;
 uint32_tpmrsts;
 uint32_tpmrebs;
 uint32_tpmrswtp;
 uint64_tpmrmsc;
-uint8_t reserved[484];
+uint8_t css[484];
 } NvmeBar;
 
 enum NvmeCapShift {
@@ -63,6 +68,7 @@ enum NvmeCapMask {
 #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
 #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
 #define NVME_CAP_PMRS(cap)  (((cap) >> CAP_PMRS_SHIFT)   & CAP_PMRS_MASK)
+#define NVME_CAP_CMBS(cap)  (((cap) >> CAP_CMBS_SHIFT)   & CAP_CMBS_MASK)
 
 #define NVME_CAP_SET_MQES(cap, val)   (cap |= (uint64_t)(val & CAP_MQES_MASK)  
\
<< CAP_MQES_SHIFT)
@@ -184,25 +190,64 @@ enum NvmeAqaMask {
 #define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
 
 enum NvmeCmblocShift {
-CMBLOC_BIR_SHIFT  = 0,
-CMBLOC_OFST_SHIFT = 12,
+CMBLOC_BIR_SHIFT = 0,
+CMBLOC_CQMMS_SHIFT   = 3,
+CMBLOC_CQPDS_SHIFT   = 4,
+CMBLOC_CDPMLS_SHIFT  = 5,
+CMBLOC_CDPCILS_SHIFT = 6,
+CMBLOC_CDMMMS_SHIFT  = 7,
+CMBLOC_CQDA_SHIFT= 8,
+CMBLOC_OFST_SHIFT= 12,
 };
 
 enum NvmeCmblocMask {
-CMBLOC_BIR_MASK  = 0x7,
-CMBLOC_OFST_MASK = 0xf,
+CMBLOC_BIR_MASK = 0x7,
+CMBLOC_CQMMS_MASK   = 0x1,
+CMBLOC_CQPDS_MASK   = 0x1,
+CMBLOC_CDPMLS_MASK  = 0x1,
+CMBLOC_CDPCILS_MASK = 0x1,
+CMBLOC_CDMMMS_MASK  = 0x1,
+CMBLOC_CQDA_MASK= 0x1,
+CMBLOC_OFST_MASK= 0xf,
 };
 
-#define NVME_CMBLOC_BIR(cmbloc) ((cmbloc >> CMBLOC_BIR_SHIFT)  & \
- CMBLOC_BIR_MASK)
-#define NVME_CMBLOC_OFST(cmbloc)((cmbloc >> CMBLOC_OFST_SHIFT) & \
- CMBLOC_OFST_MASK)
+#define NVME_CMBLOC_BIR(cmbloc) \
+((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_CQMMS(cmbloc) \
+((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK)
+#define NVME_CMBLOC_CQPDS(cmbloc) \
+((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK)
+#define NVME_CMBLOC_CDPMLS(cmbloc) \
+((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK)
+#define NVME_CMBLOC_CDPCILS(cmbloc) \
+((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK)
+#define NVME_CMBLOC_CDMMMS(cmbloc) \
+((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK)
+#define NVME_CMBLOC_CQDA(cmbloc) \
+((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK)
+#define NVME_CMBLOC_OFST(cmbloc) \
+((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK)
 
-#de

[PULL 43/56] hw/block/nvme: rename PMR/CMB shift/mask fields

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Use the correct field names.

Reviewed-by: Minwoo Im 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 include/block/nvme.h | 18 +-
 hw/block/nvme.c  |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 151921da21f9..008108bd1af8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -35,8 +35,8 @@ enum NvmeCapShift {
 CAP_CSS_SHIFT  = 37,
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
-CAP_PMR_SHIFT  = 56,
-CAP_CMB_SHIFT  = 57,
+CAP_PMRS_SHIFT = 56,
+CAP_CMBS_SHIFT = 57,
 };
 
 enum NvmeCapMask {
@@ -49,8 +49,8 @@ enum NvmeCapMask {
 CAP_CSS_MASK   = 0xff,
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
-CAP_PMR_MASK   = 0x1,
-CAP_CMB_MASK   = 0x1,
+CAP_PMRS_MASK  = 0x1,
+CAP_CMBS_MASK  = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -62,7 +62,7 @@ enum NvmeCapMask {
 #define NVME_CAP_CSS(cap)   (((cap) >> CAP_CSS_SHIFT)& CAP_CSS_MASK)
 #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
 #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
-#define NVME_CAP_PMR(cap)   (((cap) >> CAP_PMR_SHIFT)& CAP_PMR_MASK)
+#define NVME_CAP_PMRS(cap)  (((cap) >> CAP_PMRS_SHIFT)   & CAP_PMRS_MASK)
 
 #define NVME_CAP_SET_MQES(cap, val)   (cap |= (uint64_t)(val & CAP_MQES_MASK)  
\
<< CAP_MQES_SHIFT)
@@ -82,10 +82,10 @@ enum NvmeCapMask {
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & 
CAP_MPSMAX_MASK)\
<< CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   
\
-   << CAP_PMR_SHIFT)
-#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   
\
-   << CAP_CMB_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMRS_MASK)  
\
+   << CAP_PMRS_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMBS_MASK)  
\
+   << CAP_CMBS_SHIFT)
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4ce75642f1a4..0057a02402b7 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4501,7 +4501,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor 
*v, const char *name,
 
 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
   | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
-if (NVME_CAP_PMR(n->bar.cap)) {
+if (NVME_CAP_PMRS(n->bar.cap)) {
 cap |= NVME_SMART_PMR_UNRELIABLE;
 }
 
-- 
2.30.0




[PULL 40/56] hw/block/nvme: indicate CMB support through controller capabilities register

2021-02-09 Thread Klaus Jensen
From: Andrzej Jakowski 

This patch sets CMBS bit in controller capabilities register when user
configures NVMe driver with CMB support, so capabilities are correctly
reported to guest OS.

Signed-off-by: Andrzej Jakowski 
Reviewed-by: Maxim Levitsky 
Reviewed-by: Minwoo Im 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 include/block/nvme.h | 10 +++---
 hw/block/nvme.c  |  1 +
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 854fb2abb6f8..151921da21f9 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -36,6 +36,7 @@ enum NvmeCapShift {
 CAP_MPSMIN_SHIFT   = 48,
 CAP_MPSMAX_SHIFT   = 52,
 CAP_PMR_SHIFT  = 56,
+CAP_CMB_SHIFT  = 57,
 };
 
 enum NvmeCapMask {
@@ -49,6 +50,7 @@ enum NvmeCapMask {
 CAP_MPSMIN_MASK= 0xf,
 CAP_MPSMAX_MASK= 0xf,
 CAP_PMR_MASK   = 0x1,
+CAP_CMB_MASK   = 0x1,
 };
 
 #define NVME_CAP_MQES(cap)  (((cap) >> CAP_MQES_SHIFT)   & CAP_MQES_MASK)
@@ -79,9 +81,11 @@ enum NvmeCapMask {
 #define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & 
CAP_MPSMIN_MASK)\
<< CAP_MPSMIN_SHIFT)
 #define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & 
CAP_MPSMAX_MASK)\
-<< 
CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMR_MASK)\
-<< CAP_PMR_SHIFT)
+   << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val)   (cap |= (uint64_t)(val & CAP_PMR_MASK)   
\
+   << CAP_PMR_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val)   (cap |= (uint64_t)(val & CAP_CMB_MASK)   
\
+   << CAP_CMB_SHIFT)
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2785127037db..5f12ac1200ec 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4374,6 +4374,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
+NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
 
 n->bar.vs = NVME_SPEC_VER;
 n->bar.intmc = n->bar.intms = 0;
-- 
2.30.0




[PULL 53/56] hw/block/nvme: align with existing style

2021-02-09 Thread Klaus Jensen
From: Gollu Appalanaidu 

Change status checks to align with the existing style and remove the
explicit check against NVME_SUCCESS.

Cc: Dmitry Fomichev 
Signed-off-by: Gollu Appalanaidu 
Reviewed-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Dmitry Fomichev 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c99a3fbf3461..2335739bdb17 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1198,7 +1198,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 status = nvme_check_zone_state_for_write(zone);
 }
 
-if (status != NVME_SUCCESS) {
+if (status) {
 trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
 } else {
 assert(nvme_wp_is_valid(zone));
@@ -1253,7 +1253,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
uint64_t slba,
 uint16_t status;
 
 status = nvme_check_zone_state_for_read(zone);
-if (status != NVME_SUCCESS) {
+if (status) {
 ;
 } else if (unlikely(end > bndry)) {
 if (!ns->params.cross_zone_read) {
@@ -1266,7 +1266,7 @@ static uint16_t nvme_check_zone_read(NvmeNamespace *ns, 
uint64_t slba,
 do {
 zone++;
 status = nvme_check_zone_state_for_read(zone);
-if (status != NVME_SUCCESS) {
+if (status) {
 break;
 }
 } while (end > nvme_zone_rd_boundary(ns, zone));
@@ -1677,7 +1677,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 
 if (ns->params.zoned) {
 status = nvme_check_zone_read(ns, slba, nlb);
-if (status != NVME_SUCCESS) {
+if (status) {
 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
 goto invalid;
 }
@@ -1748,12 +1748,12 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 zone = nvme_get_zone_by_slba(ns, slba);
 
 status = nvme_check_zone_write(n, ns, zone, slba, nlb, append);
-if (status != NVME_SUCCESS) {
+if (status) {
 goto invalid;
 }
 
 status = nvme_auto_open_zone(ns, zone);
-if (status != NVME_SUCCESS) {
+if (status) {
 goto invalid;
 }
 
@@ -1852,14 +1852,14 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 switch (state) {
 case NVME_ZONE_STATE_EMPTY:
 status = nvme_aor_check(ns, 1, 0);
-if (status != NVME_SUCCESS) {
+if (status) {
 return status;
 }
 nvme_aor_inc_active(ns);
 /* fall through */
 case NVME_ZONE_STATE_CLOSED:
 status = nvme_aor_check(ns, 0, 1);
-if (status != NVME_SUCCESS) {
+if (status) {
 if (state == NVME_ZONE_STATE_EMPTY) {
 nvme_aor_dec_active(ns);
 }
@@ -1972,7 +1972,7 @@ static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, 
NvmeZone *zone)
 
 if (state == NVME_ZONE_STATE_EMPTY) {
 status = nvme_aor_check(ns, 1, 0);
-if (status != NVME_SUCCESS) {
+if (status) {
 return status;
 }
 nvme_aor_inc_active(ns);
@@ -3301,7 +3301,7 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
NvmeRequest *req)
 
 ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
DMA_DIRECTION_TO_DEVICE, req);
-if (ret != NVME_SUCCESS) {
+if (ret) {
 return ret;
 }
 
-- 
2.30.0




[PULL 44/56] hw/block/nvme: remove redundant zeroing of PMR registers

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

The controller registers are initially zero. Remove the redundant
zeroing.

Reviewed-by: Keith Busch 
Reviewed-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 34 --
 1 file changed, 34 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0057a02402b7..f8dd771925f9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4217,43 +4217,9 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice 
*pci_dev)
 
 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 {
-/* PMR Capabities register */
-n->bar.pmrcap = 0;
-NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
-NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 0);
 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
-NVME_PMRCAP_SET_PMRTU(n->bar.pmrcap, 0);
 /* Turn on bit 1 support */
 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
-NVME_PMRCAP_SET_PMRTO(n->bar.pmrcap, 0);
-NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 0);
-
-/* PMR Control register */
-n->bar.pmrctl = 0;
-NVME_PMRCTL_SET_EN(n->bar.pmrctl, 0);
-
-/* PMR Status register */
-n->bar.pmrsts = 0;
-NVME_PMRSTS_SET_ERR(n->bar.pmrsts, 0);
-NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 0);
-NVME_PMRSTS_SET_HSTS(n->bar.pmrsts, 0);
-NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 0);
-
-/* PMR Elasticity Buffer Size register */
-n->bar.pmrebs = 0;
-NVME_PMREBS_SET_PMRSZU(n->bar.pmrebs, 0);
-NVME_PMREBS_SET_RBB(n->bar.pmrebs, 0);
-NVME_PMREBS_SET_PMRWBZ(n->bar.pmrebs, 0);
-
-/* PMR Sustained Write Throughput register */
-n->bar.pmrswtp = 0;
-NVME_PMRSWTP_SET_PMRSWTU(n->bar.pmrswtp, 0);
-NVME_PMRSWTP_SET_PMRSWTV(n->bar.pmrswtp, 0);
-
-/* PMR Memory Space Control register */
-n->bar.pmrmsc = 0;
-NVME_PMRMSC_SET_CMSE(n->bar.pmrmsc, 0);
-NVME_PMRMSC_SET_CBA(n->bar.pmrmsc, 0);
 
 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
  PCI_BASE_ADDRESS_SPACE_MEMORY |
-- 
2.30.0




[PULL 37/56] hw/block/nvme: trigger async event during injecting smart warning

2021-02-09 Thread Klaus Jensen
From: zhenwei pi 

During smart critical warning injection by setting property from QMP
command, also try to trigger asynchronous event.

Suggested by Keith, if an event has already been raised, there is no
need to enqueue the duplicate event any more.

Signed-off-by: zhenwei pi 
[k.jensen: fix typo in commit message]
Signed-off-by: Klaus Jensen 
---
 include/block/nvme.h |  1 +
 hw/block/nvme.c  | 48 +---
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 88af3b42348c..854fb2abb6f8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -784,6 +784,7 @@ typedef struct QEMU_PACKED NvmeSmartLog {
 uint8_t reserved2[320];
 } NvmeSmartLog;
 
+#define NVME_SMART_WARN_MAX 6
 enum NvmeSmartWarn {
 NVME_SMART_SPARE  = 1 << 0,
 NVME_SMART_TEMPERATURE= 1 << 1,
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f0cb7acd7454..09eb1f06e8b1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -980,6 +980,35 @@ static void nvme_enqueue_event(NvmeCtrl *n, uint8_t 
event_type,
 nvme_process_aers(n);
 }
 
+static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
+{
+uint8_t aer_info;
+
+/* Ref SPEC  
*/
+if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
+return;
+}
+
+switch (event) {
+case NVME_SMART_SPARE:
+aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
+break;
+case NVME_SMART_TEMPERATURE:
+aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
+break;
+case NVME_SMART_RELIABILITY:
+case NVME_SMART_MEDIA_READ_ONLY:
+case NVME_SMART_FAILED_VOLATILE_MEDIA:
+case NVME_SMART_PMR_UNRELIABLE:
+aer_info = NVME_AER_INFO_SMART_RELIABILITY;
+break;
+default:
+return;
+}
+
+nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
+}
+
 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
 {
 n->aer_mask &= ~(1 << event_type);
@@ -3317,12 +3346,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-if (((n->temperature >= n->features.temp_thresh_hi) ||
- (n->temperature <= n->features.temp_thresh_low)) &&
-NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) 
{
-nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
-   NVME_AER_INFO_SMART_TEMP_THRESH,
-   NVME_LOG_SMART_INFO);
+if ((n->temperature >= n->features.temp_thresh_hi) ||
+(n->temperature <= n->features.temp_thresh_low)) {
+nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
 }
 
 break;
@@ -4446,7 +4472,7 @@ static void nvme_set_smart_warning(Object *obj, Visitor 
*v, const char *name,
void *opaque, Error **errp)
 {
 NvmeCtrl *n = NVME(obj);
-uint8_t value, cap = 0;
+uint8_t value, old_value, cap = 0, index, event;
 
 if (!visit_type_uint8(v, name, &value, errp)) {
 return;
@@ -4464,7 +4490,15 @@ static void nvme_set_smart_warning(Object *obj, Visitor 
*v, const char *name,
 return;
 }
 
+old_value = n->smart_critical_warning;
 n->smart_critical_warning = value;
+
+/* only inject new bits of smart critical warning */
+for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
+event = 1 << index;
+if (value & ~old_value & event)
+nvme_smart_event(n, event);
+}
 }
 
 static const VMStateDescription nvme_vmstate = {
-- 
2.30.0




[PULL 35/56] nvme: introduce bit 5 for critical warning

2021-02-09 Thread Klaus Jensen
From: zhenwei pi 

According to NVM Express v1.4, Section 5.14.1.2 ("SMART / Health
Information"), introduce bit 5 for "Persistent Memory Region has become
read-only or unreliable".

Signed-off-by: zhenwei pi 
[k.jensen: minor brush ups in commit message]
Signed-off-by: Klaus Jensen 
---
 include/block/nvme.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 45b2678db1f0..41614c5e12af 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -789,6 +789,7 @@ enum NvmeSmartWarn {
 NVME_SMART_RELIABILITY= 1 << 2,
 NVME_SMART_MEDIA_READ_ONLY= 1 << 3,
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
+NVME_SMART_PMR_UNRELIABLE = 1 << 5,
 };
 
 typedef struct NvmeEffectsLog {
-- 
2.30.0




[PULL 46/56] hw/block/nvme: add PMR RDS/WDS support

2021-02-09 Thread Klaus Jensen
From: Naveen Nagar 

Add support for the PMRMSCL and PMRMSCU MMIO registers. This allows
adding RDS/WDS support for PMR as well.

Reviewed-by: Keith Busch 
Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h |   6 ++-
 hw/block/nvme.c | 122 +++-
 2 files changed, 106 insertions(+), 22 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 1cdb360bc549..b7702e937e56 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -151,7 +151,11 @@ typedef struct NvmeCtrl {
 uint16_ttemperature;
 uint8_t smart_critical_warning;
 
-HostMemoryBackend *pmrdev;
+struct {
+HostMemoryBackend *dev;
+bool  cmse;
+hwaddrcba;
+} pmr;
 
 uint8_t aer_mask;
 NvmeRequest **aer_reqs;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d773796051d6..7f1c8dd7751c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -273,6 +273,24 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr 
addr)
 return &n->cmbuf[addr - n->ctrl_mem.addr];
 }
 
+static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
+{
+hwaddr hi;
+
+if (!n->pmr.cmse) {
+return false;
+}
+
+hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
+
+return addr >= n->pmr.cba && addr < hi;
+}
+
+static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
+{
+return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
+}
+
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
 hwaddr hi = addr + size - 1;
@@ -285,6 +303,11 @@ static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
*buf, int size)
 return 0;
 }
 
+if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
+memcpy(buf, nvme_addr_to_pmr(n, addr), size);
+return 0;
+}
+
 return pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
@@ -406,9 +429,27 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, 
QEMUIOVector *iov, hwaddr addr,
 return NVME_SUCCESS;
 }
 
+static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
+size_t len)
+{
+if (!len) {
+return NVME_SUCCESS;
+}
+
+if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
+return NVME_DATA_TRAS_ERROR;
+}
+
+qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
+
+return NVME_SUCCESS;
+}
+
 static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
   hwaddr addr, size_t len)
 {
+bool cmb = false, pmr = false;
+
 if (!len) {
 return NVME_SUCCESS;
 }
@@ -416,6 +457,12 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 trace_pci_nvme_map_addr(addr, len);
 
 if (nvme_addr_is_cmb(n, addr)) {
+cmb = true;
+} else if (nvme_addr_is_pmr(n, addr)) {
+pmr = true;
+}
+
+if (cmb || pmr) {
 if (qsg && qsg->sg) {
 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
 }
@@ -426,7 +473,11 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 qemu_iovec_init(iov, 1);
 }
 
-return nvme_map_addr_cmb(n, iov, addr, len);
+if (cmb) {
+return nvme_map_addr_cmb(n, iov, addr, len);
+} else {
+return nvme_map_addr_pmr(n, iov, addr, len);
+}
 }
 
 if (iov && iov->iov) {
@@ -459,7 +510,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 
 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
 
-if (nvme_addr_is_cmb(n, prp1)) {
+if (nvme_addr_is_cmb(n, prp1) || (nvme_addr_is_pmr(n, prp1))) {
 qemu_iovec_init(iov, num_prps);
 } else {
 pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
@@ -3561,8 +3612,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
 NvmeNamespace *ns;
 int i;
 
-if (n->pmrdev) {
-memory_region_msync(&n->pmrdev->mr, 0, n->pmrdev->size);
+if (n->pmr.dev) {
+memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
 }
 
 for (i = 1; i <= n->num_namespaces; i++) {
@@ -3851,11 +3902,12 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 case 0xE04: /* PMRCTL */
 n->bar.pmrctl = data;
 if (NVME_PMRCTL_EN(data)) {
-memory_region_set_enabled(&n->pmrdev->mr, true);
+memory_region_set_enabled(&n->pmr.dev->mr, true);
 n->bar.pmrsts = 0;
 } else {
-memory_region_set_enabled(&n->pmrdev->mr, false);
+memory_region_set_enabled(&n->pmr.dev->mr, false);
 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
+n->pmr.cmse = false;
 }
 return;
 case 0xE08: /* PMRSTS */
@@ -3870,8 +3922,33 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp

[PULL 41/56] hw/block/nvme: move msix table and pba to BAR 0

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

In the interest of supporting both CMB and PMR to be enabled on the same
device, move the MSI-X table and pending bit array out of BAR 4 and into
BAR 0.

This is a simplified version of the patch contributed by Andrzej
Jakowski (see [1]). Leaving the CMB at offset 0 removes the need for
changes to CMB address mapping code.

  [1]: 
https://lore.kernel.org/qemu-devel/20200729220107.37758-3-andrzej.jakow...@linux.intel.com/

Reviewed-by: Minwoo Im 
Tested-by: Minwoo Im 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h |  1 +
 hw/block/nvme.c | 23 +--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b0d5b6409d8e..1cdb360bc549 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -125,6 +125,7 @@ typedef struct NvmeFeatureVal {
 
 typedef struct NvmeCtrl {
 PCIDeviceparent_obj;
+MemoryRegion bar0;
 MemoryRegion iomem;
 MemoryRegion ctrl_mem;
 NvmeBar  bar;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5f12ac1200ec..85d3c43c4f74 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4268,6 +4268,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
+uint64_t bar_size, msix_table_size, msix_pba_size;
+unsigned msix_table_offset, msix_pba_offset;
 int ret;
 
 Error *err = NULL;
@@ -4286,11 +4288,28 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
 pcie_endpoint_cap_init(pci_dev, 0x80);
 
+bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
+msix_table_offset = bar_size;
+msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
+
+bar_size += msix_table_size;
+bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
+msix_pba_offset = bar_size;
+msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
+
+bar_size += msix_pba_size;
+bar_size = pow2ceil(bar_size);
+
+memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
   n->reg_size);
+memory_region_add_subregion(&n->bar0, 0, &n->iomem);
+
 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
- PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
-ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err);
+ PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
+ret = msix_init(pci_dev, n->params.msix_qsize,
+&n->bar0, 0, msix_table_offset,
+&n->bar0, 0, msix_pba_offset, 0, &err);
 if (ret < 0) {
 if (ret == -ENOTSUP) {
 warn_report_err(err);
-- 
2.30.0




[PULL 36/56] hw/block/nvme: add smart_critical_warning property

2021-02-09 Thread Klaus Jensen
From: zhenwei pi 

There is a very low probability of hitting a physical NVMe disk
hardware critical warning case, so it's hard to write & test a monitoring
agent service.

For debugging purposes, add a new 'smart_critical_warning' property
to emulate this situation.

The original version of this change was implemented by adding a fixed
property which could be initialized by QEMU command line. Suggested
by Philippe & Klaus, rework like current version.

Test with this patch:
1, change smart_critical_warning property for a running VM:
 #virsh qemu-monitor-command nvme-upstream '{ "execute": "qom-set",
  "arguments": { "path": "/machine/peripheral-anon/device[0]",
  "property": "smart_critical_warning", "value":16 } }'
2, run smartctl in guest
 #smartctl -H -l error /dev/nvme0n1

  === START OF SMART DATA SECTION ===
  SMART overall-health self-assessment test result: FAILED!
  - volatile memory backup device has failed

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: zhenwei pi 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h  |  1 +
 include/block/nvme.h |  1 +
 hw/block/nvme.c  | 45 +---
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 347c149e7905..b0d5b6409d8e 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -148,6 +148,7 @@ typedef struct NvmeCtrl {
 uint64_ttimestamp_set_qemu_clock_ms;/* QEMU clock time */
 uint64_tstarttime_ms;
 uint16_ttemperature;
+uint8_t smart_critical_warning;
 
 HostMemoryBackend *pmrdev;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 41614c5e12af..88af3b42348c 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -60,6 +60,7 @@ enum NvmeCapMask {
 #define NVME_CAP_CSS(cap)   (((cap) >> CAP_CSS_SHIFT)& CAP_CSS_MASK)
 #define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
 #define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
+#define NVME_CAP_PMR(cap)   (((cap) >> CAP_PMR_SHIFT)& CAP_PMR_MASK)
 
 #define NVME_CAP_SET_MQES(cap, val)   (cap |= (uint64_t)(val & CAP_MQES_MASK)  
\
<< CAP_MQES_SHIFT)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4d73398798f1..f0cb7acd7454 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2490,6 +2490,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 }
 
 trans_len = MIN(sizeof(smart) - off, buf_len);
+smart.critical_warning = n->smart_critical_warning;
 
 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
 1000));
@@ -4432,6 +4433,40 @@ static Property nvme_props[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
+   void *opaque, Error **errp)
+{
+NvmeCtrl *n = NVME(obj);
+uint8_t value = n->smart_critical_warning;
+
+visit_type_uint8(v, name, &value, errp);
+}
+
+static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
+   void *opaque, Error **errp)
+{
+NvmeCtrl *n = NVME(obj);
+uint8_t value, cap = 0;
+
+if (!visit_type_uint8(v, name, &value, errp)) {
+return;
+}
+
+cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
+  | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
+if (NVME_CAP_PMR(n->bar.cap)) {
+cap |= NVME_SMART_PMR_UNRELIABLE;
+}
+
+if ((value & cap) != value) {
+error_setg(errp, "unsupported smart critical warning bits: 0x%x",
+   value & ~cap);
+return;
+}
+
+n->smart_critical_warning = value;
+}
+
 static const VMStateDescription nvme_vmstate = {
 .name = "nvme",
 .unmigratable = 1,
@@ -4455,13 +4490,17 @@ static void nvme_class_init(ObjectClass *oc, void *data)
 
 static void nvme_instance_init(Object *obj)
 {
-NvmeCtrl *s = NVME(obj);
+NvmeCtrl *n = NVME(obj);
 
-if (s->namespace.blkconf.blk) {
-device_add_bootindex_property(obj, &s->namespace.blkconf.bootindex,
+if (n->namespace.blkconf.blk) {
+device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
   "bootindex", "/namespace@1,0",
   DEVICE(obj));
 }
+
+object_property_add(obj, "smart_critical_warning", "uint8",
+nvme_get_smart_warning,
+nvme_set_smart_warning, NULL, NULL);
 }
 
 static const TypeInfo nvme_info = {
-- 
2.30.0




[PULL 42/56] hw/block/nvme: allow cmb and pmr to coexist

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

With BAR 4 now free to use, allow PMR and CMB to be enabled
simultaneously.

Reviewed-by: Minwoo Im 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 85d3c43c4f74..4ce75642f1a4 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -29,14 +29,13 @@
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
  *
- * cmb_size_mb= and pmrdev= options are mutually exclusive due to limitation
- * in available BAR's. cmb_size_mb= will take precedence over pmrdev= when
- * both provided.
  * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
  * For example:
  * -object memory-backend-file,id=,share=on,mem-path=, \
  *  size=  -device nvme,...,pmrdev=
  *
+ * The PMR will use BAR 4/5 exclusively.
+ *
  *
  * nvme device parameters
  * ~~
@@ -109,7 +108,7 @@
 #define NVME_DB_SIZE  4
 #define NVME_SPEC_VER 0x00010300
 #define NVME_CMB_BIR 2
-#define NVME_PMR_BIR 2
+#define NVME_PMR_BIR 4
 #define NVME_TEMPERATURE 0x143
 #define NVME_TEMPERATURE_WARNING 0x157
 #define NVME_TEMPERATURE_CRITICAL 0x175
@@ -4121,7 +4120,7 @@ static void nvme_check_constraints(NvmeCtrl *n, Error 
**errp)
 return;
 }
 
-if (!n->params.cmb_size_mb && n->pmrdev) {
+if (n->pmrdev) {
 if (host_memory_backend_is_mapped(n->pmrdev)) {
 error_setg(errp, "can't use already busy memdev: %s",
object_get_canonical_path_component(OBJECT(n->pmrdev)));
@@ -4218,9 +4217,6 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
 
 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
 {
-/* Controller Capabilities register */
-NVME_CAP_SET_PMRS(n->bar.cap, 1);
-
 /* PMR Capabities register */
 n->bar.pmrcap = 0;
 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 0);
@@ -4321,7 +4317,9 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, 
Error **errp)
 
 if (n->params.cmb_size_mb) {
 nvme_init_cmb(n, pci_dev);
-} else if (n->pmrdev) {
+}
+
+if (n->pmrdev) {
 nvme_init_pmr(n, pci_dev);
 }
 
@@ -4394,6 +4392,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
 NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
+NVME_CAP_SET_PMRS(n->bar.cap, n->pmrdev ? 1 : 0);
 
 n->bar.vs = NVME_SPEC_VER;
 n->bar.intmc = n->bar.intms = 0;
-- 
2.30.0




[PULL 34/56] hw/block/nvme: fix zone write finalize

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

The zone write pointer is unconditionally advanced, even for write
faults. Make sure that the zone is always transitioned to Full if the
write pointer reaches zone capacity.

Cc: Dmitry Fomichev 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 30bd70fd5b07..4d73398798f1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1268,10 +1268,13 @@ static void nvme_finalize_zoned_write(NvmeNamespace 
*ns, NvmeRequest *req,
 nlb = le16_to_cpu(rw->nlb) + 1;
 zone = nvme_get_zone_by_slba(ns, slba);
 
+zone->d.wp += nlb;
+
 if (failed) {
 res->slba = 0;
-zone->d.wp += nlb;
-} else if (zone->w_ptr == nvme_zone_wr_boundary(zone)) {
+}
+
+if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
 switch (nvme_get_zone_state(zone)) {
 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1288,9 +1291,6 @@ static void nvme_finalize_zoned_write(NvmeNamespace *ns, 
NvmeRequest *req,
 default:
 assert(false);
 }
-zone->d.wp = zone->w_ptr;
-} else {
-zone->d.wp += nlb;
 }
 }
 
-- 
2.30.0




[PULL 32/56] hw/block/nvme: split setup and register for namespace

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

In NVMe, namespace is being attached to process I/O.  We register NVMe
namespace to a controller via nvme_register_namespace() during
nvme_ns_setup().  This is the main reason for passing the NvmeCtrl object
instance to this function: to map the namespace to a controller.

To make namespace instance more independent, it should be split into two
parts: setup and register.  This patch splits them into two different
parts, and finally nvme_ns_setup() has nothing to do with the
NvmeCtrl instance at all.

This patch is a former patch to introduce NVMe subsystem scheme to the
existing design especially for multi-path.  In that case, it should be
split into two to make namespace independent from a controller.

Signed-off-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 17e876e6bc44..ce79ad4a5319 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -321,10 +321,6 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
 nvme_ns_init_zoned(ns, 0);
 }
 
-if (nvme_register_namespace(n, ns, errp)) {
-return -1;
-}
-
 return 0;
 }
 
@@ -362,6 +358,13 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 "could not setup namespace: ");
 return;
 }
+
+if (nvme_register_namespace(n, ns, errp)) {
+error_propagate_prepend(errp, local_err,
+"could not register namespace: ");
+return;
+}
+
 }
 
 static Property nvme_ns_props[] = {
-- 
2.30.0




[PULL 31/56] hw/block/nvme: remove unused argument in nvme_ns_init_blk

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

Remove the no-longer-used argument (NvmeCtrl object) in nvme_ns_init_blk().

Signed-off-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7a5a77983798..17e876e6bc44 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -66,7 +66,7 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 return 0;
 }
 
-static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+static int nvme_ns_init_blk(NvmeNamespace *ns, Error **errp)
 {
 bool read_only;
 
@@ -307,7 +307,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
 return -1;
 }
 
-if (nvme_ns_init_blk(n, ns, errp)) {
+if (nvme_ns_init_blk(ns, errp)) {
 return -1;
 }
 
-- 
2.30.0




[PULL 38/56] hw/block/nvme: add size to mmio read/write trace events

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Add the size of the mmio read/write to the trace event.

Reviewed-by: Minwoo Im 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 4 ++--
 hw/block/trace-events | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 09eb1f06e8b1..2407b6578abc 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3878,7 +3878,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
 uint8_t *ptr = (uint8_t *)&n->bar;
 uint64_t val = 0;
 
-trace_pci_nvme_mmio_read(addr);
+trace_pci_nvme_mmio_read(addr, size);
 
 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
@@ -4042,7 +4042,7 @@ static void nvme_mmio_write(void *opaque, hwaddr addr, 
uint64_t data,
 {
 NvmeCtrl *n = (NvmeCtrl *)opaque;
 
-trace_pci_nvme_mmio_write(addr, data);
+trace_pci_nvme_mmio_write(addr, data, size);
 
 if (addr < sizeof(n->bar)) {
 nvme_write_bar(n, addr, data, size);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 6d1686e6dc9d..3772502033af 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -80,8 +80,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) 
"cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
-pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
-pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 
0x%"PRIx64""
+pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
+pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 
0x%"PRIx64" data 0x%"PRIx64" size %d"
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" 
new_head %"PRIu16""
 pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" 
new_tail %"PRIu16""
 pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, 
interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
-- 
2.30.0




[PULL 48/56] hw/block/nvme: bump to v1.4

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

With the new CMB logic in place, bump the implemented specification
version to v1.4 by default.

This requires setting the CNTRLTYPE field and modifying the
VWC field, since 0x00 is no longer a valid value for bits 2:1.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 include/block/nvme.h | 3 ++-
 hw/block/nvme.c  | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 2e85b97a6c4e..07cfc929368b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -951,7 +951,8 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint32_trtd3e;
 uint32_toaes;
 uint32_tctratt;
-uint8_t rsvd100[12];
+uint8_t rsvd100[11];
+uint8_t cntrltype;
 uint8_t fguid[16];
 uint8_t rsvd128[128];
 uint16_toacs;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 1e13d25b0887..c4c968f5951e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -108,7 +108,7 @@
 
 #define NVME_MAX_IOQPAIRS 0x
 #define NVME_DB_SIZE  4
-#define NVME_SPEC_VER 0x00010300
+#define NVME_SPEC_VER 0x00010400
 #define NVME_CMB_BIR 2
 #define NVME_PMR_BIR 4
 #define NVME_TEMPERATURE 0x143
@@ -4450,6 +4450,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->mdts = n->params.mdts;
 id->ver = cpu_to_le32(NVME_SPEC_VER);
 id->oacs = cpu_to_le16(0);
+id->cntrltype = 0x1;
 
 /*
  * Because the controller always completes the Abort command immediately,
@@ -4478,7 +4479,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
NVME_ONCS_FEATURES | NVME_ONCS_DSM |
NVME_ONCS_COMPARE);
 
-id->vwc = 0x1;
+id->vwc = (0x2 << 1) | 0x1;
 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
NVME_CTRL_SGLS_BITBUCKET);
 
-- 
2.30.0




[PULL 33/56] hw/block/nvme: remove unused argument in nvme_ns_setup

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

nvme_ns_setup() no longer has anything to do with the NvmeCtrl
instance.

Signed-off-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h | 2 +-
 hw/block/nvme-ns.c | 4 ++--
 hw/block/nvme.c| 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index a0baa5f6d44c..293ac990e3f6 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -174,7 +174,7 @@ static inline void nvme_aor_dec_active(NvmeNamespace *ns)
 assert(ns->nr_active_zones >= 0);
 }
 
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
 void nvme_ns_cleanup(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index ce79ad4a5319..3f52acb89c95 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -301,7 +301,7 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, 
Error **errp)
 return 0;
 }
 
-int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
 {
 if (nvme_ns_check_constraints(ns, errp)) {
 return -1;
@@ -353,7 +353,7 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
 NvmeCtrl *n = NVME(s->parent);
 Error *local_err = NULL;
 
-if (nvme_ns_setup(n, ns, &local_err)) {
+if (nvme_ns_setup(ns, &local_err)) {
 error_propagate_prepend(errp, local_err,
 "could not setup namespace: ");
 return;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0b002cb2beab..30bd70fd5b07 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -4377,7 +4377,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 ns = &n->namespace;
 ns->params.nsid = 1;
 
-if (nvme_ns_setup(n, ns, errp)) {
+if (nvme_ns_setup(ns, errp)) {
 return;
 }
 }
-- 
2.30.0




[PULL 29/56] hw/block/nvme: remove unused argument in nvme_ns_init_zoned

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

nvme_ns_init_zoned() has no use for the given NvmeCtrl object.

Signed-off-by: Minwoo Im 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 9be170abb78d..d35c2925ecb8 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -205,7 +205,7 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 }
 }
 
-static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace *ns, int lba_index)
+static void nvme_ns_init_zoned(NvmeNamespace *ns, int lba_index)
 {
 NvmeIdNsZoned *id_ns_z;
 
@@ -322,7 +322,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
 if (nvme_ns_zoned_check_calc_geometry(ns, errp) != 0) {
 return -1;
 }
-nvme_ns_init_zoned(n, ns, 0);
+nvme_ns_init_zoned(ns, 0);
 }
 
 if (nvme_register_namespace(n, ns, errp)) {
-- 
2.30.0




[PULL 45/56] hw/block/nvme: disable PMR at boot up

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

The PMR should not be enabled at boot up. Disable the PMR MemoryRegion
initially and implement MMIO for PMRCTL, allowing the host to enable the
PMR explicitly.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f8dd771925f9..d773796051d6 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3848,8 +3848,16 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
"invalid write to PMRCAP register, ignored");
 return;
-case 0xE04: /* TODO PMRCTL */
-break;
+case 0xE04: /* PMRCTL */
+n->bar.pmrctl = data;
+if (NVME_PMRCTL_EN(data)) {
+memory_region_set_enabled(&n->pmrdev->mr, true);
+n->bar.pmrsts = 0;
+} else {
+memory_region_set_enabled(&n->pmrdev->mr, false);
+NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
+}
+return;
 case 0xE08: /* PMRSTS */
 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
"invalid write to PMRSTS register, ignored");
@@ -4225,6 +4233,8 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
  PCI_BASE_ADDRESS_SPACE_MEMORY |
  PCI_BASE_ADDRESS_MEM_TYPE_64 |
  PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
+
+memory_region_set_enabled(&n->pmrdev->mr, false);
 }
 
 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
-- 
2.30.0




[PULL 30/56] hw/block/nvme: open code for volatile write cache

2021-02-09 Thread Klaus Jensen
From: Minwoo Im 

The Volatile Write Cache (VWC) feature was previously set up in
nvme_ns_setup(). This feature relates to the backing block device, but
it is controlled at the controller level via the Set/Get Features commands.

This patch removes the dependency between nvme and nvme-ns for managing
the VWC flag value. It also open codes the Get Features handling for VWC
to check all namespaces attached to the controller, reporting the feature
as enabled if the write cache is enabled on any of them.

Signed-off-by: Minwoo Im 
[k.jensen: report write cache present if present on ANY namespace]
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h|  1 -
 hw/block/nvme-ns.c |  4 
 hw/block/nvme.c| 15 ---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 65540b650e1d..347c149e7905 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -121,7 +121,6 @@ typedef struct NvmeFeatureVal {
 uint16_t temp_thresh_low;
 };
 uint32_tasync_config;
-uint32_tvwc;
 } NvmeFeatureVal;
 
 typedef struct NvmeCtrl {
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index d35c2925ecb8..7a5a77983798 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -90,10 +90,6 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
-if (blk_enable_write_cache(ns->blkconf.blk)) {
-n->features.vwc = 0x1;
-}
-
 return 0;
 }
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 35f39ecd9559..0b002cb2beab 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3097,6 +3097,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
 uint16_t iv;
 NvmeNamespace *ns;
+int i;
 
 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -3172,7 +3173,17 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
NvmeRequest *req)
 result = ns->features.err_rec;
 goto out;
 case NVME_VOLATILE_WRITE_CACHE:
-result = n->features.vwc;
+for (i = 1; i <= n->num_namespaces; i++) {
+ns = nvme_ns(n, i);
+if (!ns) {
+continue;
+}
+
+result = blk_enable_write_cache(ns->blkconf.blk);
+if (result) {
+break;
+}
+}
 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
 goto out;
 case NVME_ASYNCHRONOUS_EVENT_CONF:
@@ -3335,8 +3346,6 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 ns->features.err_rec = dw11;
 break;
 case NVME_VOLATILE_WRITE_CACHE:
-n->features.vwc = dw11 & 0x1;
-
 for (i = 1; i <= n->num_namespaces; i++) {
 ns = nvme_ns(n, i);
 if (!ns) {
-- 
2.30.0




[PULL 28/56] hw/block/nvme: Correct error status for unaligned ZA

2021-02-09 Thread Klaus Jensen
From: Dmitry Fomichev 

TP 4053 says (in section 2.3.1.1) -
... if a Zone Append command specifies a ZSLBA that is not the lowest
logical block address in that zone, then the controller shall abort
that command with a status code of Invalid Field In Command.

In the code, Zone Invalid Write is returned instead, fix this.

Signed-off-by: Dmitry Fomichev 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c73afdf8054f..35f39ecd9559 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1150,7 +1150,7 @@ static uint16_t nvme_check_zone_write(NvmeCtrl *n, 
NvmeNamespace *ns,
 if (append) {
 if (unlikely(slba != zone->d.zslba)) {
 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
-status = NVME_ZONE_INVALID_WRITE;
+status = NVME_INVALID_FIELD;
 }
 if (nvme_l2b(ns, nlb) > (n->page_size << n->zasl)) {
 trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
-- 
2.30.0




[PULL 27/56] hw/block/nvme: remove unnecessary check for append

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

nvme_io_cmd already checks if the namespace supports the Zone Append
command, so the removed check is dead code.

Signed-off-by: Klaus Jensen 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
---
 hw/block/nvme.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7222eff755ee..c73afdf8054f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1707,10 +1707,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 }
 
 res->slba = nvme_advance_zone_wp(ns, zone, nlb);
-} else if (append) {
-trace_pci_nvme_err_invalid_opc(rw->opcode);
-status = NVME_INVALID_OPCODE;
-goto invalid;
 }
 
 data_offset = nvme_l2b(ns, slba);
-- 
2.30.0




[PULL 23/56] hw/block/nvme: merge implicitly/explicitly opened processing masks

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Implicitly and explicitly opened zones are always bulk-processed
together, so merge the two processing masks.

Signed-off-by: Klaus Jensen 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
---
 hw/block/nvme.c | 27 +++
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 551878338e5d..a7245a7e05a1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1740,11 +1740,10 @@ typedef uint16_t (*op_handler_t)(NvmeNamespace *, 
NvmeZone *,
 
 enum NvmeZoneProcessingMask {
 NVME_PROC_CURRENT_ZONE= 0,
-NVME_PROC_IMP_OPEN_ZONES  = 1 << 0,
-NVME_PROC_EXP_OPEN_ZONES  = 1 << 1,
-NVME_PROC_CLOSED_ZONES= 1 << 2,
-NVME_PROC_READ_ONLY_ZONES = 1 << 3,
-NVME_PROC_FULL_ZONES  = 1 << 4,
+NVME_PROC_OPENED_ZONES= 1 << 0,
+NVME_PROC_CLOSED_ZONES= 1 << 1,
+NVME_PROC_READ_ONLY_ZONES = 1 << 2,
+NVME_PROC_FULL_ZONES  = 1 << 3,
 };
 
 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
@@ -1885,10 +1884,8 @@ static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 
 switch (zs) {
 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-proc_zone = proc_mask & NVME_PROC_IMP_OPEN_ZONES;
-break;
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-proc_zone = proc_mask & NVME_PROC_EXP_OPEN_ZONES;
+proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
 break;
 case NVME_ZONE_STATE_CLOSED:
 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
@@ -1929,15 +1926,14 @@ static uint16_t nvme_do_zone_op(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 }
 }
-if (proc_mask & NVME_PROC_IMP_OPEN_ZONES) {
+if (proc_mask & NVME_PROC_OPENED_ZONES) {
 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
 if (status != NVME_SUCCESS) {
 goto out;
 }
 }
-}
-if (proc_mask & NVME_PROC_EXP_OPEN_ZONES) {
+
 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr);
 if (status != NVME_SUCCESS) {
@@ -2012,7 +2008,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
NvmeRequest *req)
 
 case NVME_ZONE_ACTION_CLOSE:
 if (all) {
-proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES;
+proc_mask = NVME_PROC_OPENED_ZONES;
 }
 trace_pci_nvme_close_zone(slba, zone_idx, all);
 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone);
@@ -2020,8 +2016,7 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
NvmeRequest *req)
 
 case NVME_ZONE_ACTION_FINISH:
 if (all) {
-proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES |
-NVME_PROC_CLOSED_ZONES;
+proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
 }
 trace_pci_nvme_finish_zone(slba, zone_idx, all);
 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone);
@@ -2029,8 +2024,8 @@ static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, 
NvmeRequest *req)
 
 case NVME_ZONE_ACTION_RESET:
 if (all) {
-proc_mask = NVME_PROC_IMP_OPEN_ZONES | NVME_PROC_EXP_OPEN_ZONES |
-NVME_PROC_CLOSED_ZONES | NVME_PROC_FULL_ZONES;
+proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
+NVME_PROC_FULL_ZONES;
 }
 trace_pci_nvme_reset_zone(slba, zone_idx, all);
 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone);
-- 
2.30.0




[PULL 25/56] hw/block/nvme: zero out zones on reset

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

The zoned command set specification states that "All logical blocks in a
zone *shall* be marked as deallocated when [the zone is reset]". Since
the device guarantees 0x00 to be read from deallocated blocks we have to
issue a pwrite_zeroes since we cannot be sure that a discard will do
anything. But typically, this will be achieved with an efficient
unmap/discard operation.

Signed-off-by: Klaus Jensen 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
---
 hw/block/nvme.c   | 150 +++---
 hw/block/trace-events |   1 +
 2 files changed, 113 insertions(+), 38 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a5cf798bbbaa..7222eff755ee 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1371,6 +1371,53 @@ static void nvme_aio_discard_cb(void *opaque, int ret)
 nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+struct nvme_zone_reset_ctx {
+NvmeRequest *req;
+NvmeZone*zone;
+};
+
+static void nvme_aio_zone_reset_cb(void *opaque, int ret)
+{
+struct nvme_zone_reset_ctx *ctx = opaque;
+NvmeRequest *req = ctx->req;
+NvmeNamespace *ns = req->ns;
+NvmeZone *zone = ctx->zone;
+uintptr_t *resets = (uintptr_t *)&req->opaque;
+
+g_free(ctx);
+
+trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
+
+if (!ret) {
+switch (nvme_get_zone_state(zone)) {
+case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+nvme_aor_dec_open(ns);
+/* fall through */
+case NVME_ZONE_STATE_CLOSED:
+nvme_aor_dec_active(ns);
+/* fall through */
+case NVME_ZONE_STATE_FULL:
+zone->w_ptr = zone->d.zslba;
+zone->d.wp = zone->w_ptr;
+nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
+/* fall through */
+default:
+break;
+}
+} else {
+nvme_aio_err(req, ret);
+}
+
+(*resets)--;
+
+if (*resets) {
+return;
+}
+
+nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
 struct nvme_compare_ctx {
 QEMUIOVector iov;
 uint8_t *bounce;
@@ -1735,7 +1782,8 @@ static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace 
*ns, NvmeCmd *c,
 return NVME_SUCCESS;
 }
 
-typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState);
+typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
+ NvmeRequest *);
 
 enum NvmeZoneProcessingMask {
 NVME_PROC_CURRENT_ZONE= 0,
@@ -1746,7 +1794,7 @@ enum NvmeZoneProcessingMask {
 };
 
 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
-   NvmeZoneState state)
+   NvmeZoneState state, NvmeRequest *req)
 {
 uint16_t status;
 
@@ -1779,7 +1827,7 @@ static uint16_t nvme_open_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 
 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
-NvmeZoneState state)
+NvmeZoneState state, NvmeRequest *req)
 {
 switch (state) {
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1795,7 +1843,7 @@ static uint16_t nvme_close_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 
 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
- NvmeZoneState state)
+ NvmeZoneState state, NvmeRequest *req)
 {
 switch (state) {
 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
@@ -1818,30 +1866,42 @@ static uint16_t nvme_finish_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 
 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
-NvmeZoneState state)
+NvmeZoneState state, NvmeRequest *req)
 {
+uintptr_t *resets = (uintptr_t *)&req->opaque;
+struct nvme_zone_reset_ctx *ctx;
+
 switch (state) {
-case NVME_ZONE_STATE_EXPLICITLY_OPEN:
-case NVME_ZONE_STATE_IMPLICITLY_OPEN:
-nvme_aor_dec_open(ns);
-/* fall through */
-case NVME_ZONE_STATE_CLOSED:
-nvme_aor_dec_active(ns);
-/* fall through */
-case NVME_ZONE_STATE_FULL:
-zone->w_ptr = zone->d.zslba;
-zone->d.wp = zone->w_ptr;
-nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
-/* fall through */
 case NVME_ZONE_STATE_EMPTY:
 return NVME_SUCCESS;
+case NVME_ZONE_STATE_EXPLICITLY_OPEN:
+case NVME_ZONE_STATE_IMPLICITLY_OPEN:
+case NVME_ZONE_STATE_CLOSED:
+case NVME_ZONE_STATE_FULL:
+break;
 default:
 return NVME_ZONE_INVAL_TRANSITION;
 }
+
+/*
+ * The zone reset aio callback needs to know the zone that is being reset
+ * in order to transition the zone on completion.
+ */
+ctx = g_new(struct nvme_zone_reset_ctx, 1);
+ctx->req = req;
+ctx->zone = zone;
+
+   

[PULL 39/56] hw/block/nvme: fix 64 bit register hi/lo split writes

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

64 bit registers like ASQ and ACQ should be writable by both a hi/lo 32
bit write combination as well as a plain 64 bit write. The spec does not
define ordering on the hi/lo split, but the code currently assumes that
the low order bits are written first. Additionally, the code does not
consider that another address might already have been written into the
register, causing the OR'ing to result in a bad address.

Fix this by explicitly overwriting only the low or high order bits for
32 bit writes.

Signed-off-by: Klaus Jensen 
Reviewed-by: Keith Busch 
---
 hw/block/nvme.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 2407b6578abc..2785127037db 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3819,19 +3819,21 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 trace_pci_nvme_mmio_aqattr(data & 0x);
 break;
 case 0x28:  /* ASQ */
-n->bar.asq = data;
+n->bar.asq = size == 8 ? data :
+(n->bar.asq & ~0xULL) | (data & 0x);
 trace_pci_nvme_mmio_asqaddr(data);
 break;
 case 0x2c:  /* ASQ hi */
-n->bar.asq |= data << 32;
+n->bar.asq = (n->bar.asq & 0x) | (data << 32);
 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
 break;
 case 0x30:  /* ACQ */
 trace_pci_nvme_mmio_acqaddr(data);
-n->bar.acq = data;
+n->bar.acq = size == 8 ? data :
+(n->bar.acq & ~0xULL) | (data & 0x);
 break;
 case 0x34:  /* ACQ hi */
-n->bar.acq |= data << 32;
+n->bar.acq = (n->bar.acq & 0x) | (data << 32);
 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
 break;
 case 0x38:  /* CMBLOC */
-- 
2.30.0




[PULL 20/56] hw/block/nvme: fix for non-msix machines

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Commit 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar()
return value") had the unintended effect of breaking support on
several platforms not supporting MSI-X.

Still check for errors, but only report that MSI-X is unsupported
instead of bailing out.

Fixes: 1c0c2163aa08 ("hw/block/nvme: verify msix_init_exclusive_bar() return 
value")
Fixes: fbf2e5375e33 ("hw/block/nvme: Verify msix_vector_use() returned value")
Reported-by: Guenter Roeck 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f4f1487afeb1..b0b7abf3312e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2590,7 +2590,9 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
 {
 n->cq[cq->cqid] = NULL;
 timer_free(cq->timer);
-msix_vector_unuse(&n->parent_obj, cq->vector);
+if (msix_enabled(&n->parent_obj)) {
+msix_vector_unuse(&n->parent_obj, cq->vector);
+}
 if (cq->cqid) {
 g_free(cq);
 }
@@ -2624,8 +2626,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, 
uint64_t dma_addr,
 {
 int ret;
 
-ret = msix_vector_use(&n->parent_obj, vector);
-assert(ret == 0);
+if (msix_enabled(&n->parent_obj)) {
+ret = msix_vector_use(&n->parent_obj, vector);
+assert(ret == 0);
+}
 cq->ctrl = n;
 cq->cqid = cqid;
 cq->size = size;
@@ -4161,9 +4165,12 @@ static void nvme_init_pmr(NvmeCtrl *n, PCIDevice 
*pci_dev)
  PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmrdev->mr);
 }
 
-static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
+static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
 {
 uint8_t *pci_conf = pci_dev->config;
+int ret;
+
+Error *err = NULL;
 
 pci_conf[PCI_INTERRUPT_PIN] = 1;
 pci_config_set_prog_interface(pci_conf, 0x2);
@@ -4183,8 +4190,14 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
   n->reg_size);
 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
  PCI_BASE_ADDRESS_MEM_TYPE_64, &n->iomem);
-if (msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, errp)) {
-return;
+ret = msix_init_exclusive_bar(pci_dev, n->params.msix_qsize, 4, &err);
+if (ret < 0) {
+if (ret == -ENOTSUP) {
+warn_report_err(err);
+} else {
+error_propagate(errp, err);
+return ret;
+}
 }
 
 if (n->params.cmb_size_mb) {
@@ -4192,6 +4205,8 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 } else if (n->pmrdev) {
 nvme_init_pmr(n, pci_dev);
 }
+
+return 0;
 }
 
 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
@@ -4280,9 +4295,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 &pci_dev->qdev, n->parent_obj.qdev.id);
 
 nvme_init_state(n);
-nvme_init_pci(n, pci_dev, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
+if (nvme_init_pci(n, pci_dev, errp)) {
 return;
 }
 
-- 
2.30.0




[PULL 17/56] hw/block/nvme: Introduce max active and open zone limits

2021-02-09 Thread Klaus Jensen
From: Dmitry Fomichev 

Add two module properties, "zoned.max_active" and "zoned.max_open"
to control the maximum number of zones that can be active or open.
Once these variables are set to non-default values, these limits are
checked during I/O and Too Many Active or Too Many Open command status
is returned if they are exceeded.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h| 41 +++
 hw/block/nvme-ns.c| 31 ++-
 hw/block/nvme.c   | 92 +++
 hw/block/trace-events |  2 +
 4 files changed, 164 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 388381dda0df..7e1fd26909ba 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -33,6 +33,8 @@ typedef struct NvmeNamespaceParams {
 bool cross_zone_read;
 uint64_t zone_size_bs;
 uint64_t zone_cap_bs;
+uint32_t max_active_zones;
+uint32_t max_open_zones;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -54,6 +56,8 @@ typedef struct NvmeNamespace {
 uint64_tzone_size;
 uint64_tzone_capacity;
 uint32_tzone_size_log2;
+int32_t nr_open_zones;
+int32_t nr_active_zones;
 
 NvmeNamespaceParams params;
 
@@ -125,6 +129,43 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline void nvme_aor_inc_open(NvmeNamespace *ns)
+{
+assert(ns->nr_open_zones >= 0);
+if (ns->params.max_open_zones) {
+ns->nr_open_zones++;
+assert(ns->nr_open_zones <= ns->params.max_open_zones);
+}
+}
+
+static inline void nvme_aor_dec_open(NvmeNamespace *ns)
+{
+if (ns->params.max_open_zones) {
+assert(ns->nr_open_zones > 0);
+ns->nr_open_zones--;
+}
+assert(ns->nr_open_zones >= 0);
+}
+
+static inline void nvme_aor_inc_active(NvmeNamespace *ns)
+{
+assert(ns->nr_active_zones >= 0);
+if (ns->params.max_active_zones) {
+ns->nr_active_zones++;
+assert(ns->nr_active_zones <= ns->params.max_active_zones);
+}
+}
+
+static inline void nvme_aor_dec_active(NvmeNamespace *ns)
+{
+if (ns->params.max_active_zones) {
+assert(ns->nr_active_zones > 0);
+ns->nr_active_zones--;
+assert(ns->nr_active_zones >= ns->nr_open_zones);
+}
+assert(ns->nr_active_zones >= 0);
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index d79452c627cf..c55afc1920a3 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -135,6 +135,21 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 ns->zone_size = zone_size / lbasz;
 ns->zone_capacity = zone_cap / lbasz;
 ns->num_zones = ns->size / lbasz / ns->zone_size;
+
+/* Do a few more sanity checks of ZNS properties */
+if (ns->params.max_open_zones > ns->num_zones) {
+error_setg(errp,
+   "max_open_zones value %u exceeds the number of zones %u",
+   ns->params.max_open_zones, ns->num_zones);
+return -1;
+}
+if (ns->params.max_active_zones > ns->num_zones) {
+error_setg(errp,
+   "max_active_zones value %u exceeds the number of zones %u",
+   ns->params.max_active_zones, ns->num_zones);
+return -1;
+}
+
 return 0;
 }
 
@@ -182,8 +197,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index)
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
 /* MAR/MOR are zeroes-based, 0x means no limit */
-id_ns_z->mar = 0x;
-id_ns_z->mor = 0x;
+id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
+id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
@@ -209,6 +224,7 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
 }
+nvme_aor_inc_active(ns);
 QTAILQ_INSERT_HEAD(&ns->closed_zones, zone, entry);
 } else {
 trace_pci_nvme_clear_ns_reset(state, zone->d.zslba);
@@ -225,16 +241,23 @@ static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
 
 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
+nvme_aor_dec_active(ns);
 nvme_clear_zone(ns, zone);
 }
 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
+nvme_aor_dec_open(ns);
+nvme_aor_dec_active(ns);
 n

[PULL 18/56] hw/block/nvme: Support Zone Descriptor Extensions

2021-02-09 Thread Klaus Jensen
From: Dmitry Fomichev 

Zone Descriptor Extension is a label that can be assigned to a zone.
It can be set to an Empty zone and it stays assigned until the zone
is reset.

This commit adds a new optional module property,
"zoned.descr_ext_size". Its value must be a multiple of 64 bytes.
If this value is non-zero, it becomes possible to assign extensions
of that size to any Empty zones. The default value for this property
is 0, therefore setting extensions is disabled by default.

Signed-off-by: Hans Holmberg 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|  8 +++
 hw/block/nvme-ns.c| 25 ++--
 hw/block/nvme.c   | 53 +--
 hw/block/trace-events |  2 ++
 4 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 7e1fd26909ba..f8f3c28c360b 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -35,6 +35,7 @@ typedef struct NvmeNamespaceParams {
 uint64_t zone_cap_bs;
 uint32_t max_active_zones;
 uint32_t max_open_zones;
+uint32_t zd_extension_size;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -56,6 +57,7 @@ typedef struct NvmeNamespace {
 uint64_tzone_size;
 uint64_tzone_capacity;
 uint32_tzone_size_log2;
+uint8_t *zd_extensions;
 int32_t nr_open_zones;
 int32_t nr_active_zones;
 
@@ -129,6 +131,12 @@ static inline bool nvme_wp_is_valid(NvmeZone *zone)
st != NVME_ZONE_STATE_OFFLINE;
 }
 
+static inline uint8_t *nvme_get_zd_extension(NvmeNamespace *ns,
+ uint32_t zone_idx)
+{
+return &ns->zd_extensions[zone_idx * ns->params.zd_extension_size];
+}
+
 static inline void nvme_aor_inc_open(NvmeNamespace *ns)
 {
 assert(ns->nr_open_zones >= 0);
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index c55afc1920a3..838b15c064f5 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -150,6 +150,18 @@ static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace 
*ns, Error **errp)
 return -1;
 }
 
+if (ns->params.zd_extension_size) {
+if (ns->params.zd_extension_size & 0x3f) {
+error_setg(errp,
+"zone descriptor extension size must be a multiple of 64B");
+return -1;
+}
+if ((ns->params.zd_extension_size >> 6) > 0xff) {
+error_setg(errp, "zone descriptor extension size is too large");
+return -1;
+}
+}
+
 return 0;
 }
 
@@ -161,6 +173,10 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
 int i;
 
 ns->zone_array = g_new0(NvmeZone, ns->num_zones);
+if (ns->params.zd_extension_size) {
+ns->zd_extensions = g_malloc0(ns->params.zd_extension_size *
+  ns->num_zones);
+}
 
 QTAILQ_INIT(&ns->exp_open_zones);
 QTAILQ_INIT(&ns->imp_open_zones);
@@ -203,7 +219,8 @@ static void nvme_ns_init_zoned(NvmeCtrl *n, NvmeNamespace 
*ns, int lba_index)
 id_ns_z->ozcs = ns->params.cross_zone_read ? 0x01 : 0x00;
 
 id_ns_z->lbafe[lba_index].zsze = cpu_to_le64(ns->zone_size);
-id_ns_z->lbafe[lba_index].zdes = 0;
+id_ns_z->lbafe[lba_index].zdes =
+ns->params.zd_extension_size >> 6; /* Units of 64B */
 
 ns->csi = NVME_CSI_ZONED;
 ns->id_ns.nsze = cpu_to_le64(ns->num_zones * ns->zone_size);
@@ -219,7 +236,8 @@ static void nvme_clear_zone(NvmeNamespace *ns, NvmeZone 
*zone)
 
 zone->w_ptr = zone->d.wp;
 state = nvme_get_zone_state(zone);
-if (zone->d.wp != zone->d.zslba) {
+if (zone->d.wp != zone->d.zslba ||
+(zone->d.za & NVME_ZA_ZD_EXT_VALID)) {
 if (state != NVME_ZONE_STATE_CLOSED) {
 trace_pci_nvme_clear_ns_close(state, zone->d.zslba);
 nvme_set_zone_state(zone, NVME_ZONE_STATE_CLOSED);
@@ -315,6 +333,7 @@ void nvme_ns_cleanup(NvmeNamespace *ns)
 if (ns->params.zoned) {
 g_free(ns->id_ns_zoned);
 g_free(ns->zone_array);
+g_free(ns->zd_extensions);
 }
 }
 
@@ -347,6 +366,8 @@ static Property nvme_ns_props[] = {
params.max_active_zones, 0),
 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace,
params.max_open_zones, 0),
+DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace,
+   params.zd_extension_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index c07dbcd2a809..4bcc7660736b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1823,6 +1823,25 @@ static uint16_t nvme_offline_zone(NvmeNamespace *ns, 
NvmeZone *zone,
 }
 }
 
+static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
+{
+uint16_t status;
+uint8_t state = nvme_get_zone_state(zone);
+
+if (state == NVME_ZONE_STATE_EMPTY) {
+sta

[PULL 16/56] hw/block/nvme: Support Zoned Namespace Command Set

2021-02-09 Thread Klaus Jensen
From: Dmitry Fomichev 

The emulation code has been changed to advertise NVM Command Set when
"zoned" device property is not set (default) and Zoned Namespace
Command Set otherwise.

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator.
Define trace events where needed in newly introduced code.

In order to improve scalability, all open, closed and full zones
are organized in separate linked lists. Consequently, almost all
zone operations don't require scanning of the entire zone array
(which potentially can be quite large) - it is only necessary to
enumerate one or more zone lists.

Handlers for three new NVMe commands introduced in Zoned Namespace
Command Set specification are added, namely for Zone Management
Receive, Zone Management Send and Zone Append.

Device initialization code has been extended to create a proper
configuration for zoned operation using device properties.

Read/Write command handler is modified to only allow writes at the
write pointer if the namespace is zoned. For Zone Append command,
writes implicitly happen at the write pointer and the starting write
pointer value is returned as the result of the command. Write Zeroes
handler is modified to add zoned checks that are identical to those
done as a part of Write flow.

Subsequent commits in this series add ZDE support and checks for
active and open zone limits.

Signed-off-by: Niklas Cassel 
Signed-off-by: Hans Holmberg 
Signed-off-by: Ajay Joshi 
Signed-off-by: Chaitanya Kulkarni 
Signed-off-by: Matias Bjorling 
Signed-off-by: Aravind Ramesh 
Signed-off-by: Shin'ichiro Kawasaki 
Signed-off-by: Adam Manzanares 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|  52 +++
 hw/block/nvme.h   |   6 +
 hw/block/nvme-ns.c| 166 +
 hw/block/nvme.c   | 807 +-
 hw/block/trace-events |  17 +
 5 files changed, 1040 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdbc98c2ec17..388381dda0df 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -19,9 +19,20 @@
 #define NVME_NS(obj) \
 OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZone {
+NvmeZoneDescr   d;
+uint64_tw_ptr;
+QTAILQ_ENTRY(NvmeZone) entry;
+} NvmeZone;
+
 typedef struct NvmeNamespaceParams {
 uint32_t nsid;
 QemuUUID uuid;
+
+bool zoned;
+bool cross_zone_read;
+uint64_t zone_size_bs;
+uint64_t zone_cap_bs;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -33,6 +44,17 @@ typedef struct NvmeNamespace {
 const uint32_t *iocs;
 uint8_t  csi;
 
+NvmeIdNsZoned   *id_ns_zoned;
+NvmeZone*zone_array;
+QTAILQ_HEAD(, NvmeZone) exp_open_zones;
+QTAILQ_HEAD(, NvmeZone) imp_open_zones;
+QTAILQ_HEAD(, NvmeZone) closed_zones;
+QTAILQ_HEAD(, NvmeZone) full_zones;
+uint32_tnum_zones;
+uint64_tzone_size;
+uint64_tzone_capacity;
+uint32_tzone_size_log2;
+
 NvmeNamespaceParams params;
 
 struct {
@@ -74,8 +96,38 @@ static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t 
lba)
 
 typedef struct NvmeCtrl NvmeCtrl;
 
+static inline enum NvmeZoneState nvme_get_zone_state(NvmeZone *zone)
+{
+return zone->d.zs >> 4;
+}
+
+static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState 
state)
+{
+zone->d.zs = state << 4;
+}
+
+static inline uint64_t nvme_zone_rd_boundary(NvmeNamespace *ns, NvmeZone *zone)
+{
+return zone->d.zslba + ns->zone_size;
+}
+
+static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone)
+{
+return zone->d.zslba + zone->d.zcap;
+}
+
+static inline bool nvme_wp_is_valid(NvmeZone *zone)
+{
+uint8_t st = nvme_get_zone_state(zone);
+
+return st != NVME_ZONE_STATE_FULL &&
+   st != NVME_ZONE_STATE_READ_ONLY &&
+   st != NVME_ZONE_STATE_OFFLINE;
+}
+
 int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
 void nvme_ns_shutdown(NvmeNamespace *ns);
+void nvme_ns_cleanup(NvmeNamespace *ns);
 
 #endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 574333caa3f9..b7fbcca39d9f 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -6,6 +6,9 @@
 
 #define NVME_MAX_NAMESPACES 256
 
+#define NVME_DEFAULT_ZONE_SIZE   (128 * MiB)
+#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
+
 typedef struct NvmeParams {
 char *serial;
 uint32_t num_queues; /* deprecated since 5.1 */
@@ -16,6 +19,7 @@ typedef struct NvmeParams {
 uint32_t aer_max_queued;
 uint8_t  mdts;
 bool use_intel_id;
+uint32_t zasl_bs;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
@@ -149,6 +153,8 @@ typedef struct NvmeCtrl {
 QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
 int aer_queued;
 
+uint8_t zasl;
+
 

[PULL 26/56] hw/block/nvme: add missing string representations for commands

2021-02-09 Thread Klaus Jensen
From: Klaus Jensen 

Add missing string representations for a couple of new commands.

Signed-off-by: Klaus Jensen 
Tested-by: Dmitry Fomichev 
Reviewed-by: Dmitry Fomichev 
---
 hw/block/nvme.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b7fbcca39d9f..65540b650e1d 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -64,8 +64,12 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
 case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
 case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
+case NVME_CMD_COMPARE:  return "NVME_NVM_CMD_COMPARE";
 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
 case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
+case NVME_CMD_ZONE_MGMT_SEND:   return "NVME_ZONED_CMD_MGMT_SEND";
+case NVME_CMD_ZONE_MGMT_RECV:   return "NVME_ZONED_CMD_MGMT_RECV";
+case NVME_CMD_ZONE_APPEND:  return "NVME_ZONED_CMD_ZONE_APPEND";
 default:return "NVME_NVM_CMD_UNKNOWN";
 }
 }
-- 
2.30.0




[PULL 15/56] nvme: Make ZNS-related definitions

2021-02-09 Thread Klaus Jensen
From: Dmitry Fomichev 

Define values and structures that are needed to support Zoned
Namespace Command Set (NVMe TP 4053).

Signed-off-by: Dmitry Fomichev 
Acked-by: Stefan Hajnoczi 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 include/block/nvme.h | 114 ++-
 1 file changed, 113 insertions(+), 1 deletion(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index adb5806365a3..9494246f1f59 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -489,6 +489,9 @@ enum NvmeIoCommands {
 NVME_CMD_COMPARE= 0x05,
 NVME_CMD_WRITE_ZEROES   = 0x08,
 NVME_CMD_DSM= 0x09,
+NVME_CMD_ZONE_MGMT_SEND = 0x79,
+NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+NVME_CMD_ZONE_APPEND= 0x7d,
 };
 
 typedef struct QEMU_PACKED NvmeDeleteQ {
@@ -654,9 +657,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
 uint8_t resv;
 } NvmeAerResult;
 
+typedef struct QEMU_PACKED NvmeZonedResult {
+uint64_t slba;
+} NvmeZonedResult;
+
 typedef struct QEMU_PACKED NvmeCqe {
 uint32_tresult;
-uint32_trsvd;
+uint32_tdw1;
 uint16_tsq_head;
 uint16_tsq_id;
 uint16_tcid;
@@ -685,6 +692,7 @@ enum NvmeStatusCodes {
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
 NVME_CMD_SET_CMB_REJECTED   = 0x002b,
+NVME_INVALID_CMD_SET= 0x002c,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -709,6 +717,14 @@ enum NvmeStatusCodes {
 NVME_CONFLICTING_ATTRS  = 0x0180,
 NVME_INVALID_PROT_INFO  = 0x0181,
 NVME_WRITE_TO_RO= 0x0182,
+NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
+NVME_ZONE_FULL  = 0x01b9,
+NVME_ZONE_READ_ONLY = 0x01ba,
+NVME_ZONE_OFFLINE   = 0x01bb,
+NVME_ZONE_INVALID_WRITE = 0x01bc,
+NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
+NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
 NVME_WRITE_FAULT= 0x0280,
 NVME_UNRECOVERED_READ   = 0x0281,
 NVME_E2E_GUARD_ERROR= 0x0282,
@@ -894,6 +910,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
 uint8_t vs[1024];
 } NvmeIdCtrl;
 
+typedef struct NvmeIdCtrlZoned {
+uint8_t zasl;
+uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
 enum NvmeIdCtrlOacs {
 NVME_OACS_SECURITY  = 1 << 0,
 NVME_OACS_FORMAT= 1 << 1,
@@ -1022,6 +1043,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
 uint8_t rp;
 } NvmeLBAF;
 
+typedef struct QEMU_PACKED NvmeLBAFE {
+uint64_tzsze;
+uint8_t zdes;
+uint8_t rsvd9[7];
+} NvmeLBAFE;
+
#define NVME_NSID_BROADCAST 0xffffffff
 
 typedef struct QEMU_PACKED NvmeIdNs {
@@ -1081,10 +1108,24 @@ enum NvmeNsIdentifierType {
 
 enum NvmeCsi {
 NVME_CSI_NVM= 0x00,
+NVME_CSI_ZONED  = 0x02,
 };
 
 #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
 
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+uint16_tzoc;
+uint16_tozcs;
+uint32_tmar;
+uint32_tmor;
+uint32_trrl;
+uint32_tfrl;
+uint8_t rsvd20[2796];
+NvmeLBAFE   lbafe[16];
+uint8_t rsvd3072[768];
+uint8_t vs[256];
+} NvmeIdNsZoned;
+
 /*Deallocate Logical Block Features*/
 #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
 #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
@@ -1117,10 +1158,76 @@ enum NvmeIdNsDps {
 DPS_FIRST_EIGHT = 8,
 };
 
+enum NvmeZoneAttr {
+NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
+NVME_ZA_RESET_RECOMMENDED= 1 << 2,
+NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+uint64_tnr_zones;
+uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+NVME_ZONE_REPORT = 0,
+NVME_ZONE_REPORT_EXTENDED= 1,
+};
+
+enum NvmeZoneReportType {
+NVME_ZONE_REPORT_ALL = 0,
+NVME_ZONE_REPORT_EMPTY   = 1,
+NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+NVME_ZONE_REPORT_CLOSED  = 4,
+NVME_ZONE_REPORT_FULL= 5,
+NVME_ZONE_REPORT_READ_ONLY   = 6,
+NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+NVME_ZONE_TYPE_RESERVED  = 0x00,
+NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+NVME_ZONE_ACTION_RSD = 0x00,
+NVME_ZONE_ACTION_CLOSE   = 0x01,
+NVME_ZONE_ACTION_FINISH  = 0x02,
+NVME_ZONE_ACTION_OPEN= 0x03,
+NVME_ZONE_ACTION_RESET   = 0x04,
+NVME_ZONE_ACTION_OFFLINE = 0x05,
+NVME_ZONE_ACTION_SET_ZD_EXT  = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+uint8_t  

[PULL 13/56] hw/block/nvme: Add support for Namespace Types

2021-02-09 Thread Klaus Jensen
From: Niklas Cassel 

Define the structures and constants required to implement
Namespace Types support.

Namespace Types introduce a new command set, "I/O Command Sets",
that allows the host to retrieve the command sets associated with
a namespace. Introduce support for the command set and enable
detection for the NVM Command Set.

The new workflows for identify commands rely heavily on zero-filled
identify structs. E.g., certain CNS commands are defined to return
a zero-filled identify struct when an inactive namespace NSID
is supplied.

Add a helper function in order to avoid code duplication when
reporting zero-filled identify structures.

Signed-off-by: Niklas Cassel 
Signed-off-by: Dmitry Fomichev 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|   1 +
 include/block/nvme.h  |  64 ++
 hw/block/nvme-ns.c|   2 +
 hw/block/nvme.c   | 188 +++---
 hw/block/trace-events |   6 ++
 5 files changed, 217 insertions(+), 44 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index bdeaf1c0de84..bdbc98c2ec17 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,7 @@ typedef struct NvmeNamespace {
 int64_t  size;
 NvmeIdNs id_ns;
 const uint32_t *iocs;
+uint8_t  csi;
 
 NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 397f7ca3b5cb..19347cf69e52 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -84,6 +84,7 @@ enum NvmeCapMask {
 
 enum NvmeCapCss {
 NVME_CAP_CSS_NVM= 1 << 0,
+NVME_CAP_CSS_CSI_SUPP   = 1 << 6,
 NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
 };
 
@@ -117,9 +118,25 @@ enum NvmeCcMask {
 
 enum NvmeCcCss {
 NVME_CC_CSS_NVM= 0x0,
+NVME_CC_CSS_CSI= 0x6,
 NVME_CC_CSS_ADMIN_ONLY = 0x7,
 };
 
+#define NVME_SET_CC_EN(cc, val) \
+(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val)\
+(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val)\
+(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val)\
+(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val)\
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
 enum NvmeCstsShift {
 CSTS_RDY_SHIFT  = 0,
 CSTS_CFS_SHIFT  = 1,
@@ -540,8 +557,13 @@ typedef struct QEMU_PACKED NvmeIdentify {
 uint64_trsvd2[2];
 uint64_tprp1;
 uint64_tprp2;
-uint32_tcns;
-uint32_trsvd11[5];
+uint8_t cns;
+uint8_t rsvd10;
+uint16_tctrlid;
+uint16_tnvmsetid;
+uint8_t rsvd11;
+uint8_t csi;
+uint32_trsvd12[4];
 } NvmeIdentify;
 
 typedef struct QEMU_PACKED NvmeRwCmd {
@@ -662,6 +684,7 @@ enum NvmeStatusCodes {
 NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
 NVME_INVALID_USE_OF_CMB = 0x0012,
 NVME_INVALID_PRP_OFFSET = 0x0013,
+NVME_CMD_SET_CMB_REJECTED   = 0x002b,
 NVME_LBA_RANGE  = 0x0080,
 NVME_CAP_EXCEEDED   = 0x0081,
 NVME_NS_NOT_READY   = 0x0082,
@@ -789,11 +812,15 @@ typedef struct QEMU_PACKED NvmePSD {
 
 #define NVME_IDENTIFY_DATA_SIZE 4096
 
-enum {
-NVME_ID_CNS_NS = 0x0,
-NVME_ID_CNS_CTRL   = 0x1,
-NVME_ID_CNS_NS_ACTIVE_LIST = 0x2,
-NVME_ID_CNS_NS_DESCR_LIST  = 0x3,
+enum NvmeIdCns {
+NVME_ID_CNS_NS= 0x00,
+NVME_ID_CNS_CTRL  = 0x01,
+NVME_ID_CNS_NS_ACTIVE_LIST= 0x02,
+NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+NVME_ID_CNS_CS_NS = 0x05,
+NVME_ID_CNS_CS_CTRL   = 0x06,
+NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+NVME_ID_CNS_IO_COMMAND_SET= 0x1c,
 };
 
 typedef struct QEMU_PACKED NvmeIdCtrl {
@@ -944,6 +971,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
 };
@@ -1033,18 +1061,26 @@ typedef struct QEMU_PACKED NvmeIdNsDescr {
 uint8_t rsvd2[2];
 } NvmeIdNsDescr;
 
-enum {
-NVME_NIDT_EUI64_LEN =  8,
-NVME_NIDT_NGUID_LEN = 16,
-NVME_NIDT_UUID_LEN  = 16,
+enum NvmeNsIdentifierLength {
+NVME_NIDL_EUI64 = 8,
+NVME_NIDL_NGUID = 16,
+NVME_NIDL_UUID  = 16,
+NVME_NIDL_CSI   = 1,
 };
 
 enum NvmeNsIdentifierType {
-NVME_NIDT_EUI64 = 0x1,
-NVME_NIDT_NGUID = 0x2,
-NVME_NIDT_UUID  = 0x3,
+NVME_NIDT_EUI64 = 0x01,
+NVME_NIDT_NGUID   

[PULL 12/56] hw/block/nvme: Add Commands Supported and Effects log

2021-02-09 Thread Klaus Jensen
From: Dmitry Fomichev 

This log page becomes necessary to implement to allow checking for
Zone Append command support in Zoned Namespace Command Set.

This commit adds the code to report this log page for NVM Command
Set only. The parts that are specific to zoned operation will be
added later in the series.

All incoming admin and i/o commands are now only processed if their
corresponding support bits are set in this log. This provides an
easy way to control what commands to support and what not to
depending on set CC.CSS.

Signed-off-by: Dmitry Fomichev 
Reviewed-by: Niklas Cassel 
Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|   1 +
 include/block/nvme.h  |  19 
 hw/block/nvme.c   | 102 ++
 hw/block/trace-events |   1 +
 4 files changed, 114 insertions(+), 9 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index aeca810fc7a8..bdeaf1c0de84 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -30,6 +30,7 @@ typedef struct NvmeNamespace {
 int32_t  bootindex;
 int64_t  size;
 NvmeIdNs id_ns;
+const uint32_t *iocs;
 
 NvmeNamespaceParams params;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 11ac1c2b7dfb..397f7ca3b5cb 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -752,10 +752,27 @@ enum NvmeSmartWarn {
 NVME_SMART_FAILED_VOLATILE_MEDIA  = 1 << 4,
 };
 
+typedef struct NvmeEffectsLog {
+uint32_tacs[256];
+uint32_tiocs[256];
+uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+NVME_CMD_EFF_CSUPP  = 1 << 0,
+NVME_CMD_EFF_LBCC   = 1 << 1,
+NVME_CMD_EFF_NCC= 1 << 2,
+NVME_CMD_EFF_NIC= 1 << 3,
+NVME_CMD_EFF_CCC= 1 << 4,
+NVME_CMD_EFF_CSE_MASK   = 3 << 16,
+NVME_CMD_EFF_UUID_SEL   = 1 << 19,
+};
+
 enum NvmeLogIdentifier {
 NVME_LOG_ERROR_INFO = 0x01,
 NVME_LOG_SMART_INFO = 0x02,
 NVME_LOG_FW_SLOT_INFO   = 0x03,
+NVME_LOG_CMD_EFFECTS= 0x05,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -868,6 +885,7 @@ enum NvmeIdCtrlFrmw {
 
 enum NvmeIdCtrlLpa {
 NVME_LPA_NS_SMART = 1 << 0,
+NVME_LPA_CSE  = 1 << 1,
 NVME_LPA_EXTENDED = 1 << 2,
 };
 
@@ -1076,6 +1094,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4d1ca8c466c5..05e799623c41 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -112,6 +112,30 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 };
 
+static const uint32_t nvme_cse_acs[256] = {
+[NVME_ADM_CMD_DELETE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_SQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_DELETE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_CREATE_CQ]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ABORT]= NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
+[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
+};
+
+static const uint32_t nvme_cse_iocs_none[256];
+
+static const uint32_t nvme_cse_iocs_nvm[256] = {
+[NVME_CMD_FLUSH]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_WRITE]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
+[NVME_CMD_DSM]  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
+[NVME_CMD_COMPARE]  = NVME_CMD_EFF_CSUPP,
+};
+
 static void nvme_process_sq(void *opaque);
 
 static uint16_t nvme_cid(NvmeRequest *req)
@@ -1306,10 +1330,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
   req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
-if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_ADMIN_ONLY) {
-return NVME_INVALID_OPCODE | NVME_DNR;
-}
-
 if (!nvme_nsid_valid(n, nsid)) {
 return NVME_INVALID_NSID | NVME_DNR;
 }
@@ -1319,6 +1339,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
+if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
+trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
+return NVME_INVALID_OPCODE