[PATCH 18/18] hw/block/nvme: Document zoned parameters in usage text
Added brief descriptions of the new driver properties now available to users to configure features of Zoned Namespace Command Set in the driver. This patch is for documentation only, no functionality change. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 62 +++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 1ecbc31272..b765475bb0 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -9,7 +9,7 @@ */ /** - * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e + * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e * * http://www.nvmexpress.org/resources/ */ @@ -20,7 +20,8 @@ * -device nvme,drive=,serial=,id=, \ * cmb_size_mb=, \ * [pmrdev=,] \ - * max_ioqpairs= + * max_ioqpairs= \ + * zoned= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. @@ -32,6 +33,63 @@ * For example: * -object memory-backend-file,id=,share=on,mem-path=, \ * size= -device nvme,...,pmrdev= + * + * Setting "zoned" to true makes the driver emulate zoned namespaces. + * In this case, the following options are available to configure zoned + * operation: + * zone_size= + * + * zone_capacity= + * + * zone_file= + * Zone metadata file, if specified, allows zone information + * to be persistent across shutdowns and restarts. + * + * zone_descr_ext_size= + * This value needs to be specified in 64B units. If it is zero, + * namespace(s) will not support zone descriptor extensions. + * + * max_active= + * + * max_open= + * + * reset_rcmnd_delay= + * The amount of time that passes between the moment when a zone + * enters Full state and when Reset Zone Recommended attribute + * is set for that zone. + * + * reset_rcmnd_limit= + * If this value is zero (default), RZR attribute is not set for + * any zones. 
+ * + * finish_rcmnd_delay= + * The amount of time that passes between the moment when a zone + * enters an Open or Closed state and when Finish Zone Recommended + * attribute is set for that zone. + * + * finish_rcmnd_limit= + * If this value is zero (default), FZR attribute is not set for + * any zones. + * + * zamds= + * The maximum I/O size that can be supported by Zone Append + * command. Since internally this value is maintained as + * ZAMDS = log2( / ), some + * values assigned to this property may be rounded down and + * result in a lower maximum ZA data size being in effect. + * + * zone_async_events= + * Enable sending Zone Descriptor Changed AENs to the host. + * + * offline_zones= + * + * rdonly_zones= + * + * cross_zone_read= + * + * fill_pattern= + * Byte pattern to return for any portions of unwritten data + * during read. */ #include "qemu/osdep.h" -- 2.21.0
[PATCH 12/18] hw/block/nvme: Simulate Zone Active excursions
Added a Boolean flag to turn on simulation of Zone Active Excursions. If the flag, "active_excursions", is set to true, the driver will try to finish one of the currently open zone if max active zones limit is going to get exceeded. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 24 +++- hw/block/nvme.h | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 3e51837a63..d9110b39d3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -539,6 +539,26 @@ static void nvme_auto_transition_zone(NvmeCtrl *n, NvmeNamespace *ns, { NvmeZone *zone; +if (n->params.active_excursions && adding_active && +n->params.max_active_zones && +ns->nr_active_zones == n->params.max_active_zones) { +zone = nvme_peek_zone_head(ns, ns->closed_zones); +if (zone) { +/* + * The namespace is at the limit of active zones. + * Try to finish one of the currently active zones + * to make the needed active zone resource available. + */ +nvme_aor_dec_active(n, ns); +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_FULL); +zone->d.za &= ~(NVME_ZA_FINISH_RECOMMENDED | +NVME_ZA_RESET_RECOMMENDED); +zone->d.za |= NVME_ZA_FINISHED_BY_CTLR; +zone->tstamp = 0; +trace_pci_nvme_zone_finished_by_controller(zone->d.zslba); +} +} + if (implicit && n->params.max_open_zones && ns->nr_open_zones == n->params.max_open_zones) { zone = nvme_remove_zone_head(n, ns, ns->imp_open_zones); @@ -2630,7 +2650,7 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index, /* MAR/MOR are zeroes-based, 0x means no limit */ ns->id_ns_zoned->mar = cpu_to_le32(n->params.max_active_zones - 1); ns->id_ns_zoned->mor = cpu_to_le32(n->params.max_open_zones - 1); -ns->id_ns_zoned->zoc = 0; +ns->id_ns_zoned->zoc = cpu_to_le16(n->params.active_excursions ? 0x2 : 0); ns->id_ns_zoned->ozcs = n->params.cross_zone_read ? 
0x01 : 0x00; ns->id_ns_zoned->lbafe[lba_index].zsze = cpu_to_le64(n->params.zone_size); @@ -2975,6 +2995,8 @@ static Property nvme_props[] = { DEFINE_PROP_INT32("max_active", NvmeCtrl, params.max_active_zones, 0), DEFINE_PROP_INT32("max_open", NvmeCtrl, params.max_open_zones, 0), DEFINE_PROP_BOOL("cross_zone_read", NvmeCtrl, params.cross_zone_read, true), +DEFINE_PROP_BOOL("active_excursions", NvmeCtrl, params.active_excursions, + false), DEFINE_PROP_UINT8("fill_pattern", NvmeCtrl, params.fill_pattern, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 75bbad1509..cad154a21f 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -14,6 +14,7 @@ typedef struct NvmeParams { boolzoned; boolcross_zone_read; +boolactive_excursions; uint8_t fill_pattern; uint32_tzamds_bs; uint64_tzone_size; -- 2.21.0
[PATCH 11/18] hw/block/nvme: Introduce max active and open zone limits
Added two module properties, "max_active" and "max_open" to control the maximum number of zones that can be active or open. Once these variables are set to non-default values, the driver checks these limits during I/O and returns Too Many Active or Too Many Open command status if they are exceeded. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 183 +++- hw/block/nvme.h | 4 ++ 2 files changed, 185 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7bf452d597..3e51837a63 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -119,6 +119,87 @@ static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, zone->prev = zone->next = 0; } +/* + * Take the first zone out from a list, return NULL if the list is empty. + */ +static NvmeZone *nvme_remove_zone_head(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZoneList *zl) +{ +NvmeZone *zone = nvme_peek_zone_head(ns, zl); + +if (zone) { +--zl->size; +if (zl->size == 0) { +zl->head = NVME_ZONE_LIST_NIL; +zl->tail = NVME_ZONE_LIST_NIL; +} else { +zl->head = zone->next; +ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; +} +zone->prev = zone->next = 0; +} + +return zone; +} + +/* + * Check if we can open a zone without exceeding open/active limits. + * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). 
+ */ +static int nvme_aor_check(NvmeCtrl *n, NvmeNamespace *ns, + uint32_t act, uint32_t opn) +{ +if (n->params.max_active_zones != 0 && +ns->nr_active_zones + act > n->params.max_active_zones) { +trace_pci_nvme_err_insuff_active_res(n->params.max_active_zones); +return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; +} +if (n->params.max_open_zones != 0 && +ns->nr_open_zones + opn > n->params.max_open_zones) { +trace_pci_nvme_err_insuff_open_res(n->params.max_open_zones); +return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; +} + +return NVME_SUCCESS; +} + +static inline void nvme_aor_inc_open(NvmeCtrl *n, NvmeNamespace *ns) +{ +assert(ns->nr_open_zones >= 0); +if (n->params.max_open_zones) { +ns->nr_open_zones++; +assert(ns->nr_open_zones <= n->params.max_open_zones); +} +} + +static inline void nvme_aor_dec_open(NvmeCtrl *n, NvmeNamespace *ns) +{ +if (n->params.max_open_zones) { +assert(ns->nr_open_zones > 0); +ns->nr_open_zones--; +} +assert(ns->nr_open_zones >= 0); +} + +static inline void nvme_aor_inc_active(NvmeCtrl *n, NvmeNamespace *ns) +{ +assert(ns->nr_active_zones >= 0); +if (n->params.max_active_zones) { +ns->nr_active_zones++; +assert(ns->nr_active_zones <= n->params.max_active_zones); +} +} + +static inline void nvme_aor_dec_active(NvmeCtrl *n, NvmeNamespace *ns) +{ +if (n->params.max_active_zones) { +assert(ns->nr_active_zones > 0); +ns->nr_active_zones--; +assert(ns->nr_active_zones >= ns->nr_open_zones); +} +assert(ns->nr_active_zones >= 0); +} + static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint8_t state) { @@ -453,6 +534,24 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); } +static void nvme_auto_transition_zone(NvmeCtrl *n, NvmeNamespace *ns, +bool implicit, bool adding_active) +{ +NvmeZone *zone; + +if (implicit && n->params.max_open_zones && +ns->nr_open_zones == n->params.max_open_zones) { +zone = nvme_remove_zone_head(n, ns, 
ns->imp_open_zones); +if (zone) { +/* + * Automatically close this implicitly open zone. + */ +nvme_aor_dec_open(n, ns); +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_CLOSED); +} +} +} + static uint16_t nvme_check_zone_write(NvmeZone *zone, uint64_t slba, uint32_t nlb) { @@ -530,6 +629,23 @@ static uint16_t nvme_check_zone_read(NvmeCtrl *n, NvmeZone *zone, uint64_t slba, return status; } +static uint16_t nvme_auto_open_zone(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone) +{ +uint16_t status = NVME_SUCCESS; +uint8_t zs = nvme_get_zone_state(zone); + +if (zs == NVME_ZONE_STATE_EMPTY) { +nvme_auto_transition_zone(n, ns, true, true); +status = nvme_aor_check(n, ns, 1, 1); +} else if (zs == NVME_ZONE_STATE_CLOSED) { +nvme_auto_transition_zone(n, ns, true, false); +status = nvme_aor_check(n, ns, 0, 1); +} + +return status; +} + static uint64_t nvme_finalize_zone_write(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint32_t nlb) { @@ -542,7 +658,11 @@ static uint64_t nvme_finalize_zone_write(NvmeCtrl *n, NvmeNamespace *ns, switch (zs) { case NVM
[PATCH 15/18] hw/block/nvme: Support Zone Descriptor Extensions
Zone Descriptor Extension is a label that can be assigned to a zone. It can be set to an Empty zone and it stays assigned until the zone is reset. This commit adds a new optional property, "zone_descr_ext_size", to the driver. Its value must be a multiple of 64 bytes. If this value is non-zero, it becomes possible to assign extensions of that size to any Empty zones. The default value for this property is 0, therefore setting extensions is disabled by default. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 76 ++--- hw/block/nvme.h | 8 ++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8dfb568173..16780d5116 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1359,6 +1359,26 @@ static bool nvme_cond_offline_all(uint8_t state) return state == NVME_ZONE_STATE_READ_ONLY; } +static uint16_t nvme_set_zd_ext(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, uint8_t state) +{ +uint16_t status; + +if (state == NVME_ZONE_STATE_EMPTY) { +nvme_auto_transition_zone(n, ns, false, true); +status = nvme_aor_check(n, ns, 1, 0); +if (status != NVME_SUCCESS) { +return status; +} +nvme_aor_inc_active(n, ns); +zone->d.za |= NVME_ZA_ZD_EXT_VALID; +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_CLOSED); +return NVME_SUCCESS; +} + +return NVME_ZONE_INVAL_TRANSITION; +} + static uint16_t name_do_zone_op(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint8_t state, bool all, uint16_t (*op_hndlr)(NvmeCtrl *, NvmeNamespace *, NvmeZone *, @@ -1387,13 +1407,16 @@ static uint16_t name_do_zone_op(NvmeCtrl *n, NvmeNamespace *ns, static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req) { +NvmeRwCmd *rw; uint32_t dw13 = le32_to_cpu(cmd->cdw13); +uint64_t prp1, prp2; uint64_t slba = 0; uint64_t zone_idx = 0; uint16_t status; uint8_t action, state; bool all; NvmeZone *zone; +uint8_t *zd_ext; action = dw13 & 0xff; all = dw13 & 0x100; @@ -1448,7 +1471,25 @@ static 
uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeNamespace *ns, case NVME_ZONE_ACTION_SET_ZD_EXT: trace_pci_nvme_set_descriptor_extension(slba, zone_idx); -return NVME_INVALID_FIELD | NVME_DNR; +if (all || !n->params.zd_extension_size) { +return NVME_INVALID_FIELD | NVME_DNR; +} +zd_ext = nvme_get_zd_extension(n, ns, zone_idx); +rw = (NvmeRwCmd *)cmd; +prp1 = le64_to_cpu(rw->prp1); +prp2 = le64_to_cpu(rw->prp2); +status = nvme_dma_write_prp(n, zd_ext, n->params.zd_extension_size, +prp1, prp2); +if (status) { +trace_pci_nvme_err_zd_extension_map_error(zone_idx); +return status; +} + +status = nvme_set_zd_ext(n, ns, zone, state); +if (status == NVME_SUCCESS) { +trace_pci_nvme_zd_extension_set(zone_idx); +return status; +} break; default: @@ -1527,7 +1568,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeNamespace *ns, return NVME_INVALID_FIELD | NVME_DNR; } -if (zra == NVME_ZONE_REPORT_EXTENDED) { +if (zra == NVME_ZONE_REPORT_EXTENDED && !n->params.zd_extension_size) { return NVME_INVALID_FIELD | NVME_DNR; } @@ -1539,6 +1580,9 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeNamespace *ns, partial = (dw13 >> 16) & 0x01; zone_entry_sz = sizeof(NvmeZoneDescr); +if (zra == NVME_ZONE_REPORT_EXTENDED) { +zone_entry_sz += n->params.zd_extension_size; +} max_zones = (len - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; buf = g_malloc0(len); @@ -1570,6 +1614,14 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeNamespace *ns, z->wp = cpu_to_le64(~0ULL); } +if (zra == NVME_ZONE_REPORT_EXTENDED) { +if (zs->d.za & NVME_ZA_ZD_EXT_VALID) { +memcpy(buf_p, nvme_get_zd_extension(n, ns, zone_index), + n->params.zd_extension_size); +} +buf_p += n->params.zd_extension_size; +} + zone_index++; } @@ -2336,7 +2388,7 @@ static uint16_t nvme_handle_changed_zone_log(NvmeCtrl *n, NvmeCmd *cmd, continue; } num_aen_zones++; -if (zone->d.za) { +if (zone->d.za & ~NVME_ZA_ZD_EXT_VALID) { trace_pci_nvme_reporting_changed_zone(zone->d.zslba, zone->d.za); *zid_ptr++ = 
cpu_to_le64(zone->d.zslba); nids++; @@ -2935,6 +2987,7 @@ static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, ns->imp_open_zones = g_malloc0(sizeof(NvmeZoneList)); ns->closed_zones = g_malloc0(sizeof(NvmeZoneList)); n
[PATCH 16/18] hw/block/nvme: Add injection of Offline/Read-Only zones
ZNS specification defines two zone conditions for the zones that no longer can function properly, possibly because of flash wear or other internal fault. It is useful to be able to "inject" a small number of such zones for testing purposes. This commit defines two optional driver properties, "offline_zones" and "rdonly_zones". User can assign non-zero values to these variables to specify the number of zones to be initialized as Offline or Read-Only. The actual number of injected zones may be smaller than the requested amount - Read-Only and Offline counts are expected to be much smaller than the total number of drive zones. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 46 ++ hw/block/nvme.h | 2 ++ 2 files changed, 48 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 16780d5116..930f5f9d41 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2979,8 +2979,11 @@ static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, uint64_t capacity) { NvmeZone *zone; +Error *err; uint64_t start = 0, zone_size = n->params.zone_size; +uint32_t rnd; int i; +uint16_t zs; ns->zone_array = g_malloc0(n->zone_array_size); ns->exp_open_zones = g_malloc0(sizeof(NvmeZoneList)); @@ -3010,6 +3013,37 @@ static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, start += zone_size; } +/* If required, make some zones Offline or Read Only */ + +for (i = 0; i < n->params.nr_offline_zones; i++) { +do { +qcrypto_random_bytes(&rnd, sizeof(rnd), &err); +rnd %= n->num_zones; +} while (rnd < n->params.max_open_zones); +zone = &ns->zone_array[rnd]; +zs = nvme_get_zone_state(zone); +if (zs != NVME_ZONE_STATE_OFFLINE) { +nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE); +} else { +i--; +} +} + +for (i = 0; i < n->params.nr_rdonly_zones; i++) { +do { +qcrypto_random_bytes(&rnd, sizeof(rnd), &err); +rnd %= n->num_zones; +} while (rnd < n->params.max_open_zones); +zone = &ns->zone_array[rnd]; +zs = nvme_get_zone_state(zone); +if (zs != NVME_ZONE_STATE_OFFLINE && +zs != 
NVME_ZONE_STATE_READ_ONLY) { +nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY); +} else { +i--; +} +} + return 0; } @@ -3062,6 +3096,16 @@ static void nvme_zoned_init_ctrl(NvmeCtrl *n, Error **errp) if (n->params.max_active_zones > nz) { n->params.max_active_zones = nz; } +if (n->params.max_open_zones < nz) { +if (n->params.nr_offline_zones > nz - n->params.max_open_zones) { +n->params.nr_offline_zones = nz - n->params.max_open_zones; +} +if (n->params.nr_rdonly_zones > +nz - n->params.max_open_zones - n->params.nr_offline_zones) { +n->params.nr_rdonly_zones = +nz - n->params.max_open_zones - n->params.nr_offline_zones; +} +} if (n->params.zd_extension_size) { if (n->params.zd_extension_size & 0x3f) { error_setg(errp, @@ -3453,6 +3497,8 @@ static Property nvme_props[] = { DEFINE_PROP_UINT64("finish_rcmnd_delay", NvmeCtrl, params.fzr_delay_usec, 0), DEFINE_PROP_UINT64("finish_rcmnd_limit", NvmeCtrl, params.frl_usec, 0), +DEFINE_PROP_UINT32("offline_zones", NvmeCtrl, params.nr_offline_zones, 0), +DEFINE_PROP_UINT32("rdonly_zones", NvmeCtrl, params.nr_rdonly_zones, 0), DEFINE_PROP_BOOL("zone_async_events", NvmeCtrl, params.zone_async_events, true), DEFINE_PROP_BOOL("cross_zone_read", NvmeCtrl, params.cross_zone_read, true), diff --git a/hw/block/nvme.h b/hw/block/nvme.h index a9ac01434b..1726cc9d6e 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -23,6 +23,8 @@ typedef struct NvmeParams { uint64_tzone_capacity; int32_t max_active_zones; int32_t max_open_zones; +uint32_tnr_offline_zones; +uint32_tnr_rdonly_zones; uint32_tzd_extension_size; uint64_trzr_delay_usec; uint64_trrl_usec; -- 2.21.0
[PATCH 09/18] hw/block/nvme: Define Zoned NS Command Set trace events
The Zoned Namespace Command Set / Namespace Types implementation that is being introduced in this series adds a good number of trace events. Combine all tracepoint definitions into a separate patch to make reviewing more convenient. Signed-off-by: Dmitry Fomichev --- hw/block/trace-events | 41 + 1 file changed, 41 insertions(+) diff --git a/hw/block/trace-events b/hw/block/trace-events index 3f3323fe38..984db8a20c 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -66,6 +66,31 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects log read" pci_nvme_css_nvm_cset_selected_by_host(uint32_t cc) "NVM command set selected by host, bar.cc=0x%"PRIx32"" pci_nvme_css_all_csets_sel_by_host(uint32_t cc) "all supported command sets selected by host, bar.cc=0x%"PRIx32"" +pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32"" +pci_nvme_zone_reset_recommended(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_reset_internal_op(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_finish_recommended(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_finish_internal_op(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_finished_by_controller(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zd_extension_set(uint32_t zone_idx) 
"set descriptor extension for zone_idx=%"PRIu32"" +pci_nvme_power_on_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state" +pci_nvme_power_on_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" +pci_nvme_power_on_full(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Full state" +pci_nvme_zone_ae_not_enabled(int info, int log_page, int nsid) "zone async event not enabled, info=0x%"PRIx32", lp=0x%"PRIx32", nsid=%"PRIu32"" +pci_nvme_zone_ae_not_cleared(int info, int log_page, int nsid) "zoned async event not cleared, info=0x%"PRIx32", lp=0x%"PRIx32", nsid=%"PRIu32"" +pci_nvme_zone_aen_not_requested(uint32_t oaes) "zone descriptor AEN are not requested by host, oaes=0x%"PRIx32"" +pci_nvme_getfeat_aen_cfg(uint64_t res) "reporting async event config res=%"PRIu64"" +pci_nvme_setfeat_zone_info_aer_on(void) "zone info change notices enabled" +pci_nvme_setfeat_zone_info_aer_off(void) "zone info change notices disabled" +pci_nvme_changed_zone_log_read(uint16_t nsid) "changed zone list log of ns %"PRIu16"" +pci_nvme_reporting_changed_zone(uint64_t zslba, uint8_t za) "zslba=%"PRIu64", attr=0x%"PRIx8"" +pci_nvme_empty_changed_zone_list(void) "no changes zones to report" +pci_nvme_mapped_zone_file(char *zfile_name, int ret) "mapped zone file %s, error %d" # nvme traces for error conditions pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" @@ -77,10 +102,25 @@ pci_nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not w pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_capacity_exceeded(uint64_t zone_id, uint64_t nr_zones) "zone capacity exceeded, 
zone_id=%"PRIu64", nr_zones=%"PRIu64"" +pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64"" +pci_nvme_err_invalid_zone_state_transition(uint8_t state, uint8_t action, uint64_t slba, uint8_t attrs) "0x%"PRIx32"->0x%"PRIx32", slba=%"PRIu64", attrs=0x%"PRIx32"" +pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" +pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64"" +pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t za
[PATCH 08/18] hw/block/nvme: Make Zoned NS Command Set definitions
Define values and structures that are needed to support Zoned Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator. All new protocol definitions are located in include/block/nvme.h and everything added that is specific to this implementation is kept in hw/block/nvme.h. In order to improve scalability, all open, closed and full zones are organized in separate linked lists. Consequently, almost all zone operations don't require scanning of the entire zone array (which potentially can be quite large) - it is only necessary to enumerate one or more zone lists. Zone lists are designed to be position-independent as they can be persisted to the backing file as a part of zone metadata. NvmeZoneList struct defined in this patch serves as a head of every zone list. NvmeZone structure encapsulates NvmeZoneDescriptor defined in Zoned Command Set specification and adds a few more fields that are internal to this implementation. Signed-off-by: Niklas Cassel Signed-off-by: Hans Holmberg Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Alexey Bogoslavsky Signed-off-by: Dmitry Fomichev --- hw/block/nvme.h | 130 +++ include/block/nvme.h | 119 ++- 2 files changed, 248 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index ef1197b4cd..1c7deaf42d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -3,11 +3,21 @@ #include "block/nvme.h" +#define NVME_DEFAULT_ZONE_SIZE 128 /* MiB */ +#define NVME_DEFAULT_MAX_ZA_SIZE 128 /* KiB */ + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ uint32_t max_ioqpairs; uint32_t cmb_size_mb; + +boolzoned; +boolcross_zone_read; +uint8_t fill_pattern; +uint32_tzamds_bs; +uint64_tzone_size; +uint64_tzone_capacity; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -16,6 +26,8 @@ typedef struct NvmeAsyncEvent { enum NvmeRequestFlags { NVME_REQ_FLG_HAS_SG = 1 << 0, +NVME_REQ_FLG_FILL = 1 << 1, +NVME_REQ_FLG_APPEND = 1 
<< 2, }; typedef struct NvmeRequest { @@ -23,6 +35,7 @@ typedef struct NvmeRequest { BlockAIOCB *aiocb; uint16_tstatus; uint16_tflags; +uint64_tfill_ofs; NvmeCqe cqe; BlockAcctCookie acct; QEMUSGList qsg; @@ -60,11 +73,35 @@ typedef struct NvmeCQueue { QTAILQ_HEAD(, NvmeRequest) req_list; } NvmeCQueue; +typedef struct NvmeZone { +NvmeZoneDescr d; +uint64_ttstamp; +uint32_tnext; +uint32_tprev; +uint8_t rsvd80[8]; +} NvmeZone; + +#define NVME_ZONE_LIST_NILUINT_MAX + +typedef struct NvmeZoneList { +uint32_thead; +uint32_ttail; +uint32_tsize; +uint8_t rsvd12[4]; +} NvmeZoneList; + typedef struct NvmeNamespace { NvmeIdNsid_ns; uint32_tnsid; uint8_t csi; QemuUUIDuuid; + +NvmeIdNsZoned *id_ns_zoned; +NvmeZone*zone_array; +NvmeZoneList*exp_open_zones; +NvmeZoneList*imp_open_zones; +NvmeZoneList*closed_zones; +NvmeZoneList*full_zones; } NvmeNamespace; static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) @@ -99,6 +136,7 @@ typedef struct NvmeCtrl { uint32_tnum_namespaces; uint32_tmax_q_ents; uint64_tns_size; + uint8_t *cmbuf; uint32_tirq_status; uint64_thost_timestamp; /* Timestamp sent by the host */ @@ -106,6 +144,12 @@ typedef struct NvmeCtrl { HostMemoryBackend *pmrdev; +int zone_file_fd; +uint32_tnum_zones; +uint64_tzone_size_bs; +uint64_tzone_array_size; +uint8_t zamds; + NvmeNamespace *namespaces; NvmeSQueue **sq; NvmeCQueue **cq; @@ -120,6 +164,86 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns) return n->ns_size >> nvme_ns_lbads(ns); } +static inline uint8_t nvme_get_zone_state(NvmeZone *zone) +{ +return zone->d.zs >> 4; +} + +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state) +{ +zone->d.zs = state << 4; +} + +static inline uint64_t nvme_zone_rd_boundary(NvmeCtrl *n, NvmeZone *zone) +{ +return zone->d.zslba + n->params.zone_size; +} + +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone) +{ +return zone->d.zslba + zone->d.zcap; +} + +static inline bool nvme_wp_is_valid(NvmeZone *zone) +{ +uint8_t st = 
nvme_get_zone_state(zone); + +return st != NVME_ZONE_STATE_FULL && + st != NVME_ZONE_STATE_READ_ONLY && + st != NVME_ZONE_STATE_OFFLINE; +} + +/* + * Initialize a zone list head. + */ +static inline void nvme
[PATCH 01/18] hw/block/nvme: Move NvmeRequest has_sg field to a bit flag
In addition to the existing has_sg flag, a few more Boolean NvmeRequest flags are going to be introduced in subsequent patches. Convert "has_sg" variable to "flags" and define NvmeRequestFlags enum for individual flag values. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 8 +++- hw/block/nvme.h | 6 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 567bce7519..6e8e4ccdb1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -349,7 +349,7 @@ static void nvme_rw_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(n->conf.blk), &req->acct); req->status = NVME_INTERNAL_DEV_ERROR; } -if (req->has_sg) { +if (req->flags & NVME_REQ_FLG_HAS_SG) { qemu_sglist_destroy(&req->qsg); } nvme_enqueue_req_completion(cq, req); @@ -358,7 +358,6 @@ static void nvme_rw_cb(void *opaque, int ret) static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req) { -req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_FLUSH); req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req); @@ -382,7 +381,6 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_LBA_RANGE | NVME_DNR; } -req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count, @@ -421,14 +419,13 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct); if (req->qsg.nsg > 0) { -req->has_sg = true; +req->flags |= NVME_REQ_FLG_HAS_SG; req->aiocb = is_write ? dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req) : dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req); } else { -req->has_sg = false; req->aiocb = is_write ? 
blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, req) : @@ -916,6 +913,7 @@ static void nvme_process_sq(void *opaque) QTAILQ_REMOVE(&sq->req_list, req, entry); QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); memset(&req->cqe, 0, sizeof(req->cqe)); +req->flags = 0; req->cqe.cid = cmd.cid; status = sq->sqid ? nvme_io_cmd(n, &cmd, req) : diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 61dd9b23b8..79513eaa49 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -15,11 +15,15 @@ typedef struct NvmeAsyncEvent { NvmeAerResult result; } NvmeAsyncEvent; +enum NvmeRequestFlags { +NVME_REQ_FLG_HAS_SG = 1 << 0, +}; + typedef struct NvmeRequest { struct NvmeSQueue *sq; BlockAIOCB *aiocb; uint16_tstatus; -boolhas_sg; +uint16_tflags; NvmeCqe cqe; BlockAcctCookie acct; QEMUSGList qsg; -- 2.21.0
[PATCH 17/18] hw/block/nvme: Use zone metadata file for persistence
A ZNS drive that is emulated by this driver is currently initialized with all zones Empty upon startup. However, actual ZNS SSDs save the state and condition of all zones in their internal NVRAM in the event of power loss. When such a drive is powered up again, it closes or finishes all zones that were open at the moment of shutdown. Besides that, the write pointer position as well as the state and condition of all zones is preserved across power-downs. This commit adds the capability to have a persistent zone metadata to the driver. The new optional driver property, "zone_file", is introduced. If added to the command line, this property specifies the name of the file that stores the zone metadata. If "zone_file" is omitted, the driver will initialize with all zones empty, the same as before. If zone metadata is configured to be persistent, then zone descriptor extensions also persist across controller shutdowns. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 371 +--- hw/block/nvme.h | 38 + 2 files changed, 388 insertions(+), 21 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 930f5f9d41..1ecbc31272 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -68,6 +68,8 @@ } while (0) static void nvme_process_sq(void *opaque); +static void nvme_sync_zone_file(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, int len); /* * Add a zone to the tail of a zone list. 
@@ -89,6 +91,7 @@ static void nvme_add_zone_tail(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, zl->tail = idx; } zl->size++; +nvme_set_zone_meta_dirty(n, ns, true); } /* @@ -105,12 +108,15 @@ static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, if (zl->size == 0) { zl->head = NVME_ZONE_LIST_NIL; zl->tail = NVME_ZONE_LIST_NIL; +nvme_set_zone_meta_dirty(n, ns, true); } else if (idx == zl->head) { zl->head = zone->next; ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; +nvme_set_zone_meta_dirty(n, ns, true); } else if (idx == zl->tail) { zl->tail = zone->prev; ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL; +nvme_set_zone_meta_dirty(n, ns, true); } else { ns->zone_array[zone->next].prev = zone->prev; ns->zone_array[zone->prev].next = zone->next; @@ -137,6 +143,7 @@ static NvmeZone *nvme_remove_zone_head(NvmeCtrl *n, NvmeNamespace *ns, ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; } zone->prev = zone->next = 0; +nvme_set_zone_meta_dirty(n, ns, true); } return zone; @@ -475,6 +482,7 @@ static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, case NVME_ZONE_STATE_READ_ONLY: zone->tstamp = 0; } +nvme_sync_zone_file(n, ns, zone, sizeof(NvmeZone)); } static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) @@ -2975,9 +2983,114 @@ static const MemoryRegionOps nvme_cmb_ops = { }, }; -static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, +static int nvme_validate_zone_file(NvmeCtrl *n, NvmeNamespace *ns, uint64_t capacity) { +NvmeZoneMeta *meta = ns->zone_meta; +NvmeZone *zone = ns->zone_array; +uint64_t start = 0, zone_size = n->params.zone_size; +int i, n_imp_open = 0, n_exp_open = 0, n_closed = 0, n_full = 0; + +if (meta->magic != NVME_ZONE_META_MAGIC) { +return 1; +} +if (meta->version != NVME_ZONE_META_VER) { +return 2; +} +if (meta->zone_size != zone_size) { +return 3; +} +if (meta->zone_capacity != n->params.zone_capacity) { +return 4; +} +if (meta->nr_offline_zones != n->params.nr_offline_zones) { +return 5; +} 
+if (meta->nr_rdonly_zones != n->params.nr_rdonly_zones) { +return 6; +} +if (meta->lba_size != n->conf.logical_block_size) { +return 7; +} +if (meta->zd_extension_size != n->params.zd_extension_size) { +return 8; +} + +for (i = 0; i < n->num_zones; i++, zone++) { +if (start + zone_size > capacity) { +zone_size = capacity - start; +} +if (zone->d.zt != NVME_ZONE_TYPE_SEQ_WRITE) { +return 9; +} +if (zone->d.zcap != n->params.zone_capacity) { +return 10; +} +if (zone->d.zslba != start) { +return 11; +} +switch (nvme_get_zone_state(zone)) { +case NVME_ZONE_STATE_EMPTY: +case NVME_ZONE_STATE_OFFLINE: +case NVME_ZONE_STATE_READ_ONLY: +if (zone->d.wp != start) { +return 12; +} +break; +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +if (zone->d.wp < start || +zone->d.wp >= zone->d.zslba + zone->d.zcap) { +return 13; +} +n_imp_open++; +break; +case N
[PATCH 02/18] hw/block/nvme: Define 64 bit cqe.result
From: Ajay Joshi A new write command, Zone Append, is added as a part of Zoned Namespace Command Set. Upon successful completion of this command, the controller returns the start LBA of the performed write operation in cqe.result field. Therefore, the maximum size of this variable needs to be changed from 32 to 64 bit, consuming the reserved 32 bit field that follows the result in CQE struct. Since the existing commands are expected to return a 32 bit LE value, two separate variables, result32 and result64, are now kept in a union. Signed-off-by: Ajay Joshi Signed-off-by: Dmitry Fomichev --- block/nvme.c | 2 +- block/trace-events | 2 +- hw/block/nvme.c | 6 +++--- include/block/nvme.h | 6 -- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index eb2f54dd9d..ca245ec574 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -287,7 +287,7 @@ static inline int nvme_translate_error(const NvmeCqe *c) { uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; if (status) { -trace_nvme_error(le32_to_cpu(c->result), +trace_nvme_error(le64_to_cpu(c->result64), le16_to_cpu(c->sq_head), le16_to_cpu(c->sq_id), le16_to_cpu(c->cid), diff --git a/block/trace-events b/block/trace-events index 29dff8881c..05c1393943 100644 --- a/block/trace-events +++ b/block/trace-events @@ -156,7 +156,7 @@ vxhs_get_creds(const char *cacert, const char *client_key, const char *client_ce # nvme.c nvme_kick(void *s, int queue) "s %p queue %d" nvme_dma_flush_queue_wait(void *s) "s %p" -nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" +nvme_error(uint64_t cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %ld sq_head %d sqid %d cid %d status 0x%x" nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid 
%d" diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6e8e4ccdb1..94110db596 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -822,7 +822,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } -req->cqe.result = result; +req->cqe.result32 = result; return NVME_SUCCESS; } @@ -858,8 +858,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) ((dw11 >> 16) & 0xFFFF) + 1, n->params.max_ioqpairs, n->params.max_ioqpairs); -req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) | - ((n->params.max_ioqpairs - 1) << 16)); +req->cqe.result32 = cpu_to_le32((n->params.max_ioqpairs - 1) | +((n->params.max_ioqpairs - 1) << 16)); break; case NVME_TIMESTAMP: return nvme_set_feature_timestamp(n, cmd); diff --git a/include/block/nvme.h b/include/block/nvme.h index 1720ee1d51..9c3a04dcd7 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -577,8 +577,10 @@ typedef struct NvmeAerResult { } NvmeAerResult; typedef struct NvmeCqe { -uint32_tresult; -uint32_trsvd; +union { +uint64_t result64; +uint32_t result32; +}; uint16_tsq_head; uint16_tsq_id; uint16_tcid; -- 2.21.0
[PATCH 13/18] hw/block/nvme: Set Finish/Reset Zone Recommended attributes
Added logic to set and reset FZR and RZR zone attributes. Four new driver properties are added to control the timing of setting and resetting these attributes. FZR/RZR delay lasts from the zone operation and until when the corresponding zone attribute is set. FZR/RZR limits set the time period between setting FZR or RZR attribute and resetting it simulating the internal controller action on that zone. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 99 + hw/block/nvme.h | 13 ++- 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d9110b39d3..6e6616f7e9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -200,6 +200,84 @@ static inline void nvme_aor_dec_active(NvmeCtrl *n, NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } +static void nvme_set_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +assert(zone->flags & NVME_ZFLAGS_SET_RZR); +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags &= ~NVME_ZFLAGS_TS_DELAY; +zone->d.za |= NVME_ZA_RESET_RECOMMENDED; +zone->flags &= ~NVME_ZFLAGS_SET_RZR; +trace_pci_nvme_zone_reset_recommended(zone->d.zslba); +} + +static void nvme_clear_rzr(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, bool notify) +{ +if (n->params.rrl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY); +notify = notify && (zone->d.za & NVME_ZA_RESET_RECOMMENDED); +zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED; +zone->tstamp = 0; +} +} + +static void nvme_set_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +assert(zone->flags & NVME_ZFLAGS_SET_FZR); +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags &= ~NVME_ZFLAGS_TS_DELAY; +zone->d.za |= NVME_ZA_FINISH_RECOMMENDED; +zone->flags &= ~NVME_ZFLAGS_SET_FZR; +trace_pci_nvme_zone_finish_recommended(zone->d.zslba); +} + +static void nvme_clear_fzr(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, bool notify) +{ +if (n->params.frl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | 
NVME_ZFLAGS_TS_DELAY); +notify = notify && (zone->d.za & NVME_ZA_FINISH_RECOMMENDED); +zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED; +zone->tstamp = 0; +} +} + +static void nvme_schedule_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +if (n->params.frl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY); +zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED; +zone->tstamp = 0; +} +if (n->params.rrl_usec) { +zone->flags |= NVME_ZFLAGS_SET_RZR; +if (n->params.rzr_delay_usec) { +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags |= NVME_ZFLAGS_TS_DELAY; +} else { +nvme_set_rzr(n, ns, zone); +} +} +} + +static void nvme_schedule_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +if (n->params.rrl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY); +zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED; +zone->tstamp = 0; +} +if (n->params.frl_usec) { +zone->flags |= NVME_ZFLAGS_SET_FZR; +if (n->params.fzr_delay_usec) { +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags |= NVME_ZFLAGS_TS_DELAY; +} else { +nvme_set_fzr(n, ns, zone); +} +} +} + static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint8_t state) { @@ -207,15 +285,19 @@ static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: nvme_remove_zone(n, ns, ns->exp_open_zones, zone); +nvme_clear_fzr(n, ns, zone, false); break; case NVME_ZONE_STATE_IMPLICITLY_OPEN: nvme_remove_zone(n, ns, ns->imp_open_zones, zone); +nvme_clear_fzr(n, ns, zone, false); break; case NVME_ZONE_STATE_CLOSED: nvme_remove_zone(n, ns, ns->closed_zones, zone); +nvme_clear_fzr(n, ns, zone, false); break; case NVME_ZONE_STATE_FULL: nvme_remove_zone(n, ns, ns->full_zones, zone); +nvme_clear_rzr(n, ns, zone, false); } } @@ -224,15 +306,19 @@ static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: 
nvme_add_zone_tail(n, ns, ns->exp_open_zones, zone); +nvme_schedule_fzr(n, ns, zone); break; case NVME_ZONE_STATE_IMPLICITLY_OPEN: nvme_add_zone_tail(n, ns, ns->imp_open_zones, zone); +nvme_schedule_fzr(n, ns, zone); break; case NVME_ZONE_STATE_CLOSED: nvme_add_zone_tail(n, ns, ns->closed_zones, zone); +nvme_schedu
[PATCH 05/18] hw/block/nvme: Introduce the Namespace Types definitions
From: Niklas Cassel Define the structures and constants required to implement Namespace Types support. Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev --- hw/block/nvme.h | 3 ++ include/block/nvme.h | 75 +--- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index dae7b977af..fbf40c7256 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -62,6 +62,9 @@ typedef struct NvmeCQueue { typedef struct NvmeNamespace { NvmeIdNsid_ns; +uint32_tnsid; +uint8_t csi; +QemuUUIDuuid; } NvmeNamespace; static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) diff --git a/include/block/nvme.h b/include/block/nvme.h index 6a58bac0c2..5a1e5e137c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -50,6 +50,11 @@ enum NvmeCapMask { CAP_PMR_MASK = 0x1, }; +enum NvmeCapCssBits { +CAP_CSS_NVM= 0x01, +CAP_CSS_CSI_SUPP = 0x40, +}; + #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) #define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT)& CAP_CQR_MASK) #define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT)& CAP_AMS_MASK) @@ -101,6 +106,12 @@ enum NvmeCcMask { CC_IOCQES_MASK = 0xf, }; +enum NvmeCcCss { +CSS_NVM_ONLY= 0, +CSS_ALL_NSTYPES = 6, +CSS_ADMIN_ONLY = 7, +}; + #define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) #define NVME_CC_CSS(cc)((cc >> CC_CSS_SHIFT)& CC_CSS_MASK) #define NVME_CC_MPS(cc)((cc >> CC_MPS_SHIFT)& CC_MPS_MASK) @@ -109,6 +120,21 @@ enum NvmeCcMask { #define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) #define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) +#define NVME_SET_CC_EN(cc, val) \ +(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val)\ +(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val)\ +(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val)\ +(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val)\ 
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ +(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT) +#define NVME_SET_CC_IOCQES(cc, val) \ +(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT) + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, @@ -482,10 +508,41 @@ typedef struct NvmeIdentify { uint64_trsvd2[2]; uint64_tprp1; uint64_tprp2; -uint32_tcns; -uint32_trsvd11[5]; +uint8_t cns; +uint8_t rsvd4; +uint16_tctrlid; +uint16_tnvmsetid; +uint8_t rsvd3; +uint8_t csi; +uint32_trsvd12[4]; } NvmeIdentify; +typedef struct NvmeNsIdDesc { +uint8_t nidt; +uint8_t nidl; +uint16_trsvd2; +} NvmeNsIdDesc; + +enum NvmeNidType { +NVME_NIDT_EUI64 = 0x01, +NVME_NIDT_NGUID = 0x02, +NVME_NIDT_UUID = 0x03, +NVME_NIDT_CSI = 0x04, +}; + +enum NvmeNidLength { +NVME_NIDL_EUI64 = 8, +NVME_NIDL_NGUID = 16, +NVME_NIDL_UUID = 16, +NVME_NIDL_CSI = 1, +}; + +enum NvmeCsi { +NVME_CSI_NVM= 0x00, +}; + +#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) + typedef struct NvmeRwCmd { uint8_t opcode; uint8_t flags; @@ -603,6 +660,7 @@ enum NvmeStatusCodes { NVME_CMD_ABORT_MISSING_FUSE = 0x000a, NVME_INVALID_NSID = 0x000b, NVME_CMD_SEQ_ERROR = 0x000c, +NVME_CMD_SET_CMB_REJECTED = 0x002b, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -729,9 +787,14 @@ typedef struct NvmePSD { #define NVME_IDENTIFY_DATA_SIZE 4096 enum { -NVME_ID_CNS_NS = 0x0, -NVME_ID_CNS_CTRL = 0x1, -NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, +NVME_ID_CNS_NS= 0x0, +NVME_ID_CNS_CTRL = 0x1, +NVME_ID_CNS_NS_ACTIVE_LIST= 0x2, +NVME_ID_CNS_NS_DESC_LIST = 0x03, +NVME_ID_CNS_CS_NS = 0x05, +NVME_ID_CNS_CS_CTRL = 0x06, +NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, +NVME_ID_CNS_IO_COMMAND_SET= 0x1c, }; typedef struct NvmeIdCtrl { @@ -825,6 +888,7 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY= 0xa, NVME_ASYNCHRONOUS_EVENT_CONF= 0xb, NVME_TIMESTAMP = 0xe, +NVME_COMMAND_SET_PROFILE= 0x19, NVME_SOFTWARE_PROGRESS_MARKER = 0x80 }; @@ 
-914,6 +978,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) !=
[PATCH 04/18] hw/block/nvme: Add Commands Supported and Effects log
This log page becomes necessary to implement to allow checking for Zone Append command support in Zoned Namespace Command Set. This commit adds the code to report this log page for NVM Command Set only. The parts that are specific to zoned operation will be added later in the series. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 62 +++ hw/block/trace-events | 4 +++ include/block/nvme.h | 18 + 3 files changed, 84 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 94110db596..d000c7c9fb 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -870,6 +870,66 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_SUCCESS; } +static uint16_t nvme_handle_cmd_effects(NvmeCtrl *n, NvmeCmd *cmd, +uint64_t prp1, uint64_t prp2, uint64_t ofs, uint32_t len) +{ + NvmeEffectsLog cmd_eff_log = {}; + uint32_t *iocs = cmd_eff_log.iocs; + +trace_pci_nvme_cmd_supp_and_effects_log_read(); + +if (ofs != 0) { +trace_pci_nvme_err_invalid_effects_log_offset(ofs); +return NVME_INVALID_FIELD | NVME_DNR; +} +if (len != sizeof(cmd_eff_log)) { +trace_pci_nvme_err_invalid_effects_log_len(len); +return NVME_INVALID_FIELD | NVME_DNR; +} + +iocs[NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP; + +iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC; +iocs[NVME_CMD_WRITE_ZEROS] = NVME_CMD_EFFECTS_CSUPP | + NVME_CMD_EFFECTS_LBCC; +iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC; +iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP; + +return nvme_dma_read_prp(n, (uint8_t *)&cmd_eff_log, len, prp1, prp2); +} + +static 
uint16_t nvme_get_log_page(NvmeCtrl *n, NvmeCmd *cmd) +{ +uint64_t prp1 = le64_to_cpu(cmd->prp1); +uint64_t prp2 = le64_to_cpu(cmd->prp2); +uint32_t dw10 = le32_to_cpu(cmd->cdw10); +uint32_t dw11 = le32_to_cpu(cmd->cdw11); +uint64_t dw12 = le32_to_cpu(cmd->cdw12); +uint64_t dw13 = le32_to_cpu(cmd->cdw13); +uint64_t ofs = (dw13 << 32) | dw12; +uint32_t numdl, numdu, len; +uint16_t lid = dw10 & 0xff; + +numdl = dw10 >> 16; +numdu = dw11 & 0xffff; +len = (((numdu << 16) | numdl) + 1) << 2; + +switch (lid) { +case NVME_LOG_CMD_EFFECTS: +return nvme_handle_cmd_effects(n, cmd, prp1, prp2, ofs, len); +} + +trace_pci_nvme_unsupported_log_page(lid); +return NVME_INVALID_FIELD | NVME_DNR; +} + static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { switch (cmd->opcode) { @@ -887,6 +947,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return nvme_set_feature(n, cmd, req); case NVME_ADM_CMD_GET_FEATURES: return nvme_get_feature(n, cmd, req); +case NVME_ADM_CMD_GET_LOG_PAGE: +return nvme_get_log_page(n, cmd); default: trace_pci_nvme_err_invalid_admin_opc(cmd->opcode); return NVME_INVALID_OPCODE | NVME_DNR; diff --git a/hw/block/trace-events b/hw/block/trace-events index 958fcc5508..423d491e27 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -58,6 +58,7 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded" pci_nvme_mmio_stopped(void) "cleared controller enable bit" pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" +pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects log read" # nvme traces for error conditions pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" @@ -69,6 +70,8 @@ pci_nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not w pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin
opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and effects log offset must be 0, got %"PRIu64"" +pci_nvme_err_invalid_effects_log_len(uint32_t len) "commands supported and effects log size is 4096, got %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_inv
[PATCH 10/18] hw/block/nvme: Support Zoned Namespace Command Set
The driver has been changed to advertise NVM Command Set when "zoned" driver property is not set (default) and Zoned Namespace Command Set otherwise. Handlers for three new NVMe commands introduced in Zoned Namespace Command Set specification are added, namely for Zone Management Receive, Zone Management Send and Zone Append. Driver initialization code has been extended to create a proper configuration for zoned operation using driver properties. Read/Write command handler is modified to only allow writes at the write pointer if the namespace is zoned. For Zone Append command, writes implicitly happen at the write pointer and the starting write pointer value is returned as the result of the command. Write Zeroes handler is modified to add zoned checks that are identical to those done as a part of Write flow. The code to support Zone Descriptor Extensions is not included in this commit and the driver always reports ZDES 0. A later commit in this series will add ZDE support. This commit doesn't yet include checks for active and open zone limits. It is assumed that there are no limits on either active or open zones. Signed-off-by: Niklas Cassel Signed-off-by: Hans Holmberg Signed-off-by: Ajay Joshi Signed-off-by: Chaitanya Kulkarni Signed-off-by: Matias Bjorling Signed-off-by: Aravind Ramesh Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Adam Manzanares Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 963 ++-- 1 file changed, 933 insertions(+), 30 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0db1f2002d..7bf452d597 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -37,6 +37,7 @@ #include "qemu/osdep.h" #include "qemu/units.h" #include "qemu/error-report.h" +#include "crypto/random.h" #include "hw/block/block.h" #include "hw/pci/msix.h" #include "hw/pci/pci.h" @@ -68,6 +69,98 @@ static void nvme_process_sq(void *opaque); +/* + * Add a zone to the tail of a zone list. 
+ */ +static void nvme_add_zone_tail(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, +NvmeZone *zone) +{ +uint32_t idx = (uint32_t)(zone - ns->zone_array); + +assert(nvme_zone_not_in_list(zone)); + +if (!zl->size) { +zl->head = zl->tail = idx; +zone->next = zone->prev = NVME_ZONE_LIST_NIL; +} else { +ns->zone_array[zl->tail].next = idx; +zone->prev = zl->tail; +zone->next = NVME_ZONE_LIST_NIL; +zl->tail = idx; +} +zl->size++; +} + +/* + * Remove a zone from a zone list. The zone must be linked in the list. + */ +static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, +NvmeZone *zone) +{ +uint32_t idx = (uint32_t)(zone - ns->zone_array); + +assert(!nvme_zone_not_in_list(zone)); + +--zl->size; +if (zl->size == 0) { +zl->head = NVME_ZONE_LIST_NIL; +zl->tail = NVME_ZONE_LIST_NIL; +} else if (idx == zl->head) { +zl->head = zone->next; +ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; +} else if (idx == zl->tail) { +zl->tail = zone->prev; +ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL; +} else { +ns->zone_array[zone->next].prev = zone->prev; +ns->zone_array[zone->prev].next = zone->next; +} + +zone->prev = zone->next = 0; +} + +static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, uint8_t state) +{ +if (!nvme_zone_not_in_list(zone)) { +switch (nvme_get_zone_state(zone)) { +case NVME_ZONE_STATE_EXPLICITLY_OPEN: +nvme_remove_zone(n, ns, ns->exp_open_zones, zone); +break; +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +nvme_remove_zone(n, ns, ns->imp_open_zones, zone); +break; +case NVME_ZONE_STATE_CLOSED: +nvme_remove_zone(n, ns, ns->closed_zones, zone); +break; +case NVME_ZONE_STATE_FULL: +nvme_remove_zone(n, ns, ns->full_zones, zone); +} + } + +nvme_set_zone_state(zone, state); + +switch (state) { +case NVME_ZONE_STATE_EXPLICITLY_OPEN: +nvme_add_zone_tail(n, ns, ns->exp_open_zones, zone); +break; +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +nvme_add_zone_tail(n, ns, ns->imp_open_zones, zone); +break; +case 
NVME_ZONE_STATE_CLOSED: +nvme_add_zone_tail(n, ns, ns->closed_zones, zone); +break; +case NVME_ZONE_STATE_FULL: +nvme_add_zone_tail(n, ns, ns->full_zones, zone); +break; +default: +zone->d.za = 0; +/* fall through */ +case NVME_ZONE_STATE_READ_ONLY: +zone->tstamp = 0; +} +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr low = n->ctrl_mem.addr; @@ -313,6 +406,7 @@ static void nvme_post_cqes(void *opaque) QTAILQ_REMOVE(&cq->req_list, req, entry); sq = req->sq; + req->cqe.status = cpu_to_le16((req->status <
[PATCH 03/18] hw/block/nvme: Clean up unused AER definitions
Removed unused struct NvmeAerResult and SMART-related async event codes. All other event codes are now categorized by their type. This avoids having to define the same values in a single enum, NvmeAsyncEventRequest, that is now removed. Later commits in this series will define additional values in some of these enums. No functional change. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.h | 1 - include/block/nvme.h | 43 ++- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 79513eaa49..dae7b977af 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -12,7 +12,6 @@ typedef struct NvmeParams { typedef struct NvmeAsyncEvent { QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry; -NvmeAerResult result; } NvmeAsyncEvent; enum NvmeRequestFlags { diff --git a/include/block/nvme.h b/include/block/nvme.h index 9c3a04dcd7..3099df99eb 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -553,28 +553,30 @@ typedef struct NvmeDsmRange { uint64_tslba; } NvmeDsmRange; -enum NvmeAsyncEventRequest { -NVME_AER_TYPE_ERROR = 0, -NVME_AER_TYPE_SMART = 1, -NVME_AER_TYPE_IO_SPECIFIC = 6, -NVME_AER_TYPE_VENDOR_SPECIFIC = 7, -NVME_AER_INFO_ERR_INVALID_SQ= 0, -NVME_AER_INFO_ERR_INVALID_DB= 1, -NVME_AER_INFO_ERR_DIAG_FAIL = 2, -NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, -NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR= 4, -NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5, -NVME_AER_INFO_SMART_RELIABILITY = 0, -NVME_AER_INFO_SMART_TEMP_THRESH = 1, -NVME_AER_INFO_SMART_SPARE_THRESH= 2, +enum NvmeAsyncEventType { +NVME_AER_TYPE_ERROR = 0x00, +NVME_AER_TYPE_SMART = 0x01, +NVME_AER_TYPE_NOTICE= 0x02, +NVME_AER_TYPE_CMDSET_SPECIFIC = 0x06, +NVME_AER_TYPE_VENDOR_SPECIFIC = 0x07, }; -typedef struct NvmeAerResult { -uint8_t event_type; -uint8_t event_info; -uint8_t log_page; -uint8_t resv; -} NvmeAerResult; +enum NvmeAsyncErrorInfo { +NVME_AER_ERR_INVALID_SQ = 0x00, +NVME_AER_ERR_INVALID_DB = 0x01, +NVME_AER_ERR_DIAG_FAIL = 0x02, +NVME_AER_ERR_PERS_INTERNAL_ERR = 0x03, 
+NVME_AER_ERR_TRANS_INTERNAL_ERR = 0x04, +NVME_AER_ERR_FW_IMG_LOAD_ERR= 0x05, +}; + +enum NvmeAsyncNoticeInfo { +NVME_AER_NOTICE_NS_CHANGED = 0x00, +}; + +enum NvmeAsyncEventCfg { +NVME_AEN_CFG_NS_ATTR= 1 << 8, +}; typedef struct NvmeCqe { union { @@ -881,7 +883,6 @@ enum NvmeIdNsDps { static inline void _nvme_check_size(void) { -QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); -- 2.21.0
[PATCH 07/18] hw/block/nvme: Add support for Namespace Types
From: Niklas Cassel Namespace Types introduce a new command set, "I/O Command Sets", that allows the host to retrieve the command sets associated with a namespace. Introduce support for the command set, and enable detection for the NVM Command Set. Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 210 ++-- hw/block/nvme.h | 11 +++ 2 files changed, 216 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d000c7c9fb..0db1f2002d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -685,6 +685,26 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c) prp1, prp2); } +static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeIdentify *c) +{ +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint32_t *list; +uint16_t ret; + +trace_pci_nvme_identify_ctrl_csi(c->csi); + +if (c->csi == NVME_CSI_NVM) { +list = g_malloc0(data_len); +ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); +g_free(list); +return ret; +} else { +return NVME_INVALID_FIELD | NVME_DNR; +} +} + static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) { NvmeNamespace *ns; @@ -700,11 +720,42 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) } ns = &n->namespaces[nsid - 1]; +assert(nsid == ns->nsid); return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1, prp2); } +static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeIdentify *c) +{ +NvmeNamespace *ns; +uint32_t nsid = le32_to_cpu(c->nsid); +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint32_t *list; +uint16_t ret; + +trace_pci_nvme_identify_ns_csi(nsid, c->csi); + +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { +trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); +return NVME_INVALID_NSID | NVME_DNR; +} + +ns = &n->namespaces[nsid - 1]; 
+assert(nsid == ns->nsid); + +if (c->csi == NVME_CSI_NVM) { +list = g_malloc0(data_len); +ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); +g_free(list); +return ret; +} else { +return NVME_INVALID_FIELD | NVME_DNR; +} +} + static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) { static const int data_len = NVME_IDENTIFY_DATA_SIZE; @@ -732,6 +783,99 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) return ret; } +static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeIdentify *c) +{ +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint32_t min_nsid = le32_to_cpu(c->nsid); +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +uint32_t *list; +uint16_t ret; +int i, j = 0; + +trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); + +if (c->csi != NVME_CSI_NVM) { +return NVME_INVALID_FIELD | NVME_DNR; +} + +list = g_malloc0(data_len); +for (i = 0; i < n->num_namespaces; i++) { +if (i < min_nsid) { +continue; +} +list[j++] = cpu_to_le32(i + 1); +if (j == data_len / sizeof(uint32_t)) { +break; +} +} +ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); +g_free(list); +return ret; +} + +static uint16_t nvme_list_ns_descriptors(NvmeCtrl *n, NvmeIdentify *c) +{ +NvmeNamespace *ns; +uint32_t nsid = le32_to_cpu(c->nsid); +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +void *buf_ptr; +NvmeNsIdDesc *desc; +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint8_t *buf; +uint16_t status; + +trace_pci_nvme_list_ns_descriptors(); + +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { +trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); +return NVME_INVALID_NSID | NVME_DNR; +} + +ns = &n->namespaces[nsid - 1]; +assert(nsid == ns->nsid); + +buf = g_malloc0(data_len); +buf_ptr = buf; + +desc = buf_ptr; +desc->nidt = NVME_NIDT_UUID; +desc->nidl = NVME_NIDL_UUID; +buf_ptr += sizeof(*desc); +memcpy(buf_ptr, ns->uuid.data, NVME_NIDL_UUID); +buf_ptr += 
NVME_NIDL_UUID; + +desc = buf_ptr; +desc->nidt = NVME_NIDT_CSI; +desc->nidl = NVME_NIDL_CSI; +buf_ptr += sizeof(*desc); +*(uint8_t *)buf_ptr = NVME_CSI_NVM; + +status = nvme_dma_read_prp(n, buf, data_len, prp1, prp2); +g_free(buf); +return status; +} + +static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeIdentify *c) +{ +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +stati
[PATCH 06/18] hw/block/nvme: Define trace events related to NS Types
A few trace events are defined that are relevant to implementing Namespace Types (NVMe TP 4056). Signed-off-by: Dmitry Fomichev --- hw/block/trace-events | 11 +++ 1 file changed, 11 insertions(+) diff --git a/hw/block/trace-events b/hw/block/trace-events index 423d491e27..3f3323fe38 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -39,8 +39,13 @@ pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" +pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16"" +pci_nvme_identify_ns_csi(uint16_t ns, uint8_t csi) "identify namespace, nsid=%"PRIu16", csi=0x%"PRIx8"" pci_nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16"" +pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "identify namespace list, nsid=%"PRIu16", csi=0x%"PRIx8"" +pci_nvme_list_ns_descriptors(void) "identify namespace descriptors" +pci_nvme_identify_cmd_set(void) "identify i/o command set" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" @@ -59,6 +64,8 @@ pci_nvme_mmio_stopped(void) "cleared controller enable bit" pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects log read" +pci_nvme_css_nvm_cset_selected_by_host(uint32_t cc) "NVM command set selected by host, bar.cc=0x%"PRIx32"" +pci_nvme_css_all_csets_sel_by_host(uint32_t cc) "all supported command sets 
selected by host, bar.cc=0x%"PRIx32"" # nvme traces for error conditions pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" @@ -72,6 +79,9 @@ pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and effects log offset must be 0, got %"PRIu64"" pci_nvme_err_invalid_effects_log_len(uint32_t len) "commands supported and effects log size is 4096, got %"PRIu32"" +pci_nvme_err_change_css_when_enabled(void) "changing CC.CSS while controller is enabled" +pci_nvme_err_only_nvm_cmd_set_avail(void) "setting 110b CC.CSS, but only NVM command set is enabled" +pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16"" @@ -127,6 +137,7 @@ pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion qu pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring" pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring" pci_nvme_unsupported_log_page(uint16_t lid) "unsupported log page 0x%"PRIx16"" +pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field" # xen-block.c xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s d%up%u" -- 2.21.0
[PATCH 14/18] hw/block/nvme: Generate zone AENs
Added an optional Boolean "zone_async_events" property to the driver. Once it's turned on, the namespace will be sending "Zone Descriptor Changed" asynchronous events to the host in particular situations defined by the protocol. In order to clear these AENs, the host needs to read the newly added Changed Zones Log. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 300 ++- hw/block/nvme.h | 13 +- include/block/nvme.h | 23 +++- 3 files changed, 328 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6e6616f7e9..8dfb568173 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -200,12 +200,66 @@ static inline void nvme_aor_dec_active(NvmeCtrl *n, NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } +static bool nvme_complete_async_req(NvmeCtrl *n, NvmeNamespace *ns, +enum NvmeAsyncEventType type, uint8_t info) +{ +NvmeAsyncEvent *ae; +uint32_t nsid = 0; +uint8_t log_page = 0; + +switch (type) { +case NVME_AER_TYPE_ERROR: +case NVME_AER_TYPE_SMART: +break; +case NVME_AER_TYPE_NOTICE: +switch (info) { +case NVME_AER_NOTICE_ZONE_DESCR_CHANGED: +log_page = NVME_LOG_ZONE_CHANGED_LIST; +nsid = ns->nsid; +if (!(n->ae_cfg & NVME_AEN_CFG_ZONE_DESCR_CHNGD_NOTICES)) { +trace_pci_nvme_zone_ae_not_enabled(info, log_page, nsid); +return false; +} +if (ns->aen_pending) { +trace_pci_nvme_zone_ae_not_cleared(info, log_page, nsid); +return false; +} +ns->aen_pending = true; +} +break; +case NVME_AER_TYPE_CMDSET_SPECIFIC: +case NVME_AER_TYPE_VENDOR_SPECIFIC: +break; +} + +ae = g_malloc0(sizeof(*ae)); +ae->res = type; +ae->res |= (info << 8) & 0xff00; +ae->res |= (log_page << 16) & 0xff; +ae->nsid = nsid; + +QTAILQ_INSERT_TAIL(&n->async_reqs, ae, entry); +timer_mod(n->admin_cq.timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); +return true; +} + +static inline void nvme_notify_zone_changed(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone) +{ +if (n->ae_cfg) { +zone->flags |= NVME_ZFLAGS_AEN_PEND; +nvme_complete_async_req(n, ns, NVME_AER_TYPE_NOTICE, 
+NVME_AER_NOTICE_ZONE_DESCR_CHANGED); +} +} + static void nvme_set_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) { assert(zone->flags & NVME_ZFLAGS_SET_RZR); zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); zone->flags &= ~NVME_ZFLAGS_TS_DELAY; zone->d.za |= NVME_ZA_RESET_RECOMMENDED; +nvme_notify_zone_changed(n, ns, zone); zone->flags &= ~NVME_ZFLAGS_SET_RZR; trace_pci_nvme_zone_reset_recommended(zone->d.zslba); } @@ -214,10 +268,14 @@ static void nvme_clear_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, bool notify) { if (n->params.rrl_usec) { -zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY); +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY | + NVME_ZFLAGS_AEN_PEND); notify = notify && (zone->d.za & NVME_ZA_RESET_RECOMMENDED); zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED; zone->tstamp = 0; +if (notify) { +nvme_notify_zone_changed(n, ns, zone); +} } } @@ -227,6 +285,7 @@ static void nvme_set_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); zone->flags &= ~NVME_ZFLAGS_TS_DELAY; zone->d.za |= NVME_ZA_FINISH_RECOMMENDED; +nvme_notify_zone_changed(n, ns, zone); zone->flags &= ~NVME_ZFLAGS_SET_FZR; trace_pci_nvme_zone_finish_recommended(zone->d.zslba); } @@ -235,13 +294,61 @@ static void nvme_clear_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, bool notify) { if (n->params.frl_usec) { -zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY); +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY | + NVME_ZFLAGS_AEN_PEND); notify = notify && (zone->d.za & NVME_ZA_FINISH_RECOMMENDED); zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED; zone->tstamp = 0; +if (notify) { +nvme_notify_zone_changed(n, ns, zone); +} } } +static bool nvme_process_rrl(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +if (zone->flags & NVME_ZFLAGS_SET_RZR) { +if (zone->flags & NVME_ZFLAGS_TS_DELAY) { +assert(!(zone->d.za & NVME_ZA_RESET_RECOMMENDED)); +if 
(qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - zone->tstamp >= +n->params.rzr_delay_usec) { +nvme_set_rzr(n, ns, zone); +return true; +} +} else if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - zone->tstamp >= + n->params.rrl_us
[PATCH 00/18] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set
Zoned Namespace (ZNS) Command Set is a newly introduced command set published by the NVM Express, Inc. organization as TP 4053. The main design goals of ZNS are to provide hardware designers the means to reduce NVMe controller complexity and to allow achieving a better I/O latency and throughput. SSDs that implement this interface are commonly known as ZNS SSDs. This command set is implementing a zoned storage model, similarly to ZAC/ZBC. As such, there is already support in Linux, allowing one to perform the majority of tasks needed for managing ZNS SSDs. The Zoned Namespace Command Set relies on another TP, known as Namespace Types (NVMe TP 4056), which introduces support for having multiple command sets per namespace. Both ZNS and Namespace Types specifications can be downloaded by visiting the following link - https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip This patch series adds Namespace Types support and zoned namespace emulation capability to the existing NVMe PCI driver. The patchset is organized as follows - The first several patches are preparatory and are added to allow for an easier review of the subsequent commits. The group of patches that follows adds NS Types support with only NVM Command Set being available. Finally, the last group of commits makes definitions and adds new code to support Zoned Namespace Command Set. This series is based on block-next/block branch. 
Based-on: <20200608190821.3293867-1-ebl...@redhat.com> Ajay Joshi (1): hw/block/nvme: Define 64 bit cqe.result Dmitry Fomichev (15): hw/block/nvme: Move NvmeRequest has_sg field to a bit flag hw/block/nvme: Clean up unused AER definitions hw/block/nvme: Add Commands Supported and Effects log hw/block/nvme: Define trace events related to NS Types hw/block/nvme: Make Zoned NS Command Set definitions hw/block/nvme: Define Zoned NS Command Set trace events hw/block/nvme: Support Zoned Namespace Command Set hw/block/nvme: Introduce max active and open zone limits hw/block/nvme: Simulate Zone Active excursions hw/block/nvme: Set Finish/Reset Zone Recommended attributes hw/block/nvme: Generate zone AENs hw/block/nvme: Support Zone Descriptor Extensions hw/block/nvme: Add injection of Offline/Read-Only zones hw/block/nvme: Use zone metadata file for persistence hw/block/nvme: Document zoned parameters in usage text Niklas Cassel (2): hw/block/nvme: Introduce the Namespace Types definitions hw/block/nvme: Add support for Namespace Types block/nvme.c |2 +- block/trace-events|2 +- hw/block/nvme.c | 2316 - hw/block/nvme.h | 228 +++- hw/block/trace-events | 56 + include/block/nvme.h | 282 - 6 files changed, 2820 insertions(+), 66 deletions(-) -- 2.21.0
Re: [PATCH v2 2/2] block: Call attention to truncation of long NBD exports
On 6/10/20 11:37 AM, Eric Blake wrote: Commit 93676c88 relaxed our NBD client code to request export names up to the NBD protocol maximum of 4096 bytes without NUL terminator, even though the block layer can't store anything longer than 4096 bytes including NUL terminator for display to the user. Since this means there are some export names where we have to truncate things, we can at least try to make the truncation a bit more obvious for the user. Note that in spite of the truncated display name, we can still communicate with an NBD server using such a long export name; this was deemed nicer than refusing to even connect to such a server (since the server may not be under our control, and since determining our actual length limits gets tricky when nbd://host:port/export and nbd+unix:///export?socket=/path are themselves variable-length expansions beyond the export name but count towards the block layer name length). +++ b/block/nbd.c } else if (host && !s->export) { -snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s", host, port); +len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd://%s:%s", host, port); +} +if (len > sizeof(bs->exact_filename)) { Max pointed out off-list that this is off-by-one: by not using >=, I am failing to detect overflow when exactly one byte is truncated. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 1/2] nbd/server: Avoid long error message assertions CVE-2020-10761
On 6/10/20 11:37 AM, Eric Blake wrote: We may later want to further sanitize the user-supplied strings we place into our error messages, such as scrubbing out control characters, but that is less important to the CVE fix, so it can be a later patch to the new nbd_sanitize_name. +static char * +nbd_sanitize_name(const char *name) +{ +if (strnlen(name, 80) < 80) { +return g_strdup(name); +} +/* XXX Should we also try to sanitize any control characters? */ +return g_strdup_printf("%.80s...", name); Max pointed out off-list that this can take a valid UTF-8 name from the client and truncate it mid-character to make our reply NOT valid UTF-8, which is a (minor) violation of the NBD protocol. We have not yet implemented strict UTF-8 enforcement in qemu (neither our client nor server code takes pains to only send UTF-8, nor validates that incoming strings are valid UTF-8); and while the server would previously echo non-UTF-8 (where the client violated protocol first), this is now a case where the server can be coerced into violating protocol first. I guess I may end up doing a followup patch that adds incoming validation and in the process avoids chopping a multi-byte character, but that's just as easy to fold in with my question about sanitizing control characters. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH] block: file-posix: Fail unmap with NO_FALLBACK on block device
On Sat, Jun 13, 2020 at 8:08 PM Nir Soffer wrote: > > Punching holes on block device uses blkdev_issue_zeroout() with > BLKDEV_ZERO_NOFALLBACK but there is no guarantee that this is fast > enough for pre-zeroing an entire device. > > Zeroing block device can be as slow as writing zeroes or 100 times faster, > depending on the storage. There is no way to tell if zeroing is fast > enough. The kernel BLKDEV_ZERO_NOFALLBACK flag does not mean that the > operation is fast; it just means that the kernel will not fall back to > manual zeroing. > > Here is an example converting 10g image with 8g of data to block device: > > $ ./qemu-img info test.img > image: test.img > file format: raw > virtual size: 10 GiB (10737418240 bytes) > disk size: 8 GiB > > $ time ./qemu-img convert -f raw -O raw -t none -T none -W test.img > /dev/test/lv1 > > Before: > > real1m20.483s > user0m0.490s > sys 0m0.739s > > After: > > real0m55.831s > user0m0.610s > sys 0m0.956s I did more testing with real server and storage, and the results confirm what I reported here in my vm based environment and poor storage.
Testing this LUN: # multipath -ll 3600a098038304437415d4b6a59684a52 dm-3 NETAPP,LUN C-Mode size=5.0T features='3 queue_if_no_path pg_init_retries 50' hwhandler='1 alua' wp=rw |-+- policy='service-time 0' prio=50 status=active | |- 18:0:0:0 sdb 8:16 active ready running | `- 19:0:0:0 sdc 8:32 active ready running `-+- policy='service-time 0' prio=10 status=enabled |- 20:0:0:0 sdd 8:48 active ready running `- 21:0:0:0 sde 8:64 active ready running The destination is 100g logical volume on this LUN: # qemu-img info test-lv image: test-lv file format: raw virtual size: 100 GiB (107374182400 bytes) disk size: 0 B The source image is 100g image with 48g of data: # qemu-img info fedora-31-100g-50p.raw image: fedora-31-100g-50p.raw file format: raw virtual size: 100 GiB (107374182400 bytes) disk size: 48.4 GiB We can zero 2.3 g/s: # time blkdiscard -z test-lv real 0m43.902s user 0m0.002s sys 0m0.130s (I should really test with fallocate instead of blkdiscard, but the results look the same.) # iostat -xdm dm-3 5 Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-320.80 301.40 0.98 2323.31 0.00 0.00 0.00 0.00 26.56 854.50 257.9448.23 7893.41 0.73 23.58 dm-315.20 297.20 0.80 2321.67 0.00 0.00 0.00 0.00 26.43 836.06 248.7253.80 7999.30 0.78 24.22 We can write 445m/s: # dd if=/dev/zero bs=2M count=51200 of=test-lv oflag=direct conv=fsync 107374182400 bytes (107 GB, 100 GiB) copied, 241.257 s, 445 MB/s # iostat -xdm dm-3 5 Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-3 6.60 6910.00 0.39431.85 0.00 0.00 0.00 0.002.482.70 15.1960.7364.00 0.14 98.84 dm-340.80 6682.60 1.59417.61 0.00 0.00 0.00 0.001.712.73 14.9240.0063.99 0.15 97.60 dm-3 6.60 6887.40 0.39430.46 0.00 0.00 0.00 0.002.152.66 14.9260.7364.00 0.14 98.22 Testing latest qemu-img: # rpm -q qemu-img qemu-img-4.2.0-22.module+el8.2.1+6758+cb8d64c2.x86_64 # time qemu-img convert -p -f raw -O raw -t none -W 
fedora-31-100g-50p.raw test-lv (100.00/100%) real 2m2.337s user 0m2.708s sys 0m17.326s # iostat -xdm dm-3 5 pre zero phase: Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-324.00 265.40 1.00 2123.20 0.00 0.00 0.00 0.00 36.81 543.52 144.9942.48 8192.00 0.70 20.14 dm-3 9.60 283.60 0.59 2265.60 0.00 0.00 0.00 0.00 35.42 576.80 163.7862.50 8180.44 0.70 20.58 dm-324.00 272.00 1.00 2176.00 0.00 0.00 0.00 0.00 22.89 512.40 139.7742.48 8192.00 0.67 19.90 copy phase: Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-327.20 10671.20 1.19655.84 0.00 0.00 0.00 0.002.70 10.99 111.9844.8362.93 0.09 96.74 dm-3 6.40 11537.00 0.39712.33 0.00 0.00 0.00 0.003.00 11.90 131.5262.5063.23 0.08 97.82 dm-327.20 12400.20 1.19765.47 0.00 0.00 0.00 0.003.60 11.16 132.3144.8363.21 0.08 95.50 dm-3 9.60 11312.60 0.59698.20 0.00 0.20 0.00 0.003.73 11.69 126.6463.0063.20 0.09 97.70 Testing latest qemu-img + this patch: # rpm -q qemu-img qemu-img-4.2.0-25.module+el8.2.1+6815+1c792dc8.nsoffer202006140516.x86_64 # time qemu-img convert
Re: [PATCH v8 0/8] block: enhance handling of size-related BlockConf properties
On Fri, May 29, 2020 at 01:55:08AM +0300, Roman Kagan wrote: > BlockConf includes several properties counted in bytes. > > Enhance their handling in some aspects, specifically > > - accept common size suffixes (k, m) > - perform consistency checks on the values > - lift the upper limit on physical_block_size and logical_block_size > > Also fix the accessor for opt_io_size in virtio-blk to make it consistent with > the size of the field. > > History: > v7 -> v8: > - replace stringify with %u in error messages [Eric] > - fix wording in logs [Eric] > > v6 -> v7: > - avoid overflow in min_io_size check [Eric] > - try again to perform the art form in patch splitting [Eric] > > v5 -> v6: > - fix forgotten xen-block and swim > - add prop_size32 instead of going with 64bit > > v4 -> v5: > - re-split the patches [Philippe] > - fix/reword error messages [Philippe, Kevin] > - do early return on failed consistency check [Philippe] > - use QEMU_IS_ALIGNED instead of open coding [Philippe] > - make all BlockConf size props support suffixes > - expand the log for virtio-blk opt_io_size [Michael] > > v3 -> v4: > - add patch to fix opt_io_size width in virtio-blk > - add patch to perform consistency checks [Kevin] > - check min_io_size against truncation [Kevin] > > v2 -> v3: > - mention qcow2 cluster size limit in the log and comment [Eric] > > v1 -> v2: > - cap the property at 2 MiB [Eric] > - accept size suffixes > > Roman Kagan (8): > virtio-blk: store opt_io_size with correct size > block: consolidate blocksize properties consistency checks > qdev-properties: blocksize: use same limits in code and description > qdev-properties: add size32 property type > qdev-properties: make blocksize accept size suffixes > block: make BlockConf size props 32bit and accept size suffixes > qdev-properties: add getter for size32 and blocksize > block: lift blocksize property limit to 2 MiB > > include/hw/block/block.h | 14 +- > include/hw/qdev-properties.h | 5 +- > hw/block/block.c | 40 ++- > 
hw/block/fdc.c | 5 +- > hw/block/nvme.c | 5 +- > hw/block/swim.c | 5 +- > hw/block/virtio-blk.c| 9 +- > hw/block/xen-block.c | 6 +- > hw/core/qdev-properties.c| 85 +- > hw/ide/qdev.c| 5 +- > hw/scsi/scsi-disk.c | 12 +- > hw/usb/dev-storage.c | 5 +- > tests/qemu-iotests/172.out | 532 +-- > 13 files changed, 419 insertions(+), 309 deletions(-) Ping?
Re: [PATCH v7 00/22] nvme: small fixes, refactoring and cleanups
Am 09.06.2020 um 21:03 hat Klaus Jensen geschrieben: > From: Klaus Jensen > > Hi all, > > As per our discussion about how to amend the bug I introduced in > "hw/block/nvme: allow use of any valid msix vector", this is a respin > without that patch. > > Kevin, it applies cleanly on top of your block tree with all current > hw/block/nvme patches removed. Thanks, applied to the block branch. Kevin
Re: [PATCH v3 11/11] hw/sd/sdcard: Simplify realize() a bit
On Fri, 5 Jun 2020 at 11:28, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > We don't need to check if sd->blk is set twice. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/hw/sd/sd.c b/hw/sd/sd.c > index a1b25ed36f..060ca9d993 100644 > --- a/hw/sd/sd.c > +++ b/hw/sd/sd.c > @@ -2123,12 +2123,12 @@ static void sd_realize(DeviceState *dev, Error **errp) > return; > } > > -if (sd->blk && blk_is_read_only(sd->blk)) { > -error_setg(errp, "Cannot use read-only drive as SD card"); > -return; > -} > - > if (sd->blk) { > +if (blk_is_read_only(sd->blk)) { > +error_setg(errp, "Cannot use read-only drive as SD card"); > +return; > +} > + > ret = blk_set_perm(sd->blk, BLK_PERM_CONSISTENT_READ | > BLK_PERM_WRITE, > BLK_PERM_ALL, errp); > if (ret < 0) { Originally written with the pattern of "check all the error cases first; then do actual work". But if you prefer this way around Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 09/11] hw/sd/sdcard: Correctly display the command name in trace events
On Fri, 5 Jun 2020 at 11:25, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Some ACMD were incorrectly displayed. Fix by remembering if we > are processing an ACMD (with current_cmd_is_acmd) and add the > sd_current_cmd_name() helper, which displays the correct name > regardless of whether it is a CMD or ACMD. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 17 ++--- > 1 file changed, 14 insertions(+), 3 deletions(-) > > diff --git a/hw/sd/sd.c b/hw/sd/sd.c > index 952be36399..fad34ab184 100644 > --- a/hw/sd/sd.c > +++ b/hw/sd/sd.c > @@ -114,6 +114,7 @@ struct SDState { > uint8_t pwd[16]; > uint32_t pwd_len; > uint8_t function_group[6]; > +bool current_cmd_is_acmd; This is extra state, so strictly speaking it needs to be migrated (though the only thing we would get wrong is a possible wrong trace message after a migration load). > uint8_t current_cmd; > /* True if we will handle the next command as an ACMD. Note that this > does > * *not* track the APP_CMD status bit! > @@ -1687,6 +1688,8 @@ int sd_do_command(SDState *sd, SDRequest *req, >req->cmd); > req->cmd &= 0x3f; > } > +sd->current_cmd = req->cmd; > +sd->current_cmd_is_acmd = sd->expecting_acmd; I'm not 100% sure about moving the update of sd->current_cmd down here -- if it's an illegal command that seems wrong. > if (sd->card_status & CARD_IS_LOCKED) { > if (!cmd_valid_while_locked(sd, req->cmd)) { > @@ -1714,7 +1717,6 @@ int sd_do_command(SDState *sd, SDRequest *req, > /* Valid command, we can update the 'state before command' bits. > * (Do this now so they appear in r1 responses.) > */ > -sd->current_cmd = req->cmd; > sd->card_status &= ~CURRENT_STATE; > sd->card_status |= (last_state << 9); > } > @@ -1775,6 +1777,15 @@ send_response: > return rsplen; > } thanks -- PMM
Re: [PATCH v3 10/11] hw/sd/sdcard: Display offset in read/write_data() trace events
On Fri, 5 Jun 2020 at 11:32, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Having 'base address' and 'relative offset' displayed > separately is more helpful than the absolute address. > > Signed-off-by: Philippe Mathieu-Daudé > --- Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 08/11] hw/sd/sdcard: Make iolen unsigned
On Fri, 5 Jun 2020 at 11:27, Philippe Mathieu-Daudé wrote: > > I/O request length can not be negative. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 2 +- > hw/sd/trace-events | 2 +- > 2 files changed, 2 insertions(+), 2 deletions(-) > > diff --git a/hw/sd/sd.c b/hw/sd/sd.c > index 9d51138b11..952be36399 100644 > --- a/hw/sd/sd.c > +++ b/hw/sd/sd.c > @@ -1950,7 +1950,7 @@ uint8_t sd_read_data(SDState *sd) > { > /* TODO: Append CRCs */ > uint8_t ret; > -int io_len; > +size_t io_len; size_t seems an odd choice -- we initialize it with io_len = (sd->ocr & (1 << 30)) ? 512 : sd->blk_len; where sd->blk_len is a uint32_t, and we use it mainly with BLK_READ_BLOCK(sd->data_start, io_len); where BLK_READ_BLOCK is a rather unnecessary macroization of sd_blk_read(), which takes a uint32_t. thanks -- PMM
Re: [PATCH] hw/scsi/megasas: Fix possible out-of-bounds array access in tracepoints
On 15/06/20 09:26, Thomas Huth wrote: > Some tracepoints in megasas.c use a guest-controlled value as an index > into the mfi_frame_desc[] array. Thus a malicious guest could cause an > out-of-bounds error here. Fortunately, the impact is very low since this > can only happen when the corresponding tracepoints have been enabled > before, but the problem should be fixed anyway with a proper check. > > Buglink: https://bugs.launchpad.net/qemu/+bug/1882065 > Signed-off-by: Thomas Huth > --- > hw/scsi/megasas.c | 36 +++- > 1 file changed, 23 insertions(+), 13 deletions(-) > > diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c > index af18c88b65..aa930226f8 100644 > --- a/hw/scsi/megasas.c > +++ b/hw/scsi/megasas.c > @@ -54,10 +54,6 @@ > #define MEGASAS_FLAG_USE_QUEUE64 1 > #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) > > -static const char *mfi_frame_desc[] = { > -"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > -"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop"}; > - > typedef struct MegasasCmd { > uint32_t index; > uint16_t flags; > @@ -183,6 +179,20 @@ static void megasas_frame_set_scsi_status(MegasasState > *s, > stb_pci_dma(pci, frame + offsetof(struct mfi_frame_header, scsi_status), > v); > } > > +static inline const char *mfi_frame_desc(unsigned int cmd) > +{ > +static const char *mfi_frame_descs[] = { > +"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > +"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop" > +}; > + > +if (cmd < ARRAY_SIZE(mfi_frame_descs)) { > +return mfi_frame_descs[cmd]; > +} > + > +return "Unknown"; > +} > + > /* > * Context is considered opaque, but the HBA firmware is running > * in little endian mode. So convert it to little endian, too. 
> @@ -1670,25 +1680,25 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > if (is_logical) { > if (target_id >= MFI_MAX_LD || lun_id != 0) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > } > sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); > > cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); > -trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, > +trace_megasas_handle_scsi(mfi_frame_desc(frame_cmd), is_logical, >target_id, lun_id, sdev, cmd->iov_size); > > if (!sdev || (megasas_is_jbod(s) && is_logical)) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], is_logical, > +mfi_frame_desc(frame_cmd), is_logical, > target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > @@ -1706,7 +1716,7 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); > if (!cmd->req) { > trace_megasas_scsi_req_alloc_failed( > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); > cmd->frame->header.scsi_status = BUSY; > s->event_count++; > @@ -1751,17 +1761,17 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) > } > > trace_megasas_handle_io(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id, > +mfi_frame_desc(frame_cmd), target_id, lun_id, > (unsigned long)lba_start, (unsigned > long)lba_count); > if (!sdev) { > 
trace_megasas_io_target_not_present(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); > +mfi_frame_desc(frame_cmd), 1, target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > s->event_count++; > @@ -1781,7 +1791,7 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) >
Re: [PATCH v3 06/11] hw/sd/sdcard: Simplify cmd_valid_while_locked()
On Fri, 5 Jun 2020 at 11:27, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > cmd_valid_while_locked() only needs to read SDRequest->cmd, > pass it directly and make it const. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 11 +-- > 1 file changed, 5 insertions(+), 6 deletions(-) Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 05/11] hw/sd/sdcard: Update the SDState documentation
On Fri, 5 Jun 2020 at 11:22, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Add more descriptive comments to keep a clear separation > between static property vs runtime changeable. > > Suggested-by: Peter Maydell > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 5 - > 1 file changed, 4 insertions(+), 1 deletion(-) Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 03/11] hw/sd/sdcard: Do not switch to ReceivingData if address is invalid
On Fri, 5 Jun 2020 at 11:25, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Only move the state machine to ReceivingData if there is no > pending error. This avoids later OOB access while processing > commands queued. > > "SD Specifications Part 1 Physical Layer Simplified Spec. v3.01" > > 4.3.3 Data Read > > Read command is rejected if BLOCK_LEN_ERROR or ADDRESS_ERROR > occurred and no data transfer is performed. > > 4.3.4 Data Write > > Write command is rejected if BLOCK_LEN_ERROR or ADDRESS_ERROR > occurred and no data transfer is performed. It's not clear from the spec that this should also apply to WP_VIOLATION errors. The text about WP_VIOLATION suggests that it is handled by aborting the data transfer (ie set the error bit, stay in receive-data state, wait for a stop command, but ignore all further data transfer), which is I think distinct from "rejecting" the command. If that theory is right then moving the check for the ADDRESS_ERROR in this patch is correct but the WP_VIOLATION tests should stay as they are, I think. NB: is the buffer overrun we're trying to protect against caused by passing sd_wp_addr() a bad address? Maybe we should assert in sd_addr_to_wpnum() that the address is in range, as a defence. thanks -- PMM
Re: [PATCH v3 02/11] hw/sd/sdcard: Update coding style to make checkpatch.pl happy
On Fri, 5 Jun 2020 at 11:24, Philippe Mathieu-Daudé wrote: > > To make the next commit easier to review, clean this code first. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 24 > 1 file changed, 16 insertions(+), 8 deletions(-) Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 04/11] hw/sd/sdcard: Restrict Class 6 commands to SCSD cards
On Fri, 5 Jun 2020 at 11:23, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Only SCSD cards support Class 6 (Block Oriented Write Protection) > commands. > > "SD Specifications Part 1 Physical Layer Simplified Spec. v3.01" > > 4.3.14 Command Functional Difference in Card Capacity Types > > * Write Protected Group > > SDHC and SDXC do not support write-protected groups. Issuing > CMD28, CMD29 and CMD30 generates the ILLEGAL_COMMAND error. > > Signed-off-by: Philippe Mathieu-Daudé > --- Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 0/4] block: seriously improve savevm performance
* Denis V. Lunev (d...@openvz.org) wrote: > On 6/15/20 3:17 PM, Dr. David Alan Gilbert wrote: > > * Denis V. Lunev (d...@openvz.org) wrote: > >> This series do standard basic things: > >> - it creates intermediate buffer for all writes from QEMU migration code > >> to QCOW2 image, > >> - this buffer is sent to disk asynchronously, allowing several writes to > >> run in parallel. > >> > >> In general, migration code is fantastically inefficent (by observation), > >> buffers are not aligned and sent with arbitrary pieces, a lot of time > >> less than 100 bytes at a chunk, which results in read-modify-write > >> operations with non-cached operations. It should also be noted that all > >> operations are performed into unallocated image blocks, which also suffer > >> due to partial writes to such new clusters. > > It surprises me a little that you're not benefiting from the buffer > > internal to qemu-file.c > > > > Dave > There are a lot of problems with this buffer: > > Flushes to block driver state are performed in the abstract places, > pushing > Â a) small IO > Â b) non-aligned IO both to > Â Â Â Â Â Â 1) page size > Â Â Â Â Â Â 2) cluster size > It should also be noted that buffer in QEMU file is quite small and > all IO operations with it are synchronous. IO, like ethernet, wants > good queues. Yeh, for ethernet we immediately get the kernels buffer so it's not too bad; and I guess the async page writes are easier as well. Dave > The difference is on the table. > > Den > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v3 0/4] block: seriously improve savevm performance
On 6/15/20 3:17 PM, Dr. David Alan Gilbert wrote: > * Denis V. Lunev (d...@openvz.org) wrote: >> This series do standard basic things: >> - it creates intermediate buffer for all writes from QEMU migration code >> to QCOW2 image, >> - this buffer is sent to disk asynchronously, allowing several writes to >> run in parallel. >> >> In general, migration code is fantastically inefficient (by observation), >> buffers are not aligned and sent with arbitrary pieces, a lot of time >> less than 100 bytes at a chunk, which results in read-modify-write >> operations with non-cached operations. It should also be noted that all >> operations are performed into unallocated image blocks, which also suffer >> due to partial writes to such new clusters. > It surprises me a little that you're not benefiting from the buffer > internal to qemu-file.c > > Dave There are a lot of problems with this buffer: Flushes to block driver state are performed in the abstract places, pushing a) small IO b) non-aligned IO both to 1) page size 2) cluster size It should also be noted that buffer in QEMU file is quite small and all IO operations with it are synchronous. IO, like ethernet, wants good queues. The difference is on the table. Den
Re: [PATCH v3 0/4] block: seriously improve savevm performance
* Denis V. Lunev (d...@openvz.org) wrote: > This series do standard basic things: > - it creates intermediate buffer for all writes from QEMU migration code > to QCOW2 image, > - this buffer is sent to disk asynchronously, allowing several writes to > run in parallel. > > In general, migration code is fantastically inefficient (by observation), > buffers are not aligned and sent with arbitrary pieces, a lot of time > less than 100 bytes at a chunk, which results in read-modify-write > operations with non-cached operations. It should also be noted that all > operations are performed into unallocated image blocks, which also suffer > due to partial writes to such new clusters. It surprises me a little that you're not benefiting from the buffer internal to qemu-file.c Dave > This patch series is an implementation of idea discussed in the RFC > posted by Denis Plotnikov > https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg01925.html > Results with this series over NVME are better than original code > original rfc this > cached: 1.79s 2.38s 1.27s > non-cached: 3.29s 1.31s 0.81s > > Changes from v2: > - code moved from QCOW2 level to generic block level > - created bdrv_flush_vmstate helper to fix 022, 029 tests > - added recursive for bs->file in bdrv_co_flush_vmstate (fix 267) > - fixed blk_save_vmstate helper > - fixed coroutine wait as Vladimir suggested with waiting fixes from me > > Changes from v1: > - patchew warning fixed > - fixed validation that only 1 waiter is allowed in patch 1 > > Signed-off-by: Denis V. Lunev > CC: Kevin Wolf > CC: Max Reitz > CC: Stefan Hajnoczi > CC: Fam Zheng > CC: Juan Quintela > CC: "Dr. David Alan Gilbert" > CC: Vladimir Sementsov-Ogievskiy > CC: Denis Plotnikov > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH 1/4] migration/savevm: respect qemu_fclose() error code in save_snapshot()
* Denis V. Lunev (d...@openvz.org) wrote: > qemu_fclose() could return error, f.e. if bdrv_co_flush() will return > the error. > > This validation will become more important once we will start waiting of > asynchronous IO operations, started from bdrv_write_vmstate(), which are > coming soon. > > Signed-off-by: Denis V. Lunev > CC: Kevin Wolf > CC: Max Reitz > CC: Stefan Hajnoczi > CC: Fam Zheng > CC: Juan Quintela > CC: "Dr. David Alan Gilbert" > CC: Vladimir Sementsov-Ogievskiy > CC: Denis Plotnikov We check the return value in very few other places; I think in the migration case we do flushes and assume that if the flushes work we were OK; then most of the closes happen on error paths or after points we think we're done. Reviewed-by: Dr. David Alan Gilbert > --- > migration/savevm.c | 8 ++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/migration/savevm.c b/migration/savevm.c > index c00a6807d9..0ff5bb40ed 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -2628,7 +2628,7 @@ int save_snapshot(const char *name, Error **errp) > { > BlockDriverState *bs, *bs1; > QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1; > -int ret = -1; > +int ret = -1, ret2; > QEMUFile *f; > int saved_vm_running; > uint64_t vm_state_size; > @@ -2712,10 +2712,14 @@ int save_snapshot(const char *name, Error **errp) > } > ret = qemu_savevm_state(f, errp); > vm_state_size = qemu_ftell(f); > -qemu_fclose(f); > +ret2 = qemu_fclose(f); > if (ret < 0) { > goto the_end; > } > +if (ret2 < 0) { > +ret = ret2; > +goto the_end; > +} > > /* The bdrv_all_create_snapshot() call that follows acquires the > AioContext > * for itself. BDRV_POLL_WHILE() does not support nested locking because > -- > 2.17.1 > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v7 4/9] qcow2_format.py: Dump bitmap directory information
12.06.2020 03:04, Andrey Shinkevich wrote: Read and dump entries from the bitmap directory of QCOW2 image. It extends the output in the test case #291. Header extension: magic 0x23852875 (Bitmaps) ... Bitmap name bitmap-1 flag auto table size8 (bytes) bitmap_table_offset 0x9 bitmap_table_size 1 flags 0 type 1 granularity_bits 16 name_size 8 extra_data_size 0 Suggested-by: Kevin Wolf Signed-off-by: Andrey Shinkevich Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Hmm I don't remember me gave it.. --- tests/qemu-iotests/291.out | 52 ++ tests/qemu-iotests/qcow2_format.py | 75 ++ 2 files changed, 127 insertions(+) diff --git a/tests/qemu-iotests/291.out b/tests/qemu-iotests/291.out index ccfcdc5..d847419 100644 --- a/tests/qemu-iotests/291.out +++ b/tests/qemu-iotests/291.out @@ -33,6 +33,27 @@ reserved320 bitmap_directory_size 0x40 bitmap_directory_offset 0x51 +Bitmap name b1 +table size8 (bytes) +bitmap_table_offset 0x4e +bitmap_table_size 1 +flags 0 +type 1 +granularity_bits 19 +name_size 2 +extra_data_size 0 + +Bitmap name b2 +flag auto +table size8 (bytes) +bitmap_table_offset 0x50 +bitmap_table_size 1 +flags 2 both having flags and flag in the output looks strange. If you want good human look of flags field, you should implement a special formatter-class for it, like Flags64. 
Maybe, something like this: flags 0x3 (in_use auto) +type 1 +granularity_bits 16 +name_size 2 +extra_data_size 0 + === Bitmap preservation not possible to non-qcow2 === @@ -98,6 +119,37 @@ reserved320 bitmap_directory_size 0x60 bitmap_directory_offset 0x52 +Bitmap name b1 +table size8 (bytes) +bitmap_table_offset 0x47 +bitmap_table_size 1 +flags 0 +type 1 +granularity_bits 19 +name_size 2 +extra_data_size 0 + +Bitmap name b2 +flag auto +table size8 (bytes) +bitmap_table_offset 0x49 +bitmap_table_size 1 +flags 2 +type 1 +granularity_bits 16 +name_size 2 +extra_data_size 0 + +Bitmap name b0 +table size8 (bytes) +bitmap_table_offset 0x51 +bitmap_table_size 1 +flags 0 +type 1 +granularity_bits 16 +name_size 2 +extra_data_size 0 + === Check bitmap contents === diff --git a/tests/qemu-iotests/qcow2_format.py b/tests/qemu-iotests/qcow2_format.py index d4f..90eff92 100644 --- a/tests/qemu-iotests/qcow2_format.py +++ b/tests/qemu-iotests/qcow2_format.py @@ -103,6 +103,10 @@ class Qcow2Struct(metaclass=Qcow2StructMeta): print('{:<25} {}'.format(f[2], value_str)) +# seek relative to the current position in the file +FROM_CURRENT = 1 Firstly, I though that it's a global variable to control class behavior. Only than I understood that you just decided to use a named constant instead of just 1... So, I'd prefer to use just 1. + + class Qcow2BitmapExt(Qcow2Struct): fields = ( @@ -112,6 +116,73 @@ class Qcow2BitmapExt(Qcow2Struct): ('u64', '{:#x}', 'bitmap_directory_offset') ) +def read_bitmap_directory(self, fd): +self.bitmaps = [] +fd.seek(self.bitmap_directory_offset) +buf_size = struct.calcsize(Qcow2BitmapDirEntry.fmt) + +for n in range(self.nb_bitmaps): +buf = fd.read(buf_size) +dir_entry = Qcow2BitmapDirEntry(data=buf) Base class of Qcow2BitmapDirEntry can load from fd. We should definitely utilize this possibility, instead of writing it again. 
+fd.seek(dir_entry.extra_data_size, FROM_CURRENT) +bitmap_name = fd.read(dir_entry.name_size) +dir_entry.name = bitmap_name.decode('ascii') You should initialize object in its constructor, not by hand here. Actually, the code here should look like this: self.bitmap_directory = [Qcow2BitmapDirEntry(fd) for _ in range(self.nb_bitmaps)] OK, you may leave a loop, and even calculating of final alignment here, but real fields should be read and initialized in Qcow2BitmapDirEntry constructor +self.bitmaps.append(dir_entry) +entry_raw_size = dir_entry.bitmap_dir_entry_raw_size() +shift = ((
Re: [PATCH v7 3/9] qcow2_format.py: make printable data an extension class member
12.06.2020 03:04, Andrey Shinkevich wrote: Let us differ binary data type from string one for the extension data variable and keep the string as the QcowHeaderExtension class member. Keep my r-b, but I just want to note, that I missed, what is the actual aim of this change.. Signed-off-by: Andrey Shinkevich Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/qcow2_format.py | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/qcow2_format.py b/tests/qemu-iotests/qcow2_format.py index 0f65fd1..d4f 100644 --- a/tests/qemu-iotests/qcow2_format.py +++ b/tests/qemu-iotests/qcow2_format.py @@ -164,6 +164,13 @@ class QcowHeaderExtension(Qcow2Struct): self.data = fd.read(padded) assert self.data is not None +data_str = self.data[:self.length] +if all(c in string.printable.encode('ascii') for c in data_str): +data_str = f"'{ data_str.decode('ascii') }'" +else: +data_str = '' +self.data_str = data_str + if self.magic == QCOW2_EXT_MAGIC_BITMAPS: self.obj = Qcow2BitmapExt(data=self.data) else: @@ -173,12 +180,7 @@ class QcowHeaderExtension(Qcow2Struct): super().dump() if self.obj is None: -data = self.data[:self.length] -if all(c in string.printable.encode('ascii') for c in data): -data = f"'{ data.decode('ascii') }'" -else: -data = '' -print(f'{"data":<25} {data}') +print(f'{"data":<25} {self.data_str}') else: self.obj.dump() -- Best regards, Vladimir
Re: [PATCH v7 2/9] qcow2: Fix capitalization of header extension constant.
12.06.2020 03:04, Andrey Shinkevich wrote: Make the capitalization of the hexadecimal numbers consistent for the QCOW2 header extension constants in docs/interop/qcow2.txt. Suggested-by: Eric Blake Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH v7 1/9] iotests: Fix for magic hexadecimal output in 291
12.06.2020 03:04, Andrey Shinkevich wrote: This issue was introduced in the earlier patch: "qcow2_format: refactor QcowHeaderExtension as a subclass of Qcow2Struct". Signed-off-by: Andrey Shinkevich This change was squashed to original commit --- tests/qemu-iotests/291.out | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/qemu-iotests/291.out b/tests/qemu-iotests/291.out index 1d4f9cd..ccfcdc5 100644 --- a/tests/qemu-iotests/291.out +++ b/tests/qemu-iotests/291.out @@ -16,17 +16,17 @@ wrote 1048576/1048576 bytes at offset 2097152 1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) Check resulting qcow2 header extensions: Header extension: -magic 3799591626 (Backing format) +magic 0xe2792aca (Backing format) length5 data 'qcow2' Header extension: -magic 1745090647 (Feature table) +magic 0x6803f857 (Feature table) length336 data Header extension: -magic 595929205 (Bitmaps) +magic 0x23852875 (Bitmaps) length24 nb_bitmaps2 reserved320 @@ -86,12 +86,12 @@ Format specific information: corrupt: false Check resulting qcow2 header extensions: Header extension: -magic 1745090647 (Feature table) +magic 0x6803f857 (Feature table) length336 data Header extension: -magic 595929205 (Bitmaps) +magic 0x23852875 (Bitmaps) length24 nb_bitmaps3 reserved320 -- Best regards, Vladimir
Re: [PATCH v7 0/9] iotests: Dump QCOW2 dirty bitmaps metadata
12.06.2020 03:04, Andrey Shinkevich wrote: Note: based on the Vladimir's series [v5 00/13] iotests: Dump QCOW2 dirty bitmaps metadata It's merged to master, so, based on master. (except for 01, which is not needed, thanks to Eric) Add dirty bitmap information to QCOW2 metadata dump in the qcow2_format.py. v7: 01: Fix for magic hexadecimal output in 291 02: Bitmap table output format improvement. 03: Incremental change in the test 291 output. v6: 01: Fixing capitalization of header extension constant. (Suggested by Eric) 02: The cluster size global variable discarded and passed as a parameter. 03: Re-based to Vladimir's v5 series. 04: The code of passing qcow2.py JSON format key moved to separate patch. 05: Making dict(s) for dumping in JSON format was substituted with a copy of __dict__. v5: The Vladimir's preliminary series v4: The Vladimir's preliminary series Andrey Shinkevich (9): iotests: Fix for magic hexadecimal output in 291 qcow2: Fix capitalization of header extension constant. qcow2_format.py: make printable data an extension class member qcow2_format.py: Dump bitmap directory information qcow2_format.py: pass cluster size to substructures qcow2_format.py: Dump bitmap table serialized entries qcow2.py: Introduce '-j' key to dump in JSON format qcow2_format.py: collect fields to dump in JSON format qcow2_format.py: support dumping metadata in JSON format block/qcow2.c | 2 +- docs/interop/qcow2.txt | 2 +- tests/qemu-iotests/291.out | 112 ++- tests/qemu-iotests/qcow2.py| 20 +++- tests/qemu-iotests/qcow2_format.py | 217 ++--- 5 files changed, 327 insertions(+), 26 deletions(-) -- Best regards, Vladimir
Re: [PATCH 2/4] block/aio_task: allow start/wait task from any coroutine
15.06.2020 10:47, Vladimir Sementsov-Ogievskiy wrote: 11.06.2020 20:11, Denis V. Lunev wrote: From: Vladimir Sementsov-Ogievskiy Currently, aio task pool assumes that there is a main coroutine, which creates tasks and wait for them. Let's remove the restriction by using CoQueue. Code becomes clearer, interface more obvious. Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/aio_task.c | 21 ++--- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/block/aio_task.c b/block/aio_task.c index 88989fa248..cf62e5c58b 100644 --- a/block/aio_task.c +++ b/block/aio_task.c @@ -27,11 +27,10 @@ #include "block/aio_task.h" struct AioTaskPool { - Coroutine *main_co; int status; int max_busy_tasks; int busy_tasks; - bool waiting; + CoQueue waiters; }; static void coroutine_fn aio_task_co(void *opaque) @@ -52,31 +51,23 @@ static void coroutine_fn aio_task_co(void *opaque) g_free(task); - if (pool->waiting) { - pool->waiting = false; - aio_co_wake(pool->main_co); - } + qemu_co_queue_restart_all(&pool->waiters); } void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool) { assert(pool->busy_tasks > 0); - assert(qemu_coroutine_self() == pool->main_co); - pool->waiting = true; - qemu_coroutine_yield(); + qemu_co_queue_wait(&pool->waiters, NULL); - assert(!pool->waiting); assert(pool->busy_tasks < pool->max_busy_tasks); As we wake up several coroutines now, I'm afraid this assertion may start to fire. And aio_task_pool_wait_one() becomes useless as a public API (still, it's used only locally, so we can make it static). I'll send updated patch after reviewing the rest of the series. Hm, OK, we have two kinds of waiters: waiting for a slot and for all tasks to finish. 
So, either we need two queues, or do like this patch (one queue, but wake-up all waiters, for them to check does their condition satisfied or not). I'm OK with this patch with the following squashed-in: diff --git a/include/block/aio_task.h b/include/block/aio_task.h index 50bc1e1817..50b1c036c5 100644 --- a/include/block/aio_task.h +++ b/include/block/aio_task.h @@ -48,7 +48,6 @@ bool aio_task_pool_empty(AioTaskPool *pool); void coroutine_fn aio_task_pool_start_task(AioTaskPool *pool, AioTask *task); void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool); -void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool); void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool); #endif /* BLOCK_AIO_TASK_H */ diff --git a/block/aio_task.c b/block/aio_task.c index cf62e5c58b..7ba15ff41f 100644 --- a/block/aio_task.c +++ b/block/aio_task.c @@ -54,26 +54,17 @@ static void coroutine_fn aio_task_co(void *opaque) qemu_co_queue_restart_all(&pool->waiters); } -void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool) -{ -assert(pool->busy_tasks > 0); - -qemu_co_queue_wait(&pool->waiters, NULL); - -assert(pool->busy_tasks < pool->max_busy_tasks); -} - void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool) { while (pool->busy_tasks >= pool->max_busy_tasks) { -aio_task_pool_wait_one(pool); +qemu_co_queue_wait(&pool->waiters, NULL); } } void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool) { while (pool->busy_tasks > 0) { -aio_task_pool_wait_one(pool); +qemu_co_queue_wait(&pool->waiters, NULL); } } -- Best regards, Vladimir
Re: [PATCH 4/4] block/io: improve savevm performance
11.06.2020 20:11, Denis V. Lunev wrote: This patch does 2 standard basic things: - it creates intermediate buffer for all writes from QEMU migration code to block driver, - this buffer is sent to disk asynchronously, allowing several writes to run in parallel. Thus bdrv_vmstate_write() is becoming asynchronous. All pending operations completion are performed in newly invented bdrv_flush_vmstate(). In general, migration code is fantastically inefficent (by observation), buffers are not aligned and sent with arbitrary pieces, a lot of time less than 100 bytes at a chunk, which results in read-modify-write operations if target file descriptor is opened with O_DIRECT. It should also be noted that all operations are performed into unallocated image blocks, which also suffer due to partial writes to such new clusters even on cached file descriptors. Snapshot creation time (2 GB Fedora-31 VM running over NVME storage): original fixed cached: 1.79s 1.27s non-cached: 3.29s 0.81s The difference over HDD would be more significant :) Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. 
David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/io.c| 121 +- include/block/block_int.h | 8 +++ 2 files changed, 127 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index fbf352f39d..698f1eef76 100644 --- a/block/io.c +++ b/block/io.c @@ -26,6 +26,7 @@ #include "trace.h" #include "sysemu/block-backend.h" #include "block/aio-wait.h" +#include "block/aio_task.h" #include "block/blockjob.h" #include "block/blockjob_int.h" #include "block/block_int.h" @@ -2633,6 +2634,102 @@ typedef struct BdrvVmstateCo { int ret; } BdrvVmstateCo; +typedef struct BdrvVMStateTask { +AioTask task; + +BlockDriverState *bs; +int64_t offset; +void *buf; +size_t bytes; +} BdrvVMStateTask; + +typedef struct BdrvSaveVMState { +AioTaskPool *pool; +BdrvVMStateTask *t; +} BdrvSaveVMState; + + +static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task) +{ +int err = 0; +BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task); + +if (t->bytes != 0) { +QEMUIOVector local_qiov; +qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes); Consider special oneliner for this case: QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, t->buf, t->bytes); + +bdrv_inc_in_flight(t->bs); +err = t->bs->drv->bdrv_save_vmstate(t->bs, &local_qiov, t->offset); +bdrv_dec_in_flight(t->bs); +} + +qemu_vfree(t->buf); +return err; +} + +static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs, + int64_t pos, size_t size) +{ +BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1); + +*t = (BdrvVMStateTask) { +.task.func = bdrv_co_vmstate_save_task_entry, +.buf = qemu_blockalign(bs, size), +.offset = pos, +.bs = bs, +}; + +return t; +} + +static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) +{ +BdrvSaveVMState *state = bs->savevm_state; +BdrvVMStateTask *t; +size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB); +size_t to_copy; +size_t off; + +if (state == NULL) { +state = g_new(BdrvSaveVMState, 1); +*state = 
(BdrvSaveVMState) { +.pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX), +.t = bdrv_vmstate_task_create(bs, pos, buf_size), +}; + +bs->savevm_state = state; +} + +if (aio_task_pool_status(state->pool) < 0) { +return aio_task_pool_status(state->pool); So, on failure, we still need flush, to cleanup the state. I think better is to cleanup it immediately here (goto fail, etc.). Aha, actually in blk_save_vmstate(), you don't do bdrv_flush_vmstate() if bdrv_save_vmstate() failed. +} + +t = state->t; +if (t->offset + t->bytes != pos) { +/* Normally this branch is not reachable from migration */ Aha, like a cache-miss. OK +return bs->drv->bdrv_save_vmstate(bs, qiov, pos); +} + +off = 0; +while (1) { +to_copy = MIN(qiov->size - off, buf_size - t->bytes); +qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy); +t->bytes += to_copy; +if (t->bytes < buf_size) { +return qiov->size; +} + +aio_task_pool_start_task(state->pool, &t->task); + +pos += to_copy; +off += to_copy; +state->t = t = bdrv_vmstate_task_create(bs, pos, buf_size); +} + +return qiov->size; this is unreachable actually. So, I'd drop it or do "break" in a loop instead of return. +} + static int coroutine_fn bdrv_co_rw_vmstate(Bloc
Re: [PATCH 3/4] block, migration: add bdrv_flush_vmstate helper
11.06.2020 20:11, Denis V. Lunev wrote: Right now bdrv_fclose() is just calling bdrv_flush(). The problem is that migration code is working inefficently from black layer terms and are frequently called for very small pieces of not properly aligned data. Block layer is capable to work this way, but this is very slow. This patch is a preparation for the introduction of the intermediate buffer at block driver state. It would be beneficial to separate conventional bdrv_flush() from closing QEMU file from migration code. The patch also forces bdrv_flush_vmstate() operation inside synchronous blk_save_vmstate() operation. This helper is used from qemu-io only. Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/block-backend.c | 6 +- block/io.c| 39 +++ include/block/block.h | 1 + migration/savevm.c| 3 +++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/block/block-backend.c b/block/block-backend.c index 9342a475cb..2107ace699 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2177,7 +2177,7 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, int64_t pos, int size) { -int ret; +int ret, ret2; if (!blk_is_available(blk)) { return -ENOMEDIUM; @@ -2187,6 +2187,10 @@ int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, if (ret < 0) { return ret; } +ret2 = bdrv_flush_vmstate(blk_bs(blk)); +if (ret2 < 0) { +return ret; +} if (ret == size && !blk->enable_write_cache) { ret = bdrv_flush(blk_bs(blk)); diff --git a/block/io.c b/block/io.c index 121ce17a49..fbf352f39d 100644 --- a/block/io.c +++ b/block/io.c @@ -2725,6 +2725,45 @@ int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) return bdrv_rw_vmstate(bs, qiov, pos, true); } + +typedef struct FlushVMStateCo { +BlockDriverState *bs; +int ret; +} 
FlushVMStateCo; + +static int coroutine_fn bdrv_co_flush_vmstate(BlockDriverState *bs) +{ +return 0; +} + +static void coroutine_fn bdrv_flush_vmstate_co_entry(void *opaque) +{ +FlushVMStateCo *rwco = opaque; + +rwco->ret = bdrv_co_flush_vmstate(rwco->bs); +aio_wait_kick(); +} + +int bdrv_flush_vmstate(BlockDriverState *bs) +{ +Coroutine *co; +FlushVMStateCo flush_co = { +.bs = bs, +.ret = NOT_DONE, +}; + +if (qemu_in_coroutine()) { +/* Fast-path if already in coroutine context */ +bdrv_flush_vmstate_co_entry(&flush_co); +} else { +co = qemu_coroutine_create(bdrv_flush_vmstate_co_entry, &flush_co); +bdrv_coroutine_enter(bs, co); +BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); +} + +return flush_co.ret; +} In block/io.c, since 7d2410cea154bf, these coroutine wrappers are using bdrv_run_co() instead. I hope, it's an intermediate state, and my series with auto-generated wrappers will be applied, still, now we should use bdrv_run_co()-based approach for consistency. Otherwise, patch looks OK for me, just add a new interface, doing nothing for now. Still, would be good to add a comment in block.h, that bdrv_flush_vmsate is needed after bdrv_save_vmstate. 
+ /**/ /* async I/Os */ diff --git a/include/block/block.h b/include/block/block.h index 25e299605e..024525b87d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -572,6 +572,7 @@ int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size); +int bdrv_flush_vmstate(BlockDriverState *bs); void bdrv_img_create(const char *filename, const char *fmt, const char *base_filename, const char *base_fmt, diff --git a/migration/savevm.c b/migration/savevm.c index 0ff5bb40ed..9698c909d7 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -150,6 +150,9 @@ static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, static int bdrv_fclose(void *opaque, Error **errp) { +int err = bdrv_flush_vmstate(opaque); +if (err < 0) +return err; return bdrv_flush(opaque); } -- Best regards, Vladimir
Re: [PATCH 2/4] block/aio_task: allow start/wait task from any coroutine
11.06.2020 20:11, Denis V. Lunev wrote: From: Vladimir Sementsov-Ogievskiy Currently, aio task pool assumes that there is a main coroutine, which creates tasks and wait for them. Let's remove the restriction by using CoQueue. Code becomes clearer, interface more obvious. Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/aio_task.c | 21 ++--- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/block/aio_task.c b/block/aio_task.c index 88989fa248..cf62e5c58b 100644 --- a/block/aio_task.c +++ b/block/aio_task.c @@ -27,11 +27,10 @@ #include "block/aio_task.h" struct AioTaskPool { -Coroutine *main_co; int status; int max_busy_tasks; int busy_tasks; -bool waiting; +CoQueue waiters; }; static void coroutine_fn aio_task_co(void *opaque) @@ -52,31 +51,23 @@ static void coroutine_fn aio_task_co(void *opaque) g_free(task); -if (pool->waiting) { -pool->waiting = false; -aio_co_wake(pool->main_co); -} +qemu_co_queue_restart_all(&pool->waiters); } void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool) { assert(pool->busy_tasks > 0); -assert(qemu_coroutine_self() == pool->main_co); -pool->waiting = true; -qemu_coroutine_yield(); +qemu_co_queue_wait(&pool->waiters, NULL); -assert(!pool->waiting); assert(pool->busy_tasks < pool->max_busy_tasks); As we wake up several coroutines now, I'm afraid this assertion may start to fire. And aio_task_pool_wait_one() becomes useless as a public API (still, it's used only locally, so we can make it static). I'll send updated patch after reviewing the rest of the series. 
} void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool) { -if (pool->busy_tasks < pool->max_busy_tasks) { -return; +while (pool->busy_tasks >= pool->max_busy_tasks) { +aio_task_pool_wait_one(pool); } - -aio_task_pool_wait_one(pool); } void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool) @@ -98,8 +89,8 @@ AioTaskPool *coroutine_fn aio_task_pool_new(int max_busy_tasks) { AioTaskPool *pool = g_new0(AioTaskPool, 1); -pool->main_co = qemu_coroutine_self(); pool->max_busy_tasks = max_busy_tasks; +qemu_co_queue_init(&pool->waiters); return pool; } -- Best regards, Vladimir
Re: [PATCH 1/4] migration/savevm: respect qemu_fclose() error code in save_snapshot()
11.06.2020 20:11, Denis V. Lunev wrote: qemu_fclose() could return error, f.e. if bdrv_co_flush() will return the error. This validation will become more important once we will start waiting of asynchronous IO operations, started from bdrv_write_vmstate(), which are coming soon. Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH] hw/scsi/megasas: Fix possible out-of-bounds array access in tracepoints
On 6/15/20 9:26 AM, Thomas Huth wrote: > Some tracepoints in megasas.c use a guest-controlled value as an index > into the mfi_frame_desc[] array. Thus a malicious guest could cause an > out-of-bounds error here. Fortunately, the impact is very low since this > can only happen when the corresponding tracepoints have been enabled > before, but the problem should be fixed anyway with a proper check. > > Buglink: https://bugs.launchpad.net/qemu/+bug/1882065 > Signed-off-by: Thomas Huth > --- > hw/scsi/megasas.c | 36 +++- > 1 file changed, 23 insertions(+), 13 deletions(-) > > diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c > index af18c88b65..aa930226f8 100644 > --- a/hw/scsi/megasas.c > +++ b/hw/scsi/megasas.c > @@ -54,10 +54,6 @@ > #define MEGASAS_FLAG_USE_QUEUE64 1 > #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) > > -static const char *mfi_frame_desc[] = { > -"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > -"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop"}; > - > typedef struct MegasasCmd { > uint32_t index; > uint16_t flags; > @@ -183,6 +179,20 @@ static void megasas_frame_set_scsi_status(MegasasState > *s, > stb_pci_dma(pci, frame + offsetof(struct mfi_frame_header, scsi_status), > v); > } > > +static inline const char *mfi_frame_desc(unsigned int cmd) > +{ > +static const char *mfi_frame_descs[] = { > +"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > +"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop" > +}; > + > +if (cmd < ARRAY_SIZE(mfi_frame_descs)) { > +return mfi_frame_descs[cmd]; > +} > + > +return "Unknown"; > +} > + > /* > * Context is considered opaque, but the HBA firmware is running > * in little endian mode. So convert it to little endian, too. 
> @@ -1670,25 +1680,25 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > if (is_logical) { > if (target_id >= MFI_MAX_LD || lun_id != 0) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > } > sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); > > cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); > -trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, > +trace_megasas_handle_scsi(mfi_frame_desc(frame_cmd), is_logical, >target_id, lun_id, sdev, cmd->iov_size); > > if (!sdev || (megasas_is_jbod(s) && is_logical)) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], is_logical, > +mfi_frame_desc(frame_cmd), is_logical, > target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > @@ -1706,7 +1716,7 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); > if (!cmd->req) { > trace_megasas_scsi_req_alloc_failed( > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); > cmd->frame->header.scsi_status = BUSY; > s->event_count++; > @@ -1751,17 +1761,17 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) > } > > trace_megasas_handle_io(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id, > +mfi_frame_desc(frame_cmd), target_id, lun_id, > (unsigned long)lba_start, (unsigned > long)lba_count); > if (!sdev) { > 
trace_megasas_io_target_not_present(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); > +mfi_frame_desc(frame_cmd), 1, target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > s->event_count++; > @@ -1781,7 +1791,7 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) >
Re: [PATCH v3 00/11] hw/sd/sdcard: Fix CVE-2020-13253 & cleanups
On 6/5/20 12:22 PM, Philippe Mathieu-Daudé wrote: > Patches 2 & 3 fix CVE-2020-13253. Ping for the CVE fix?... > The rest are (accumulated) cleanups. > > Supersedes: <20200604182502.24228-1-f4...@amsat.org> > > Philippe Mathieu-Daudé (11): > MAINTAINERS: Cc qemu-block mailing list > hw/sd/sdcard: Update coding style to make checkpatch.pl happy > hw/sd/sdcard: Do not switch to ReceivingData if address is invalid > hw/sd/sdcard: Restrict Class 6 commands to SCSD cards > hw/sd/sdcard: Update the SDState documentation > hw/sd/sdcard: Simplify cmd_valid_while_locked() > hw/sd/sdcard: Constify sd_crc*()'s message argument > hw/sd/sdcard: Make iolen unsigned > hw/sd/sdcard: Correctly display the command name in trace events > hw/sd/sdcard: Display offset in read/write_data() trace events > hw/sd/sdcard: Simplify realize() a bit > > hw/sd/sd.c | 122 + > MAINTAINERS | 1 + > hw/sd/trace-events | 4 +- > 3 files changed, 83 insertions(+), 44 deletions(-) >
[PATCH] hw/scsi/megasas: Fix possible out-of-bounds array access in tracepoints
Some tracepoints in megasas.c use a guest-controlled value as an index into the mfi_frame_desc[] array. Thus a malicious guest could cause an out-of-bounds error here. Fortunately, the impact is very low since this can only happen when the corresponding tracepoints have been enabled before, but the problem should be fixed anyway with a proper check. Buglink: https://bugs.launchpad.net/qemu/+bug/1882065 Signed-off-by: Thomas Huth --- hw/scsi/megasas.c | 36 +++- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index af18c88b65..aa930226f8 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -54,10 +54,6 @@ #define MEGASAS_FLAG_USE_QUEUE64 1 #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) -static const char *mfi_frame_desc[] = { -"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", -"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop"}; - typedef struct MegasasCmd { uint32_t index; uint16_t flags; @@ -183,6 +179,20 @@ static void megasas_frame_set_scsi_status(MegasasState *s, stb_pci_dma(pci, frame + offsetof(struct mfi_frame_header, scsi_status), v); } +static inline const char *mfi_frame_desc(unsigned int cmd) +{ +static const char *mfi_frame_descs[] = { +"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", +"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop" +}; + +if (cmd < ARRAY_SIZE(mfi_frame_descs)) { +return mfi_frame_descs[cmd]; +} + +return "Unknown"; +} + /* * Context is considered opaque, but the HBA firmware is running * in little endian mode. So convert it to little endian, too.
@@ -1670,25 +1680,25 @@ static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, if (is_logical) { if (target_id >= MFI_MAX_LD || lun_id != 0) { trace_megasas_scsi_target_not_present( -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } } sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); -trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, +trace_megasas_handle_scsi(mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id, sdev, cmd->iov_size); if (!sdev || (megasas_is_jbod(s) && is_logical)) { trace_megasas_scsi_target_not_present( -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } if (cdb_len > 16) { trace_megasas_scsi_invalid_cdb_len( -mfi_frame_desc[frame_cmd], is_logical, +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id, cdb_len); megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); cmd->frame->header.scsi_status = CHECK_CONDITION; @@ -1706,7 +1716,7 @@ static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); if (!cmd->req) { trace_megasas_scsi_req_alloc_failed( -mfi_frame_desc[frame_cmd], target_id, lun_id); +mfi_frame_desc(frame_cmd), target_id, lun_id); megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); cmd->frame->header.scsi_status = BUSY; s->event_count++; @@ -1751,17 +1761,17 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd) } trace_megasas_handle_io(cmd->index, -mfi_frame_desc[frame_cmd], target_id, lun_id, +mfi_frame_desc(frame_cmd), target_id, lun_id, (unsigned long)lba_start, (unsigned long)lba_count); if (!sdev) { trace_megasas_io_target_not_present(cmd->index, -mfi_frame_desc[frame_cmd], target_id, lun_id); +mfi_frame_desc(frame_cmd), target_id, lun_id);
return MFI_STAT_DEVICE_NOT_FOUND; } if (cdb_len > 16) { trace_megasas_scsi_invalid_cdb_len( -mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); +mfi_frame_desc(frame_cmd), 1, target_id, lun_id, cdb_len); megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); cmd->frame->header.scsi_status = CHECK_CONDITION; s->event_count++; @@ -1781,7 +1791,7 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd) lun_id, cdb, cmd); if (!cmd->req) { trace_megasas_scsi_req_alloc_failed( -mfi_frame_desc[frame_cmd], target_id, lun_id); +mfi_frame_desc(frame_cmd), target_id, lun_id); megasas_write_sense(cmd, SENSE_C