[PATCH 18/18] hw/block/nvme: Document zoned parameters in usage text
Added brief descriptions of the new driver properties now available to users to configure features of Zoned Namespace Command Set in the driver. This patch is for documentation only, no functionality change. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 62 +++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 1ecbc31272..b765475bb0 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -9,7 +9,7 @@ */ /** - * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e + * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e * * http://www.nvmexpress.org/resources/ */ @@ -20,7 +20,8 @@ * -device nvme,drive=,serial=,id=, \ * cmb_size_mb=, \ * [pmrdev=,] \ - * max_ioqpairs= + * max_ioqpairs= \ + * zoned= * * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. @@ -32,6 +33,63 @@ * For example: * -object memory-backend-file,id=,share=on,mem-path=, \ * size= -device nvme,...,pmrdev= + * + * Setting "zoned" to true makes the driver emulate zoned namespaces. + * In this case, the following options are available to configure zoned + * operation: + * zone_size= + * + * zone_capacity= + * + * zone_file= + * Zone metadata file, if specified, allows zone information + * to be persistent across shutdowns and restarts. + * + * zone_descr_ext_size= + * This value needs to be specified in 64B units. If it is zero, + * namespace(s) will not support zone descriptor extensions. + * + * max_active= + * + * max_open= + * + * reset_rcmnd_delay= + * The amount of time that passes between the moment when a zone + * enters Full state and when Reset Zone Recommended attribute + * is set for that zone. + * + * reset_rcmnd_limit= + * If this value is zero (default), RZR attribute is not set for + * any zones. 
+ * + * finish_rcmnd_delay= + * The amount of time that passes between the moment when a zone + * enters an Open or Closed state and when Finish Zone Recommended + * attribute is set for that zone. + * + * finish_rcmnd_limit= + * If this value is zero (default), FZR attribute is not set for + * any zones. + * + * zamds= + * The maximum I/O size that can be supported by Zone Append + * command. Since internally this value is maintained as + * ZAMDS = log2( / ), some + * values assigned to this property may be rounded down and + * result in a lower maximum ZA data size being in effect. + * + * zone_async_events= + * Enable sending Zone Descriptor Changed AENs to the host. + * + * offline_zones= + * + * rdonly_zones= + * + * cross_zone_read= + * + * fill_pattern= + * Byte pattern to return for any portions of unwritten data + * during read. */ #include "qemu/osdep.h" -- 2.21.0
[PATCH 12/18] hw/block/nvme: Simulate Zone Active excursions
Added a Boolean flag to turn on simulation of Zone Active Excursions. If the flag, "active_excursions", is set to true, the driver will try to finish one of the currently open zone if max active zones limit is going to get exceeded. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 24 +++- hw/block/nvme.h | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 3e51837a63..d9110b39d3 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -539,6 +539,26 @@ static void nvme_auto_transition_zone(NvmeCtrl *n, NvmeNamespace *ns, { NvmeZone *zone; +if (n->params.active_excursions && adding_active && +n->params.max_active_zones && +ns->nr_active_zones == n->params.max_active_zones) { +zone = nvme_peek_zone_head(ns, ns->closed_zones); +if (zone) { +/* + * The namespace is at the limit of active zones. + * Try to finish one of the currently active zones + * to make the needed active zone resource available. + */ +nvme_aor_dec_active(n, ns); +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_FULL); +zone->d.za &= ~(NVME_ZA_FINISH_RECOMMENDED | +NVME_ZA_RESET_RECOMMENDED); +zone->d.za |= NVME_ZA_FINISHED_BY_CTLR; +zone->tstamp = 0; +trace_pci_nvme_zone_finished_by_controller(zone->d.zslba); +} +} + if (implicit && n->params.max_open_zones && ns->nr_open_zones == n->params.max_open_zones) { zone = nvme_remove_zone_head(n, ns, ns->imp_open_zones); @@ -2630,7 +2650,7 @@ static int nvme_zoned_init_ns(NvmeCtrl *n, NvmeNamespace *ns, int lba_index, /* MAR/MOR are zeroes-based, 0x means no limit */ ns->id_ns_zoned->mar = cpu_to_le32(n->params.max_active_zones - 1); ns->id_ns_zoned->mor = cpu_to_le32(n->params.max_open_zones - 1); -ns->id_ns_zoned->zoc = 0; +ns->id_ns_zoned->zoc = cpu_to_le16(n->params.active_excursions ? 0x2 : 0); ns->id_ns_zoned->ozcs = n->params.cross_zone_read ? 
0x01 : 0x00; ns->id_ns_zoned->lbafe[lba_index].zsze = cpu_to_le64(n->params.zone_size); @@ -2975,6 +2995,8 @@ static Property nvme_props[] = { DEFINE_PROP_INT32("max_active", NvmeCtrl, params.max_active_zones, 0), DEFINE_PROP_INT32("max_open", NvmeCtrl, params.max_open_zones, 0), DEFINE_PROP_BOOL("cross_zone_read", NvmeCtrl, params.cross_zone_read, true), +DEFINE_PROP_BOOL("active_excursions", NvmeCtrl, params.active_excursions, + false), DEFINE_PROP_UINT8("fill_pattern", NvmeCtrl, params.fill_pattern, 0), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 75bbad1509..cad154a21f 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -14,6 +14,7 @@ typedef struct NvmeParams { boolzoned; boolcross_zone_read; +boolactive_excursions; uint8_t fill_pattern; uint32_tzamds_bs; uint64_tzone_size; -- 2.21.0
[PATCH 11/18] hw/block/nvme: Introduce max active and open zone limits
Added two module properties, "max_active" and "max_open" to control the maximum number of zones that can be active or open. Once these variables are set to non-default values, the driver checks these limits during I/O and returns Too Many Active or Too Many Open command status if they are exceeded. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 183 +++- hw/block/nvme.h | 4 ++ 2 files changed, 185 insertions(+), 2 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 7bf452d597..3e51837a63 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -119,6 +119,87 @@ static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, zone->prev = zone->next = 0; } +/* + * Take the first zone out from a list, return NULL if the list is empty. + */ +static NvmeZone *nvme_remove_zone_head(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZoneList *zl) +{ +NvmeZone *zone = nvme_peek_zone_head(ns, zl); + +if (zone) { +--zl->size; +if (zl->size == 0) { +zl->head = NVME_ZONE_LIST_NIL; +zl->tail = NVME_ZONE_LIST_NIL; +} else { +zl->head = zone->next; +ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; +} +zone->prev = zone->next = 0; +} + +return zone; +} + +/* + * Check if we can open a zone without exceeding open/active limits. + * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5). 
+ */ +static int nvme_aor_check(NvmeCtrl *n, NvmeNamespace *ns, + uint32_t act, uint32_t opn) +{ +if (n->params.max_active_zones != 0 && +ns->nr_active_zones + act > n->params.max_active_zones) { +trace_pci_nvme_err_insuff_active_res(n->params.max_active_zones); +return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR; +} +if (n->params.max_open_zones != 0 && +ns->nr_open_zones + opn > n->params.max_open_zones) { +trace_pci_nvme_err_insuff_open_res(n->params.max_open_zones); +return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR; +} + +return NVME_SUCCESS; +} + +static inline void nvme_aor_inc_open(NvmeCtrl *n, NvmeNamespace *ns) +{ +assert(ns->nr_open_zones >= 0); +if (n->params.max_open_zones) { +ns->nr_open_zones++; +assert(ns->nr_open_zones <= n->params.max_open_zones); +} +} + +static inline void nvme_aor_dec_open(NvmeCtrl *n, NvmeNamespace *ns) +{ +if (n->params.max_open_zones) { +assert(ns->nr_open_zones > 0); +ns->nr_open_zones--; +} +assert(ns->nr_open_zones >= 0); +} + +static inline void nvme_aor_inc_active(NvmeCtrl *n, NvmeNamespace *ns) +{ +assert(ns->nr_active_zones >= 0); +if (n->params.max_active_zones) { +ns->nr_active_zones++; +assert(ns->nr_active_zones <= n->params.max_active_zones); +} +} + +static inline void nvme_aor_dec_active(NvmeCtrl *n, NvmeNamespace *ns) +{ +if (n->params.max_active_zones) { +assert(ns->nr_active_zones > 0); +ns->nr_active_zones--; +assert(ns->nr_active_zones >= ns->nr_open_zones); +} +assert(ns->nr_active_zones >= 0); +} + static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint8_t state) { @@ -453,6 +534,24 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req) timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); } +static void nvme_auto_transition_zone(NvmeCtrl *n, NvmeNamespace *ns, +bool implicit, bool adding_active) +{ +NvmeZone *zone; + +if (implicit && n->params.max_open_zones && +ns->nr_open_zones == n->params.max_open_zones) { +zone = nvme_remove_zone_head(n, ns, 
ns->imp_open_zones); +if (zone) { +/* + * Automatically close this implicitly open zone. + */ +nvme_aor_dec_open(n, ns); +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_CLOSED); +} +} +} + static uint16_t nvme_check_zone_write(NvmeZone *zone, uint64_t slba, uint32_t nlb) { @@ -530,6 +629,23 @@ static uint16_t nvme_check_zone_read(NvmeCtrl *n, NvmeZone *zone, uint64_t slba, return status; } +static uint16_t nvme_auto_open_zone(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone) +{ +uint16_t status = NVME_SUCCESS; +uint8_t zs = nvme_get_zone_state(zone); + +if (zs == NVME_ZONE_STATE_EMPTY) { +nvme_auto_transition_zone(n, ns, true, true); +status = nvme_aor_check(n, ns, 1, 1); +} else if (zs == NVME_ZONE_STATE_CLOSED) { +nvme_auto_transition_zone(n, ns, true, false); +status = nvme_aor_check(n, ns, 0, 1); +} + +return status; +} + static uint64_t nvme_finalize_zone_write(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint32_t nlb) { @@ -542,7 +658,11 @@ static uint64_t nvme_finalize_zone_write(NvmeCtrl *n, NvmeNamespace *ns, switch (zs) { case NVM
[PATCH 15/18] hw/block/nvme: Support Zone Descriptor Extensions
Zone Descriptor Extension is a label that can be assigned to a zone. It can be set to an Empty zone and it stays assigned until the zone is reset. This commit adds a new optional property, "zone_descr_ext_size", to the driver. Its value must be a multiple of 64 bytes. If this value is non-zero, it becomes possible to assign extensions of that size to any Empty zones. The default value for this property is 0, therefore setting extensions is disabled by default. Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 76 ++--- hw/block/nvme.h | 8 ++ 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 8dfb568173..16780d5116 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -1359,6 +1359,26 @@ static bool nvme_cond_offline_all(uint8_t state) return state == NVME_ZONE_STATE_READ_ONLY; } +static uint16_t nvme_set_zd_ext(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, uint8_t state) +{ +uint16_t status; + +if (state == NVME_ZONE_STATE_EMPTY) { +nvme_auto_transition_zone(n, ns, false, true); +status = nvme_aor_check(n, ns, 1, 0); +if (status != NVME_SUCCESS) { +return status; +} +nvme_aor_inc_active(n, ns); +zone->d.za |= NVME_ZA_ZD_EXT_VALID; +nvme_assign_zone_state(n, ns, zone, NVME_ZONE_STATE_CLOSED); +return NVME_SUCCESS; +} + +return NVME_ZONE_INVAL_TRANSITION; +} + static uint16_t name_do_zone_op(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint8_t state, bool all, uint16_t (*op_hndlr)(NvmeCtrl *, NvmeNamespace *, NvmeZone *, @@ -1387,13 +1407,16 @@ static uint16_t name_do_zone_op(NvmeCtrl *n, NvmeNamespace *ns, static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req) { +NvmeRwCmd *rw; uint32_t dw13 = le32_to_cpu(cmd->cdw13); +uint64_t prp1, prp2; uint64_t slba = 0; uint64_t zone_idx = 0; uint16_t status; uint8_t action, state; bool all; NvmeZone *zone; +uint8_t *zd_ext; action = dw13 & 0xff; all = dw13 & 0x100; @@ -1448,7 +1471,25 @@ static 
uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeNamespace *ns, case NVME_ZONE_ACTION_SET_ZD_EXT: trace_pci_nvme_set_descriptor_extension(slba, zone_idx); -return NVME_INVALID_FIELD | NVME_DNR; +if (all || !n->params.zd_extension_size) { +return NVME_INVALID_FIELD | NVME_DNR; +} +zd_ext = nvme_get_zd_extension(n, ns, zone_idx); +rw = (NvmeRwCmd *)cmd; +prp1 = le64_to_cpu(rw->prp1); +prp2 = le64_to_cpu(rw->prp2); +status = nvme_dma_write_prp(n, zd_ext, n->params.zd_extension_size, +prp1, prp2); +if (status) { +trace_pci_nvme_err_zd_extension_map_error(zone_idx); +return status; +} + +status = nvme_set_zd_ext(n, ns, zone, state); +if (status == NVME_SUCCESS) { +trace_pci_nvme_zd_extension_set(zone_idx); +return status; +} break; default: @@ -1527,7 +1568,7 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeNamespace *ns, return NVME_INVALID_FIELD | NVME_DNR; } -if (zra == NVME_ZONE_REPORT_EXTENDED) { +if (zra == NVME_ZONE_REPORT_EXTENDED && !n->params.zd_extension_size) { return NVME_INVALID_FIELD | NVME_DNR; } @@ -1539,6 +1580,9 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeNamespace *ns, partial = (dw13 >> 16) & 0x01; zone_entry_sz = sizeof(NvmeZoneDescr); +if (zra == NVME_ZONE_REPORT_EXTENDED) { +zone_entry_sz += n->params.zd_extension_size; +} max_zones = (len - sizeof(NvmeZoneReportHeader)) / zone_entry_sz; buf = g_malloc0(len); @@ -1570,6 +1614,14 @@ static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeNamespace *ns, z->wp = cpu_to_le64(~0ULL); } +if (zra == NVME_ZONE_REPORT_EXTENDED) { +if (zs->d.za & NVME_ZA_ZD_EXT_VALID) { +memcpy(buf_p, nvme_get_zd_extension(n, ns, zone_index), + n->params.zd_extension_size); +} +buf_p += n->params.zd_extension_size; +} + zone_index++; } @@ -2336,7 +2388,7 @@ static uint16_t nvme_handle_changed_zone_log(NvmeCtrl *n, NvmeCmd *cmd, continue; } num_aen_zones++; -if (zone->d.za) { +if (zone->d.za & ~NVME_ZA_ZD_EXT_VALID) { trace_pci_nvme_reporting_changed_zone(zone->d.zslba, zone->d.za); *zid_ptr++ = 
cpu_to_le64(zone->d.zslba); nids++; @@ -2935,6 +2987,7 @@ static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, ns->imp_open_zones = g_malloc0(sizeof(NvmeZoneList)); ns->closed_zones = g_malloc0(sizeof(NvmeZoneList)); n
[PATCH 16/18] hw/block/nvme: Add injection of Offline/Read-Only zones
ZNS specification defines two zone conditions for the zones that no longer can function properly, possibly because of flash wear or other internal fault. It is useful to be able to "inject" a small number of such zones for testing purposes. This commit defines two optional driver properties, "offline_zones" and "rdonly_zones". User can assign non-zero values to these variables to specify the number of zones to be initialized as Offline or Read-Only. The actual number of injected zones may be smaller than the requested amount - Read-Only and Offline counts are expected to be much smaller than the total number of drive zones. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 46 ++ hw/block/nvme.h | 2 ++ 2 files changed, 48 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 16780d5116..930f5f9d41 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -2979,8 +2979,11 @@ static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, uint64_t capacity) { NvmeZone *zone; +Error *err; uint64_t start = 0, zone_size = n->params.zone_size; +uint32_t rnd; int i; +uint16_t zs; ns->zone_array = g_malloc0(n->zone_array_size); ns->exp_open_zones = g_malloc0(sizeof(NvmeZoneList)); @@ -3010,6 +3013,37 @@ static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, start += zone_size; } +/* If required, make some zones Offline or Read Only */ + +for (i = 0; i < n->params.nr_offline_zones; i++) { +do { +qcrypto_random_bytes(&rnd, sizeof(rnd), &err); +rnd %= n->num_zones; +} while (rnd < n->params.max_open_zones); +zone = &ns->zone_array[rnd]; +zs = nvme_get_zone_state(zone); +if (zs != NVME_ZONE_STATE_OFFLINE) { +nvme_set_zone_state(zone, NVME_ZONE_STATE_OFFLINE); +} else { +i--; +} +} + +for (i = 0; i < n->params.nr_rdonly_zones; i++) { +do { +qcrypto_random_bytes(&rnd, sizeof(rnd), &err); +rnd %= n->num_zones; +} while (rnd < n->params.max_open_zones); +zone = &ns->zone_array[rnd]; +zs = nvme_get_zone_state(zone); +if (zs != NVME_ZONE_STATE_OFFLINE && +zs != 
NVME_ZONE_STATE_READ_ONLY) { +nvme_set_zone_state(zone, NVME_ZONE_STATE_READ_ONLY); +} else { +i--; +} +} + return 0; } @@ -3062,6 +3096,16 @@ static void nvme_zoned_init_ctrl(NvmeCtrl *n, Error **errp) if (n->params.max_active_zones > nz) { n->params.max_active_zones = nz; } +if (n->params.max_open_zones < nz) { +if (n->params.nr_offline_zones > nz - n->params.max_open_zones) { +n->params.nr_offline_zones = nz - n->params.max_open_zones; +} +if (n->params.nr_rdonly_zones > +nz - n->params.max_open_zones - n->params.nr_offline_zones) { +n->params.nr_rdonly_zones = +nz - n->params.max_open_zones - n->params.nr_offline_zones; +} +} if (n->params.zd_extension_size) { if (n->params.zd_extension_size & 0x3f) { error_setg(errp, @@ -3453,6 +3497,8 @@ static Property nvme_props[] = { DEFINE_PROP_UINT64("finish_rcmnd_delay", NvmeCtrl, params.fzr_delay_usec, 0), DEFINE_PROP_UINT64("finish_rcmnd_limit", NvmeCtrl, params.frl_usec, 0), +DEFINE_PROP_UINT32("offline_zones", NvmeCtrl, params.nr_offline_zones, 0), +DEFINE_PROP_UINT32("rdonly_zones", NvmeCtrl, params.nr_rdonly_zones, 0), DEFINE_PROP_BOOL("zone_async_events", NvmeCtrl, params.zone_async_events, true), DEFINE_PROP_BOOL("cross_zone_read", NvmeCtrl, params.cross_zone_read, true), diff --git a/hw/block/nvme.h b/hw/block/nvme.h index a9ac01434b..1726cc9d6e 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -23,6 +23,8 @@ typedef struct NvmeParams { uint64_tzone_capacity; int32_t max_active_zones; int32_t max_open_zones; +uint32_tnr_offline_zones; +uint32_tnr_rdonly_zones; uint32_tzd_extension_size; uint64_trzr_delay_usec; uint64_trrl_usec; -- 2.21.0
[PATCH 09/18] hw/block/nvme: Define Zoned NS Command Set trace events
The Zoned Namespace Command Set / Namespace Types implementation that is being introduced in this series adds a good number of trace events. Combine all tracepoint definitions into a separate patch to make reviewing more convenient. Signed-off-by: Dmitry Fomichev --- hw/block/trace-events | 41 + 1 file changed, 41 insertions(+) diff --git a/hw/block/trace-events b/hw/block/trace-events index 3f3323fe38..984db8a20c 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -66,6 +66,31 @@ pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects log read" pci_nvme_css_nvm_cset_selected_by_host(uint32_t cc) "NVM command set selected by host, bar.cc=0x%"PRIx32"" pci_nvme_css_all_csets_sel_by_host(uint32_t cc) "all supported command sets selected by host, bar.cc=0x%"PRIx32"" +pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_reset_zone(uint64_t slba, uint32_t zone_idx, int all) "reset zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_offline_zone(uint64_t slba, uint32_t zone_idx, int all) "offline zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32"" +pci_nvme_set_descriptor_extension(uint64_t slba, uint32_t zone_idx) "set zone descriptor extension, slba=%"PRIu64", idx=%"PRIu32"" +pci_nvme_zone_reset_recommended(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_reset_internal_op(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_finish_recommended(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_finish_internal_op(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zone_finished_by_controller(uint64_t slba) "slba=%"PRIu64"" +pci_nvme_zd_extension_set(uint32_t zone_idx) 
"set descriptor extension for zone_idx=%"PRIu32"" +pci_nvme_power_on_close(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Closed state" +pci_nvme_power_on_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Empty state" +pci_nvme_power_on_full(uint32_t state, uint64_t slba) "zone state=%"PRIu32", slba=%"PRIu64" transitioned to Full state" +pci_nvme_zone_ae_not_enabled(int info, int log_page, int nsid) "zone async event not enabled, info=0x%"PRIx32", lp=0x%"PRIx32", nsid=%"PRIu32"" +pci_nvme_zone_ae_not_cleared(int info, int log_page, int nsid) "zoned async event not cleared, info=0x%"PRIx32", lp=0x%"PRIx32", nsid=%"PRIu32"" +pci_nvme_zone_aen_not_requested(uint32_t oaes) "zone descriptor AEN are not requested by host, oaes=0x%"PRIx32"" +pci_nvme_getfeat_aen_cfg(uint64_t res) "reporting async event config res=%"PRIu64"" +pci_nvme_setfeat_zone_info_aer_on(void) "zone info change notices enabled" +pci_nvme_setfeat_zone_info_aer_off(void) "zone info change notices disabled" +pci_nvme_changed_zone_log_read(uint16_t nsid) "changed zone list log of ns %"PRIu16"" +pci_nvme_reporting_changed_zone(uint64_t zslba, uint8_t za) "zslba=%"PRIu64", attr=0x%"PRIx8"" +pci_nvme_empty_changed_zone_list(void) "no changes zones to report" +pci_nvme_mapped_zone_file(char *zfile_name, int ret) "mapped zone file %s, error %d" # nvme traces for error conditions pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" @@ -77,10 +102,25 @@ pci_nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not w pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_capacity_exceeded(uint64_t zone_id, uint64_t nr_zones) "zone capacity exceeded, 
zone_id=%"PRIu64", nr_zones=%"PRIu64"" +pci_nvme_err_unaligned_zone_cmd(uint8_t action, uint64_t slba, uint64_t zslba) "unaligned zone op 0x%"PRIx32", got slba=%"PRIu64", zslba=%"PRIu64"" +pci_nvme_err_invalid_zone_state_transition(uint8_t state, uint8_t action, uint64_t slba, uint8_t attrs) "0x%"PRIx32"->0x%"PRIx32", slba=%"PRIu64", attrs=0x%"PRIx32"" +pci_nvme_err_write_not_at_wp(uint64_t slba, uint64_t zone, uint64_t wp) "writing at slba=%"PRIu64", zone=%"PRIu64", but wp=%"PRIu64"" +pci_nvme_err_append_not_at_start(uint64_t slba, uint64_t zone) "appending at slba=%"PRIu64", but zone=%"PRIu64"" +pci_nvme_err_zone_write_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_zone_read_not_ok(uint64_t slba, uint32_t nlb, uint32_t status) "slba=%"PRIu64", nlb=%"PRIu32", status=0x%"PRIx16"" +pci_nvme_err_append_too_large(uint64_t slba, uint32_t nlb, uint8_t za
[PATCH 08/18] hw/block/nvme: Make Zoned NS Command Set definitions
Define values and structures that are needed to support Zoned Namespace Command Set (NVMe TP 4053) in PCI NVMe controller emulator. All new protocol definitions are located in include/block/nvme.h and everything added that is specific to this implementation is kept in hw/block/nvme.h. In order to improve scalability, all open, closed and full zones are organized in separate linked lists. Consequently, almost all zone operations don't require scanning of the entire zone array (which potentially can be quite large) - it is only necessary to enumerate one or more zone lists. Zone lists are designed to be position-independent as they can be persisted to the backing file as a part of zone metadata. NvmeZoneList struct defined in this patch serves as a head of every zone list. NvmeZone structure encapsulates NvmeZoneDescriptor defined in Zoned Command Set specification and adds a few more fields that are internal to this implementation. Signed-off-by: Niklas Cassel Signed-off-by: Hans Holmberg Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Alexey Bogoslavsky Signed-off-by: Dmitry Fomichev --- hw/block/nvme.h | 130 +++ include/block/nvme.h | 119 ++- 2 files changed, 248 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index ef1197b4cd..1c7deaf42d 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -3,11 +3,21 @@ #include "block/nvme.h" +#define NVME_DEFAULT_ZONE_SIZE 128 /* MiB */ +#define NVME_DEFAULT_MAX_ZA_SIZE 128 /* KiB */ + typedef struct NvmeParams { char *serial; uint32_t num_queues; /* deprecated since 5.1 */ uint32_t max_ioqpairs; uint32_t cmb_size_mb; + +boolzoned; +boolcross_zone_read; +uint8_t fill_pattern; +uint32_tzamds_bs; +uint64_tzone_size; +uint64_tzone_capacity; } NvmeParams; typedef struct NvmeAsyncEvent { @@ -16,6 +26,8 @@ typedef struct NvmeAsyncEvent { enum NvmeRequestFlags { NVME_REQ_FLG_HAS_SG = 1 << 0, +NVME_REQ_FLG_FILL = 1 << 1, +NVME_REQ_FLG_APPEND = 1 
<< 2, }; typedef struct NvmeRequest { @@ -23,6 +35,7 @@ typedef struct NvmeRequest { BlockAIOCB *aiocb; uint16_tstatus; uint16_tflags; +uint64_tfill_ofs; NvmeCqe cqe; BlockAcctCookie acct; QEMUSGList qsg; @@ -60,11 +73,35 @@ typedef struct NvmeCQueue { QTAILQ_HEAD(, NvmeRequest) req_list; } NvmeCQueue; +typedef struct NvmeZone { +NvmeZoneDescr d; +uint64_ttstamp; +uint32_tnext; +uint32_tprev; +uint8_t rsvd80[8]; +} NvmeZone; + +#define NVME_ZONE_LIST_NILUINT_MAX + +typedef struct NvmeZoneList { +uint32_thead; +uint32_ttail; +uint32_tsize; +uint8_t rsvd12[4]; +} NvmeZoneList; + typedef struct NvmeNamespace { NvmeIdNsid_ns; uint32_tnsid; uint8_t csi; QemuUUIDuuid; + +NvmeIdNsZoned *id_ns_zoned; +NvmeZone*zone_array; +NvmeZoneList*exp_open_zones; +NvmeZoneList*imp_open_zones; +NvmeZoneList*closed_zones; +NvmeZoneList*full_zones; } NvmeNamespace; static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) @@ -99,6 +136,7 @@ typedef struct NvmeCtrl { uint32_tnum_namespaces; uint32_tmax_q_ents; uint64_tns_size; + uint8_t *cmbuf; uint32_tirq_status; uint64_thost_timestamp; /* Timestamp sent by the host */ @@ -106,6 +144,12 @@ typedef struct NvmeCtrl { HostMemoryBackend *pmrdev; +int zone_file_fd; +uint32_tnum_zones; +uint64_tzone_size_bs; +uint64_tzone_array_size; +uint8_t zamds; + NvmeNamespace *namespaces; NvmeSQueue **sq; NvmeCQueue **cq; @@ -120,6 +164,86 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns) return n->ns_size >> nvme_ns_lbads(ns); } +static inline uint8_t nvme_get_zone_state(NvmeZone *zone) +{ +return zone->d.zs >> 4; +} + +static inline void nvme_set_zone_state(NvmeZone *zone, enum NvmeZoneState state) +{ +zone->d.zs = state << 4; +} + +static inline uint64_t nvme_zone_rd_boundary(NvmeCtrl *n, NvmeZone *zone) +{ +return zone->d.zslba + n->params.zone_size; +} + +static inline uint64_t nvme_zone_wr_boundary(NvmeZone *zone) +{ +return zone->d.zslba + zone->d.zcap; +} + +static inline bool nvme_wp_is_valid(NvmeZone *zone) +{ +uint8_t st = 
nvme_get_zone_state(zone); + +return st != NVME_ZONE_STATE_FULL && + st != NVME_ZONE_STATE_READ_ONLY && + st != NVME_ZONE_STATE_OFFLINE; +} + +/* + * Initialize a zone list head. + */ +static inline void nvme
[PATCH 01/18] hw/block/nvme: Move NvmeRequest has_sg field to a bit flag
In addition to the existing has_sg flag, a few more Boolean NvmeRequest flags are going to be introduced in subsequent patches. Convert "has_sg" variable to "flags" and define NvmeRequestFlags enum for individual flag values. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 8 +++- hw/block/nvme.h | 6 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 567bce7519..6e8e4ccdb1 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -349,7 +349,7 @@ static void nvme_rw_cb(void *opaque, int ret) block_acct_failed(blk_get_stats(n->conf.blk), &req->acct); req->status = NVME_INTERNAL_DEV_ERROR; } -if (req->has_sg) { +if (req->flags & NVME_REQ_FLG_HAS_SG) { qemu_sglist_destroy(&req->qsg); } nvme_enqueue_req_completion(cq, req); @@ -358,7 +358,6 @@ static void nvme_rw_cb(void *opaque, int ret) static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, NvmeRequest *req) { -req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_FLUSH); req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req); @@ -382,7 +381,6 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, return NVME_LBA_RANGE | NVME_DNR; } -req->has_sg = false; block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0, BLOCK_ACCT_WRITE); req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count, @@ -421,14 +419,13 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd, dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct); if (req->qsg.nsg > 0) { -req->has_sg = true; +req->flags |= NVME_REQ_FLG_HAS_SG; req->aiocb = is_write ? dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req) : dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req); } else { -req->has_sg = false; req->aiocb = is_write ? 
blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb, req) : @@ -916,6 +913,7 @@ static void nvme_process_sq(void *opaque) QTAILQ_REMOVE(&sq->req_list, req, entry); QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry); memset(&req->cqe, 0, sizeof(req->cqe)); +req->flags = 0; req->cqe.cid = cmd.cid; status = sq->sqid ? nvme_io_cmd(n, &cmd, req) : diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 61dd9b23b8..79513eaa49 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -15,11 +15,15 @@ typedef struct NvmeAsyncEvent { NvmeAerResult result; } NvmeAsyncEvent; +enum NvmeRequestFlags { +NVME_REQ_FLG_HAS_SG = 1 << 0, +}; + typedef struct NvmeRequest { struct NvmeSQueue *sq; BlockAIOCB *aiocb; uint16_tstatus; -boolhas_sg; +uint16_tflags; NvmeCqe cqe; BlockAcctCookie acct; QEMUSGList qsg; -- 2.21.0
[PATCH 17/18] hw/block/nvme: Use zone metadata file for persistence
A ZNS drive that is emulated by this driver is currently initialized with all zones Empty upon startup. However, actual ZNS SSDs save the state and condition of all zones in their internal NVRAM in the event of power loss. When such a drive is powered up again, it closes or finishes all zones that were open at the moment of shutdown. Besides that, the write pointer position as well as the state and condition of all zones is preserved across power-downs. This commit adds the capability to have a persistent zone metadata to the driver. The new optional driver property, "zone_file", is introduced. If added to the command line, this property specifies the name of the file that stores the zone metadata. If "zone_file" is omitted, the driver will initialize with all zones empty, the same as before. If zone metadata is configured to be persistent, then zone descriptor extensions also persist across controller shutdowns. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 371 +--- hw/block/nvme.h | 38 + 2 files changed, 388 insertions(+), 21 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 930f5f9d41..1ecbc31272 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -68,6 +68,8 @@ } while (0) static void nvme_process_sq(void *opaque); +static void nvme_sync_zone_file(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, int len); /* * Add a zone to the tail of a zone list. 
@@ -89,6 +91,7 @@ static void nvme_add_zone_tail(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, zl->tail = idx; } zl->size++; +nvme_set_zone_meta_dirty(n, ns, true); } /* @@ -105,12 +108,15 @@ static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, if (zl->size == 0) { zl->head = NVME_ZONE_LIST_NIL; zl->tail = NVME_ZONE_LIST_NIL; +nvme_set_zone_meta_dirty(n, ns, true); } else if (idx == zl->head) { zl->head = zone->next; ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; +nvme_set_zone_meta_dirty(n, ns, true); } else if (idx == zl->tail) { zl->tail = zone->prev; ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL; +nvme_set_zone_meta_dirty(n, ns, true); } else { ns->zone_array[zone->next].prev = zone->prev; ns->zone_array[zone->prev].next = zone->next; @@ -137,6 +143,7 @@ static NvmeZone *nvme_remove_zone_head(NvmeCtrl *n, NvmeNamespace *ns, ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; } zone->prev = zone->next = 0; +nvme_set_zone_meta_dirty(n, ns, true); } return zone; @@ -475,6 +482,7 @@ static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, case NVME_ZONE_STATE_READ_ONLY: zone->tstamp = 0; } +nvme_sync_zone_file(n, ns, zone, sizeof(NvmeZone)); } static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) @@ -2975,9 +2983,114 @@ static const MemoryRegionOps nvme_cmb_ops = { }, }; -static int nvme_init_zone_meta(NvmeCtrl *n, NvmeNamespace *ns, +static int nvme_validate_zone_file(NvmeCtrl *n, NvmeNamespace *ns, uint64_t capacity) { +NvmeZoneMeta *meta = ns->zone_meta; +NvmeZone *zone = ns->zone_array; +uint64_t start = 0, zone_size = n->params.zone_size; +int i, n_imp_open = 0, n_exp_open = 0, n_closed = 0, n_full = 0; + +if (meta->magic != NVME_ZONE_META_MAGIC) { +return 1; +} +if (meta->version != NVME_ZONE_META_VER) { +return 2; +} +if (meta->zone_size != zone_size) { +return 3; +} +if (meta->zone_capacity != n->params.zone_capacity) { +return 4; +} +if (meta->nr_offline_zones != n->params.nr_offline_zones) { +return 5; +} 
+if (meta->nr_rdonly_zones != n->params.nr_rdonly_zones) { +return 6; +} +if (meta->lba_size != n->conf.logical_block_size) { +return 7; +} +if (meta->zd_extension_size != n->params.zd_extension_size) { +return 8; +} + +for (i = 0; i < n->num_zones; i++, zone++) { +if (start + zone_size > capacity) { +zone_size = capacity - start; +} +if (zone->d.zt != NVME_ZONE_TYPE_SEQ_WRITE) { +return 9; +} +if (zone->d.zcap != n->params.zone_capacity) { +return 10; +} +if (zone->d.zslba != start) { +return 11; +} +switch (nvme_get_zone_state(zone)) { +case NVME_ZONE_STATE_EMPTY: +case NVME_ZONE_STATE_OFFLINE: +case NVME_ZONE_STATE_READ_ONLY: +if (zone->d.wp != start) { +return 12; +} +break; +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +if (zone->d.wp < start || +zone->d.wp >= zone->d.zslba + zone->d.zcap) { +return 13; +} +n_imp_open++; +break; +case N
[PATCH 02/18] hw/block/nvme: Define 64 bit cqe.result
From: Ajay Joshi A new write command, Zone Append, is added as a part of Zoned Namespace Command Set. Upon successful completion of this command, the controller returns the start LBA of the performed write operation in cqe.result field. Therefore, the maximum size of this variable needs to be changed from 32 to 64 bit, consuming the reserved 32 bit field that follows the result in CQE struct. Since the existing commands are expected to return a 32 bit LE value, two separate variables, result32 and result64, are now kept in a union. Signed-off-by: Ajay Joshi Signed-off-by: Dmitry Fomichev --- block/nvme.c | 2 +- block/trace-events | 2 +- hw/block/nvme.c | 6 +++--- include/block/nvme.h | 6 -- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/block/nvme.c b/block/nvme.c index eb2f54dd9d..ca245ec574 100644 --- a/block/nvme.c +++ b/block/nvme.c @@ -287,7 +287,7 @@ static inline int nvme_translate_error(const NvmeCqe *c) { uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF; if (status) { -trace_nvme_error(le32_to_cpu(c->result), +trace_nvme_error(le64_to_cpu(c->result64), le16_to_cpu(c->sq_head), le16_to_cpu(c->sq_id), le16_to_cpu(c->cid), diff --git a/block/trace-events b/block/trace-events index 29dff8881c..05c1393943 100644 --- a/block/trace-events +++ b/block/trace-events @@ -156,7 +156,7 @@ vxhs_get_creds(const char *cacert, const char *client_key, const char *client_ce # nvme.c nvme_kick(void *s, int queue) "s %p queue %d" nvme_dma_flush_queue_wait(void *s) "s %p" -nvme_error(int cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %d sq_head %d sqid %d cid %d status 0x%x" +nvme_error(uint64_t cmd_specific, int sq_head, int sqid, int cid, int status) "cmd_specific %ld sq_head %d sqid %d cid %d status 0x%x" nvme_process_completion(void *s, int index, int inflight) "s %p queue %d inflight %d" nvme_process_completion_queue_busy(void *s, int index) "s %p queue %d" nvme_complete_command(void *s, int index, int cid) "s %p queue %d cid 
%d" diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6e8e4ccdb1..94110db596 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -822,7 +822,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_INVALID_FIELD | NVME_DNR; } -req->cqe.result = result; +req->cqe.result32 = result; return NVME_SUCCESS; } @@ -858,8 +858,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) ((dw11 >> 16) & 0xFFFF) + 1, n->params.max_ioqpairs, n->params.max_ioqpairs); -req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) | - ((n->params.max_ioqpairs - 1) << 16)); +req->cqe.result32 = cpu_to_le32((n->params.max_ioqpairs - 1) | +((n->params.max_ioqpairs - 1) << 16)); break; case NVME_TIMESTAMP: return nvme_set_feature_timestamp(n, cmd); diff --git a/include/block/nvme.h b/include/block/nvme.h index 1720ee1d51..9c3a04dcd7 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -577,8 +577,10 @@ typedef struct NvmeAerResult { } NvmeAerResult; typedef struct NvmeCqe { -uint32_tresult; -uint32_trsvd; +union { +uint64_t result64; +uint32_t result32; +}; uint16_tsq_head; uint16_tsq_id; uint16_tcid; -- 2.21.0
[PATCH 13/18] hw/block/nvme: Set Finish/Reset Zone Recommended attributes
Added logic to set and reset FZR and RZR zone attributes. Four new driver properties are added to control the timing of setting and resetting these attributes. FZR/RZR delay lasts from the zone operation and until when the corresponding zone attribute is set. FZR/RZR limits set the time period between setting FZR or RZR attribute and resetting it simulating the internal controller action on that zone. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 99 + hw/block/nvme.h | 13 ++- 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d9110b39d3..6e6616f7e9 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -200,6 +200,84 @@ static inline void nvme_aor_dec_active(NvmeCtrl *n, NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } +static void nvme_set_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +assert(zone->flags & NVME_ZFLAGS_SET_RZR); +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags &= ~NVME_ZFLAGS_TS_DELAY; +zone->d.za |= NVME_ZA_RESET_RECOMMENDED; +zone->flags &= ~NVME_ZFLAGS_SET_RZR; +trace_pci_nvme_zone_reset_recommended(zone->d.zslba); +} + +static void nvme_clear_rzr(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, bool notify) +{ +if (n->params.rrl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY); +notify = notify && (zone->d.za & NVME_ZA_RESET_RECOMMENDED); +zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED; +zone->tstamp = 0; +} +} + +static void nvme_set_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +assert(zone->flags & NVME_ZFLAGS_SET_FZR); +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags &= ~NVME_ZFLAGS_TS_DELAY; +zone->d.za |= NVME_ZA_FINISH_RECOMMENDED; +zone->flags &= ~NVME_ZFLAGS_SET_FZR; +trace_pci_nvme_zone_finish_recommended(zone->d.zslba); +} + +static void nvme_clear_fzr(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, bool notify) +{ +if (n->params.frl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | 
NVME_ZFLAGS_TS_DELAY); +notify = notify && (zone->d.za & NVME_ZA_FINISH_RECOMMENDED); +zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED; +zone->tstamp = 0; +} +} + +static void nvme_schedule_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +if (n->params.frl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY); +zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED; +zone->tstamp = 0; +} +if (n->params.rrl_usec) { +zone->flags |= NVME_ZFLAGS_SET_RZR; +if (n->params.rzr_delay_usec) { +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags |= NVME_ZFLAGS_TS_DELAY; +} else { +nvme_set_rzr(n, ns, zone); +} +} +} + +static void nvme_schedule_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +if (n->params.rrl_usec) { +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY); +zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED; +zone->tstamp = 0; +} +if (n->params.frl_usec) { +zone->flags |= NVME_ZFLAGS_SET_FZR; +if (n->params.fzr_delay_usec) { +zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); +zone->flags |= NVME_ZFLAGS_TS_DELAY; +} else { +nvme_set_fzr(n, ns, zone); +} +} +} + static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, uint8_t state) { @@ -207,15 +285,19 @@ static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, switch (nvme_get_zone_state(zone)) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: nvme_remove_zone(n, ns, ns->exp_open_zones, zone); +nvme_clear_fzr(n, ns, zone, false); break; case NVME_ZONE_STATE_IMPLICITLY_OPEN: nvme_remove_zone(n, ns, ns->imp_open_zones, zone); +nvme_clear_fzr(n, ns, zone, false); break; case NVME_ZONE_STATE_CLOSED: nvme_remove_zone(n, ns, ns->closed_zones, zone); +nvme_clear_fzr(n, ns, zone, false); break; case NVME_ZONE_STATE_FULL: nvme_remove_zone(n, ns, ns->full_zones, zone); +nvme_clear_rzr(n, ns, zone, false); } } @@ -224,15 +306,19 @@ static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, switch (state) { case NVME_ZONE_STATE_EXPLICITLY_OPEN: 
nvme_add_zone_tail(n, ns, ns->exp_open_zones, zone); +nvme_schedule_fzr(n, ns, zone); break; case NVME_ZONE_STATE_IMPLICITLY_OPEN: nvme_add_zone_tail(n, ns, ns->imp_open_zones, zone); +nvme_schedule_fzr(n, ns, zone); break; case NVME_ZONE_STATE_CLOSED: nvme_add_zone_tail(n, ns, ns->closed_zones, zone); +nvme_schedu
[PATCH 05/18] hw/block/nvme: Introduce the Namespace Types definitions
From: Niklas Cassel Define the structures and constants required to implement Namespace Types support. Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev --- hw/block/nvme.h | 3 ++ include/block/nvme.h | 75 +--- 2 files changed, 73 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index dae7b977af..fbf40c7256 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -62,6 +62,9 @@ typedef struct NvmeCQueue { typedef struct NvmeNamespace { NvmeIdNsid_ns; +uint32_tnsid; +uint8_t csi; +QemuUUIDuuid; } NvmeNamespace; static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns) diff --git a/include/block/nvme.h b/include/block/nvme.h index 6a58bac0c2..5a1e5e137c 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -50,6 +50,11 @@ enum NvmeCapMask { CAP_PMR_MASK = 0x1, }; +enum NvmeCapCssBits { +CAP_CSS_NVM= 0x01, +CAP_CSS_CSI_SUPP = 0x40, +}; + #define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) #define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT)& CAP_CQR_MASK) #define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT)& CAP_AMS_MASK) @@ -101,6 +106,12 @@ enum NvmeCcMask { CC_IOCQES_MASK = 0xf, }; +enum NvmeCcCss { +CSS_NVM_ONLY= 0, +CSS_ALL_NSTYPES = 6, +CSS_ADMIN_ONLY = 7, +}; + #define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) #define NVME_CC_CSS(cc)((cc >> CC_CSS_SHIFT)& CC_CSS_MASK) #define NVME_CC_MPS(cc)((cc >> CC_MPS_SHIFT)& CC_MPS_MASK) @@ -109,6 +120,21 @@ enum NvmeCcMask { #define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) #define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) +#define NVME_SET_CC_EN(cc, val) \ +(cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val)\ +(cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val)\ +(cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val)\ +(cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val)\ 
+(cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ +(cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT) +#define NVME_SET_CC_IOCQES(cc, val) \ +(cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT) + enum NvmeCstsShift { CSTS_RDY_SHIFT = 0, CSTS_CFS_SHIFT = 1, @@ -482,10 +508,41 @@ typedef struct NvmeIdentify { uint64_trsvd2[2]; uint64_tprp1; uint64_tprp2; -uint32_tcns; -uint32_trsvd11[5]; +uint8_t cns; +uint8_t rsvd4; +uint16_tctrlid; +uint16_tnvmsetid; +uint8_t rsvd3; +uint8_t csi; +uint32_trsvd12[4]; } NvmeIdentify; +typedef struct NvmeNsIdDesc { +uint8_t nidt; +uint8_t nidl; +uint16_trsvd2; +} NvmeNsIdDesc; + +enum NvmeNidType { +NVME_NIDT_EUI64 = 0x01, +NVME_NIDT_NGUID = 0x02, +NVME_NIDT_UUID = 0x03, +NVME_NIDT_CSI = 0x04, +}; + +enum NvmeNidLength { +NVME_NIDL_EUI64 = 8, +NVME_NIDL_NGUID = 16, +NVME_NIDL_UUID = 16, +NVME_NIDL_CSI = 1, +}; + +enum NvmeCsi { +NVME_CSI_NVM= 0x00, +}; + +#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) + typedef struct NvmeRwCmd { uint8_t opcode; uint8_t flags; @@ -603,6 +660,7 @@ enum NvmeStatusCodes { NVME_CMD_ABORT_MISSING_FUSE = 0x000a, NVME_INVALID_NSID = 0x000b, NVME_CMD_SEQ_ERROR = 0x000c, +NVME_CMD_SET_CMB_REJECTED = 0x002b, NVME_LBA_RANGE = 0x0080, NVME_CAP_EXCEEDED = 0x0081, NVME_NS_NOT_READY = 0x0082, @@ -729,9 +787,14 @@ typedef struct NvmePSD { #define NVME_IDENTIFY_DATA_SIZE 4096 enum { -NVME_ID_CNS_NS = 0x0, -NVME_ID_CNS_CTRL = 0x1, -NVME_ID_CNS_NS_ACTIVE_LIST = 0x2, +NVME_ID_CNS_NS= 0x0, +NVME_ID_CNS_CTRL = 0x1, +NVME_ID_CNS_NS_ACTIVE_LIST= 0x2, +NVME_ID_CNS_NS_DESC_LIST = 0x03, +NVME_ID_CNS_CS_NS = 0x05, +NVME_ID_CNS_CS_CTRL = 0x06, +NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, +NVME_ID_CNS_IO_COMMAND_SET= 0x1c, }; typedef struct NvmeIdCtrl { @@ -825,6 +888,7 @@ enum NvmeFeatureIds { NVME_WRITE_ATOMICITY= 0xa, NVME_ASYNCHRONOUS_EVENT_CONF= 0xb, NVME_TIMESTAMP = 0xe, +NVME_COMMAND_SET_PROFILE= 0x19, NVME_SOFTWARE_PROGRESS_MARKER = 0x80 }; @@ 
-914,6 +978,7 @@ static inline void _nvme_check_size(void) QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) !=
[PATCH 04/18] hw/block/nvme: Add Commands Supported and Effects log
This log page becomes necessary to implement to allow checking for Zone Append command support in Zoned Namespace Command Set. This commit adds the code to report this log page for NVM Command Set only. The parts that are specific to zoned operation will be added later in the series. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 62 +++ hw/block/trace-events | 4 +++ include/block/nvme.h | 18 + 3 files changed, 84 insertions(+) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 94110db596..d000c7c9fb 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -870,6 +870,66 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return NVME_SUCCESS; } +static uint16_t nvme_handle_cmd_effects(NvmeCtrl *n, NvmeCmd *cmd, +uint64_t prp1, uint64_t prp2, uint64_t ofs, uint32_t len) +{ + NvmeEffectsLog cmd_eff_log = {}; + uint32_t *iocs = cmd_eff_log.iocs; + +trace_pci_nvme_cmd_supp_and_effects_log_read(); + +if (ofs != 0) { +trace_pci_nvme_err_invalid_effects_log_offset(ofs); +return NVME_INVALID_FIELD | NVME_DNR; +} +if (len != sizeof(cmd_eff_log)) { +trace_pci_nvme_err_invalid_effects_log_len(len); +return NVME_INVALID_FIELD | NVME_DNR; +} + +iocs[NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFFECTS_CSUPP; +iocs[NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFFECTS_CSUPP; + +iocs[NVME_CMD_FLUSH] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC; +iocs[NVME_CMD_WRITE_ZEROS] = NVME_CMD_EFFECTS_CSUPP | + NVME_CMD_EFFECTS_LBCC; +iocs[NVME_CMD_WRITE] = NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC; +iocs[NVME_CMD_READ] = NVME_CMD_EFFECTS_CSUPP; + +return nvme_dma_read_prp(n, (uint8_t *)&cmd_eff_log, len, prp1, prp2); +} + +static 
uint16_t nvme_get_log_page(NvmeCtrl *n, NvmeCmd *cmd) +{ +uint64_t prp1 = le64_to_cpu(cmd->prp1); +uint64_t prp2 = le64_to_cpu(cmd->prp2); +uint32_t dw10 = le32_to_cpu(cmd->cdw10); +uint32_t dw11 = le32_to_cpu(cmd->cdw11); +uint64_t dw12 = le32_to_cpu(cmd->cdw12); +uint64_t dw13 = le32_to_cpu(cmd->cdw13); +uint64_t ofs = (dw13 << 32) | dw12; +uint32_t numdl, numdu, len; +uint16_t lid = dw10 & 0xff; + +numdl = dw10 >> 16; +numdu = dw11 & 0xffff; +len = (((numdu << 16) | numdl) + 1) << 2; + +switch (lid) { +case NVME_LOG_CMD_EFFECTS: +return nvme_handle_cmd_effects(n, cmd, prp1, prp2, ofs, len); +} + +trace_pci_nvme_unsupported_log_page(lid); +return NVME_INVALID_FIELD | NVME_DNR; +} + static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) { switch (cmd->opcode) { @@ -887,6 +947,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req) return nvme_set_feature(n, cmd, req); case NVME_ADM_CMD_GET_FEATURES: return nvme_get_feature(n, cmd, req); +case NVME_ADM_CMD_GET_LOG_PAGE: +return nvme_get_log_page(n, cmd); default: trace_pci_nvme_err_invalid_admin_opc(cmd->opcode); return NVME_INVALID_OPCODE | NVME_DNR; diff --git a/hw/block/trace-events b/hw/block/trace-events index 958fcc5508..423d491e27 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -58,6 +58,7 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded" pci_nvme_mmio_stopped(void) "cleared controller enable bit" pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" +pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects log read" # nvme traces for error conditions pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" @@ -69,6 +70,8 @@ pci_nvme_err_invalid_ns(uint32_t ns, uint32_t limit) "invalid namespace %u not w pci_nvme_err_invalid_opc(uint8_t opc) "invalid opcode 0x%"PRIx8"" pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin
opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" +pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and effects log offset must be 0, got %"PRIu64"" +pci_nvme_err_invalid_effects_log_len(uint32_t len) "commands supported and effects log size is 4096, got %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_inv
[PATCH 10/18] hw/block/nvme: Support Zoned Namespace Command Set
The driver has been changed to advertise NVM Command Set when "zoned" driver property is not set (default) and Zoned Namespace Command Set otherwise. Handlers for three new NVMe commands introduced in Zoned Namespace Command Set specification are added, namely for Zone Management Receive, Zone Management Send and Zone Append. Driver initialization code has been extended to create a proper configuration for zoned operation using driver properties. Read/Write command handler is modified to only allow writes at the write pointer if the namespace is zoned. For Zone Append command, writes implicitly happen at the write pointer and the starting write pointer value is returned as the result of the command. Write Zeroes handler is modified to add zoned checks that are identical to those done as a part of Write flow. The code to support Zone Descriptor Extensions is not included in this commit and the driver always reports ZDES 0. A later commit in this series will add ZDE support. This commit doesn't yet include checks for active and open zone limits. It is assumed that there are no limits on either active or open zones. Signed-off-by: Niklas Cassel Signed-off-by: Hans Holmberg Signed-off-by: Ajay Joshi Signed-off-by: Chaitanya Kulkarni Signed-off-by: Matias Bjorling Signed-off-by: Aravind Ramesh Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Adam Manzanares Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 963 ++-- 1 file changed, 933 insertions(+), 30 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 0db1f2002d..7bf452d597 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -37,6 +37,7 @@ #include "qemu/osdep.h" #include "qemu/units.h" #include "qemu/error-report.h" +#include "crypto/random.h" #include "hw/block/block.h" #include "hw/pci/msix.h" #include "hw/pci/pci.h" @@ -68,6 +69,98 @@ static void nvme_process_sq(void *opaque); +/* + * Add a zone to the tail of a zone list. 
+ */ +static void nvme_add_zone_tail(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, +NvmeZone *zone) +{ +uint32_t idx = (uint32_t)(zone - ns->zone_array); + +assert(nvme_zone_not_in_list(zone)); + +if (!zl->size) { +zl->head = zl->tail = idx; +zone->next = zone->prev = NVME_ZONE_LIST_NIL; +} else { +ns->zone_array[zl->tail].next = idx; +zone->prev = zl->tail; +zone->next = NVME_ZONE_LIST_NIL; +zl->tail = idx; +} +zl->size++; +} + +/* + * Remove a zone from a zone list. The zone must be linked in the list. + */ +static void nvme_remove_zone(NvmeCtrl *n, NvmeNamespace *ns, NvmeZoneList *zl, +NvmeZone *zone) +{ +uint32_t idx = (uint32_t)(zone - ns->zone_array); + +assert(!nvme_zone_not_in_list(zone)); + +--zl->size; +if (zl->size == 0) { +zl->head = NVME_ZONE_LIST_NIL; +zl->tail = NVME_ZONE_LIST_NIL; +} else if (idx == zl->head) { +zl->head = zone->next; +ns->zone_array[zl->head].prev = NVME_ZONE_LIST_NIL; +} else if (idx == zl->tail) { +zl->tail = zone->prev; +ns->zone_array[zl->tail].next = NVME_ZONE_LIST_NIL; +} else { +ns->zone_array[zone->next].prev = zone->prev; +ns->zone_array[zone->prev].next = zone->next; +} + +zone->prev = zone->next = 0; +} + +static void nvme_assign_zone_state(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone, uint8_t state) +{ +if (!nvme_zone_not_in_list(zone)) { +switch (nvme_get_zone_state(zone)) { +case NVME_ZONE_STATE_EXPLICITLY_OPEN: +nvme_remove_zone(n, ns, ns->exp_open_zones, zone); +break; +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +nvme_remove_zone(n, ns, ns->imp_open_zones, zone); +break; +case NVME_ZONE_STATE_CLOSED: +nvme_remove_zone(n, ns, ns->closed_zones, zone); +break; +case NVME_ZONE_STATE_FULL: +nvme_remove_zone(n, ns, ns->full_zones, zone); +} + } + +nvme_set_zone_state(zone, state); + +switch (state) { +case NVME_ZONE_STATE_EXPLICITLY_OPEN: +nvme_add_zone_tail(n, ns, ns->exp_open_zones, zone); +break; +case NVME_ZONE_STATE_IMPLICITLY_OPEN: +nvme_add_zone_tail(n, ns, ns->imp_open_zones, zone); +break; +case 
NVME_ZONE_STATE_CLOSED: +nvme_add_zone_tail(n, ns, ns->closed_zones, zone); +break; +case NVME_ZONE_STATE_FULL: +nvme_add_zone_tail(n, ns, ns->full_zones, zone); +break; +default: +zone->d.za = 0; +/* fall through */ +case NVME_ZONE_STATE_READ_ONLY: +zone->tstamp = 0; +} +} + static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr) { hwaddr low = n->ctrl_mem.addr; @@ -313,6 +406,7 @@ static void nvme_post_cqes(void *opaque) QTAILQ_REMOVE(&cq->req_list, req, entry); sq = req->sq; + req->cqe.status = cpu_to_le16((req->status <
[PATCH 03/18] hw/block/nvme: Clean up unused AER definitions
Removed unused struct NvmeAerResult and SMART-related async event codes. All other event codes are now categorized by their type. This avoids having to define the same values in a single enum, NvmeAsyncEventRequest, that is now removed. Later commits in this series will define additional values in some of these enums. No functional change. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.h | 1 - include/block/nvme.h | 43 ++- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/hw/block/nvme.h b/hw/block/nvme.h index 79513eaa49..dae7b977af 100644 --- a/hw/block/nvme.h +++ b/hw/block/nvme.h @@ -12,7 +12,6 @@ typedef struct NvmeParams { typedef struct NvmeAsyncEvent { QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry; -NvmeAerResult result; } NvmeAsyncEvent; enum NvmeRequestFlags { diff --git a/include/block/nvme.h b/include/block/nvme.h index 9c3a04dcd7..3099df99eb 100644 --- a/include/block/nvme.h +++ b/include/block/nvme.h @@ -553,28 +553,30 @@ typedef struct NvmeDsmRange { uint64_tslba; } NvmeDsmRange; -enum NvmeAsyncEventRequest { -NVME_AER_TYPE_ERROR = 0, -NVME_AER_TYPE_SMART = 1, -NVME_AER_TYPE_IO_SPECIFIC = 6, -NVME_AER_TYPE_VENDOR_SPECIFIC = 7, -NVME_AER_INFO_ERR_INVALID_SQ= 0, -NVME_AER_INFO_ERR_INVALID_DB= 1, -NVME_AER_INFO_ERR_DIAG_FAIL = 2, -NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, -NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR= 4, -NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5, -NVME_AER_INFO_SMART_RELIABILITY = 0, -NVME_AER_INFO_SMART_TEMP_THRESH = 1, -NVME_AER_INFO_SMART_SPARE_THRESH= 2, +enum NvmeAsyncEventType { +NVME_AER_TYPE_ERROR = 0x00, +NVME_AER_TYPE_SMART = 0x01, +NVME_AER_TYPE_NOTICE= 0x02, +NVME_AER_TYPE_CMDSET_SPECIFIC = 0x06, +NVME_AER_TYPE_VENDOR_SPECIFIC = 0x07, }; -typedef struct NvmeAerResult { -uint8_t event_type; -uint8_t event_info; -uint8_t log_page; -uint8_t resv; -} NvmeAerResult; +enum NvmeAsyncErrorInfo { +NVME_AER_ERR_INVALID_SQ = 0x00, +NVME_AER_ERR_INVALID_DB = 0x01, +NVME_AER_ERR_DIAG_FAIL = 0x02, +NVME_AER_ERR_PERS_INTERNAL_ERR = 0x03, 
+NVME_AER_ERR_TRANS_INTERNAL_ERR = 0x04, +NVME_AER_ERR_FW_IMG_LOAD_ERR= 0x05, +}; + +enum NvmeAsyncNoticeInfo { +NVME_AER_NOTICE_NS_CHANGED = 0x00, +}; + +enum NvmeAsyncEventCfg { +NVME_AEN_CFG_NS_ATTR= 1 << 8, +}; typedef struct NvmeCqe { union { @@ -881,7 +883,6 @@ enum NvmeIdNsDps { static inline void _nvme_check_size(void) { -QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); -- 2.21.0
[PATCH 07/18] hw/block/nvme: Add support for Namespace Types
From: Niklas Cassel Namespace Types introduce a new command set, "I/O Command Sets", that allows the host to retrieve the command sets associated with a namespace. Introduce support for the command set, and enable detection for the NVM Command Set. Signed-off-by: Niklas Cassel Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 210 ++-- hw/block/nvme.h | 11 +++ 2 files changed, 216 insertions(+), 5 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index d000c7c9fb..0db1f2002d 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -685,6 +685,26 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c) prp1, prp2); } +static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeIdentify *c) +{ +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint32_t *list; +uint16_t ret; + +trace_pci_nvme_identify_ctrl_csi(c->csi); + +if (c->csi == NVME_CSI_NVM) { +list = g_malloc0(data_len); +ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); +g_free(list); +return ret; +} else { +return NVME_INVALID_FIELD | NVME_DNR; +} +} + static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) { NvmeNamespace *ns; @@ -700,11 +720,42 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c) } ns = &n->namespaces[nsid - 1]; +assert(nsid == ns->nsid); return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1, prp2); } +static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeIdentify *c) +{ +NvmeNamespace *ns; +uint32_t nsid = le32_to_cpu(c->nsid); +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint32_t *list; +uint16_t ret; + +trace_pci_nvme_identify_ns_csi(nsid, c->csi); + +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { +trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); +return NVME_INVALID_NSID | NVME_DNR; +} + +ns = &n->namespaces[nsid - 1]; 
+assert(nsid == ns->nsid); + +if (c->csi == NVME_CSI_NVM) { +list = g_malloc0(data_len); +ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); +g_free(list); +return ret; +} else { +return NVME_INVALID_FIELD | NVME_DNR; +} +} + static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) { static const int data_len = NVME_IDENTIFY_DATA_SIZE; @@ -732,6 +783,99 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c) return ret; } +static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeIdentify *c) +{ +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint32_t min_nsid = le32_to_cpu(c->nsid); +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +uint32_t *list; +uint16_t ret; +int i, j = 0; + +trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi); + +if (c->csi != NVME_CSI_NVM) { +return NVME_INVALID_FIELD | NVME_DNR; +} + +list = g_malloc0(data_len); +for (i = 0; i < n->num_namespaces; i++) { +if (i < min_nsid) { +continue; +} +list[j++] = cpu_to_le32(i + 1); +if (j == data_len / sizeof(uint32_t)) { +break; +} +} +ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2); +g_free(list); +return ret; +} + +static uint16_t nvme_list_ns_descriptors(NvmeCtrl *n, NvmeIdentify *c) +{ +NvmeNamespace *ns; +uint32_t nsid = le32_to_cpu(c->nsid); +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +void *buf_ptr; +NvmeNsIdDesc *desc; +static const int data_len = NVME_IDENTIFY_DATA_SIZE; +uint8_t *buf; +uint16_t status; + +trace_pci_nvme_list_ns_descriptors(); + +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) { +trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces); +return NVME_INVALID_NSID | NVME_DNR; +} + +ns = &n->namespaces[nsid - 1]; +assert(nsid == ns->nsid); + +buf = g_malloc0(data_len); +buf_ptr = buf; + +desc = buf_ptr; +desc->nidt = NVME_NIDT_UUID; +desc->nidl = NVME_NIDL_UUID; +buf_ptr += sizeof(*desc); +memcpy(buf_ptr, ns->uuid.data, NVME_NIDL_UUID); +buf_ptr += 
NVME_NIDL_UUID; + +desc = buf_ptr; +desc->nidt = NVME_NIDT_CSI; +desc->nidl = NVME_NIDL_CSI; +buf_ptr += sizeof(*desc); +*(uint8_t *)buf_ptr = NVME_CSI_NVM; + +status = nvme_dma_read_prp(n, buf, data_len, prp1, prp2); +g_free(buf); +return status; +} + +static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeIdentify *c) +{ +uint64_t prp1 = le64_to_cpu(c->prp1); +uint64_t prp2 = le64_to_cpu(c->prp2); +stati
[PATCH 06/18] hw/block/nvme: Define trace events related to NS Types
A few trace events are defined that are relevant to implementing Namespace Types (NVMe TP 4056). Signed-off-by: Dmitry Fomichev --- hw/block/trace-events | 11 +++ 1 file changed, 11 insertions(+) diff --git a/hw/block/trace-events b/hw/block/trace-events index 423d491e27..3f3323fe38 100644 --- a/hw/block/trace-events +++ b/hw/block/trace-events @@ -39,8 +39,13 @@ pci_nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, pci_nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16"" pci_nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16"" pci_nvme_identify_ctrl(void) "identify controller" +pci_nvme_identify_ctrl_csi(uint8_t csi) "identify controller, csi=0x%"PRIx8"" pci_nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16"" +pci_nvme_identify_ns_csi(uint16_t ns, uint8_t csi) "identify namespace, nsid=%"PRIu16", csi=0x%"PRIx8"" pci_nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16"" +pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "identify namespace list, nsid=%"PRIu16", csi=0x%"PRIx8"" +pci_nvme_list_ns_descriptors(void) "identify namespace descriptors" +pci_nvme_identify_cmd_set(void) "identify i/o command set" pci_nvme_getfeat_vwcache(const char* result) "get feature volatile write cache, result=%s" pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d" pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d" @@ -59,6 +64,8 @@ pci_nvme_mmio_stopped(void) "cleared controller enable bit" pci_nvme_mmio_shutdown_set(void) "shutdown bit set" pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared" pci_nvme_cmd_supp_and_effects_log_read(void) "commands supported and effects log read" +pci_nvme_css_nvm_cset_selected_by_host(uint32_t cc) "NVM command set selected by host, bar.cc=0x%"PRIx32"" +pci_nvme_css_all_csets_sel_by_host(uint32_t cc) "all supported command sets 
selected by host, bar.cc=0x%"PRIx32"" # nvme traces for error conditions pci_nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size" @@ -72,6 +79,9 @@ pci_nvme_err_invalid_admin_opc(uint8_t opc) "invalid admin opcode 0x%"PRIx8"" pci_nvme_err_invalid_lba_range(uint64_t start, uint64_t len, uint64_t limit) "Invalid LBA start=%"PRIu64" len=%"PRIu64" limit=%"PRIu64"" pci_nvme_err_invalid_effects_log_offset(uint64_t ofs) "commands supported and effects log offset must be 0, got %"PRIu64"" pci_nvme_err_invalid_effects_log_len(uint32_t len) "commands supported and effects log size is 4096, got %"PRIu32"" +pci_nvme_err_change_css_when_enabled(void) "changing CC.CSS while controller is enabled" +pci_nvme_err_only_nvm_cmd_set_avail(void) "setting 110b CC.CSS, but only NVM command set is enabled" +pci_nvme_err_invalid_iocsci(uint32_t idx) "unsupported command set combination index %"PRIu32"" pci_nvme_err_invalid_del_sq(uint16_t qid) "invalid submission queue deletion, sid=%"PRIu16"" pci_nvme_err_invalid_create_sq_cqid(uint16_t cqid) "failed creating submission queue, invalid cqid=%"PRIu16"" pci_nvme_err_invalid_create_sq_sqid(uint16_t sqid) "failed creating submission queue, invalid sqid=%"PRIu16"" @@ -127,6 +137,7 @@ pci_nvme_ub_db_wr_invalid_cqhead(uint32_t qid, uint16_t new_head) "completion qu pci_nvme_ub_db_wr_invalid_sq(uint32_t qid) "submission queue doorbell write for nonexistent queue, sqid=%"PRIu32", ignoring" pci_nvme_ub_db_wr_invalid_sqtail(uint32_t qid, uint16_t new_tail) "submission queue doorbell write value beyond queue size, sqid=%"PRIu32", new_head=%"PRIu16", ignoring" pci_nvme_unsupported_log_page(uint16_t lid) "unsupported log page 0x%"PRIx16"" +pci_nvme_ub_unknown_css_value(void) "unknown value in cc.css field" # xen-block.c xen_block_realize(const char *type, uint32_t disk, uint32_t partition) "%s d%up%u" -- 2.21.0
[PATCH 14/18] hw/block/nvme: Generate zone AENs
Added an optional Boolean "zone_async_events" property to the driver. Once it's turned on, the namespace will be sending "Zone Descriptor Changed" asynchronous events to the host in particular situations defined by the protocol. In order to clear these AENs, the host needs to read the newly added Changed Zones Log. Signed-off-by: Dmitry Fomichev --- hw/block/nvme.c | 300 ++- hw/block/nvme.h | 13 +- include/block/nvme.h | 23 +++- 3 files changed, 328 insertions(+), 8 deletions(-) diff --git a/hw/block/nvme.c b/hw/block/nvme.c index 6e6616f7e9..8dfb568173 100644 --- a/hw/block/nvme.c +++ b/hw/block/nvme.c @@ -200,12 +200,66 @@ static inline void nvme_aor_dec_active(NvmeCtrl *n, NvmeNamespace *ns) assert(ns->nr_active_zones >= 0); } +static bool nvme_complete_async_req(NvmeCtrl *n, NvmeNamespace *ns, +enum NvmeAsyncEventType type, uint8_t info) +{ +NvmeAsyncEvent *ae; +uint32_t nsid = 0; +uint8_t log_page = 0; + +switch (type) { +case NVME_AER_TYPE_ERROR: +case NVME_AER_TYPE_SMART: +break; +case NVME_AER_TYPE_NOTICE: +switch (info) { +case NVME_AER_NOTICE_ZONE_DESCR_CHANGED: +log_page = NVME_LOG_ZONE_CHANGED_LIST; +nsid = ns->nsid; +if (!(n->ae_cfg & NVME_AEN_CFG_ZONE_DESCR_CHNGD_NOTICES)) { +trace_pci_nvme_zone_ae_not_enabled(info, log_page, nsid); +return false; +} +if (ns->aen_pending) { +trace_pci_nvme_zone_ae_not_cleared(info, log_page, nsid); +return false; +} +ns->aen_pending = true; +} +break; +case NVME_AER_TYPE_CMDSET_SPECIFIC: +case NVME_AER_TYPE_VENDOR_SPECIFIC: +break; +} + +ae = g_malloc0(sizeof(*ae)); +ae->res = type; +ae->res |= (info << 8) & 0xff00; +ae->res |= (log_page << 16) & 0xff; +ae->nsid = nsid; + +QTAILQ_INSERT_TAIL(&n->async_reqs, ae, entry); +timer_mod(n->admin_cq.timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500); +return true; +} + +static inline void nvme_notify_zone_changed(NvmeCtrl *n, NvmeNamespace *ns, +NvmeZone *zone) +{ +if (n->ae_cfg) { +zone->flags |= NVME_ZFLAGS_AEN_PEND; +nvme_complete_async_req(n, ns, NVME_AER_TYPE_NOTICE, 
+NVME_AER_NOTICE_ZONE_DESCR_CHANGED); +} +} + static void nvme_set_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) { assert(zone->flags & NVME_ZFLAGS_SET_RZR); zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); zone->flags &= ~NVME_ZFLAGS_TS_DELAY; zone->d.za |= NVME_ZA_RESET_RECOMMENDED; +nvme_notify_zone_changed(n, ns, zone); zone->flags &= ~NVME_ZFLAGS_SET_RZR; trace_pci_nvme_zone_reset_recommended(zone->d.zslba); } @@ -214,10 +268,14 @@ static void nvme_clear_rzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, bool notify) { if (n->params.rrl_usec) { -zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY); +zone->flags &= ~(NVME_ZFLAGS_SET_RZR | NVME_ZFLAGS_TS_DELAY | + NVME_ZFLAGS_AEN_PEND); notify = notify && (zone->d.za & NVME_ZA_RESET_RECOMMENDED); zone->d.za &= ~NVME_ZA_RESET_RECOMMENDED; zone->tstamp = 0; +if (notify) { +nvme_notify_zone_changed(n, ns, zone); +} } } @@ -227,6 +285,7 @@ static void nvme_set_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) zone->tstamp = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); zone->flags &= ~NVME_ZFLAGS_TS_DELAY; zone->d.za |= NVME_ZA_FINISH_RECOMMENDED; +nvme_notify_zone_changed(n, ns, zone); zone->flags &= ~NVME_ZFLAGS_SET_FZR; trace_pci_nvme_zone_finish_recommended(zone->d.zslba); } @@ -235,13 +294,61 @@ static void nvme_clear_fzr(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone, bool notify) { if (n->params.frl_usec) { -zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY); +zone->flags &= ~(NVME_ZFLAGS_SET_FZR | NVME_ZFLAGS_TS_DELAY | + NVME_ZFLAGS_AEN_PEND); notify = notify && (zone->d.za & NVME_ZA_FINISH_RECOMMENDED); zone->d.za &= ~NVME_ZA_FINISH_RECOMMENDED; zone->tstamp = 0; +if (notify) { +nvme_notify_zone_changed(n, ns, zone); +} } } +static bool nvme_process_rrl(NvmeCtrl *n, NvmeNamespace *ns, NvmeZone *zone) +{ +if (zone->flags & NVME_ZFLAGS_SET_RZR) { +if (zone->flags & NVME_ZFLAGS_TS_DELAY) { +assert(!(zone->d.za & NVME_ZA_RESET_RECOMMENDED)); +if 
(qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - zone->tstamp >= +n->params.rzr_delay_usec) { +nvme_set_rzr(n, ns, zone); +return true; +} +} else if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - zone->tstamp >= + n->params.rrl_us
[PATCH 00/18] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set
Zoned Namespace (ZNS) Command Set is a newly introduced command set published by the NVM Express, Inc. organization as TP 4053. The main design goals of ZNS are to provide hardware designers the means to reduce NVMe controller complexity and to allow achieving a better I/O latency and throughput. SSDs that implement this interface are commonly known as ZNS SSDs. This command set is implementing a zoned storage model, similarly to ZAC/ZBC. As such, there is already support in Linux, allowing one to perform the majority of tasks needed for managing ZNS SSDs. The Zoned Namespace Command Set relies on another TP, known as Namespace Types (NVMe TP 4056), which introduces support for having multiple command sets per namespace. Both ZNS and Namespace Types specifications can be downloaded by visiting the following link - https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs.zip This patch series adds Namespace Types support and zoned namespace emulation capability to the existing NVMe PCI driver. The patchset is organized as follows - The first several patches are preparatory and are added to allow for an easier review of the subsequent commits. The group of patches that follows adds NS Types support with only NVM Command Set being available. Finally, the last group of commits makes definitions and adds new code to support Zoned Namespace Command Set. This series is based on block-next/block branch. 
Based-on: <20200608190821.3293867-1-ebl...@redhat.com> Ajay Joshi (1): hw/block/nvme: Define 64 bit cqe.result Dmitry Fomichev (15): hw/block/nvme: Move NvmeRequest has_sg field to a bit flag hw/block/nvme: Clean up unused AER definitions hw/block/nvme: Add Commands Supported and Effects log hw/block/nvme: Define trace events related to NS Types hw/block/nvme: Make Zoned NS Command Set definitions hw/block/nvme: Define Zoned NS Command Set trace events hw/block/nvme: Support Zoned Namespace Command Set hw/block/nvme: Introduce max active and open zone limits hw/block/nvme: Simulate Zone Active excursions hw/block/nvme: Set Finish/Reset Zone Recommended attributes hw/block/nvme: Generate zone AENs hw/block/nvme: Support Zone Descriptor Extensions hw/block/nvme: Add injection of Offline/Read-Only zones hw/block/nvme: Use zone metadata file for persistence hw/block/nvme: Document zoned parameters in usage text Niklas Cassel (2): hw/block/nvme: Introduce the Namespace Types definitions hw/block/nvme: Add support for Namespace Types block/nvme.c |2 +- block/trace-events|2 +- hw/block/nvme.c | 2316 - hw/block/nvme.h | 228 +++- hw/block/trace-events | 56 + include/block/nvme.h | 282 - 6 files changed, 2820 insertions(+), 66 deletions(-) -- 2.21.0
Re: [PATCH v2 2/2] block: Call attention to truncation of long NBD exports
On 6/10/20 11:37 AM, Eric Blake wrote: Commit 93676c88 relaxed our NBD client code to request export names up to the NBD protocol maximum of 4096 bytes without NUL terminator, even though the block layer can't store anything longer than 4096 bytes including NUL terminator for display to the user. Since this means there are some export names where we have to truncate things, we can at least try to make the truncation a bit more obvious for the user. Note that in spite of the truncated display name, we can still communicate with an NBD server using such a long export name; this was deemed nicer than refusing to even connect to such a server (since the server may not be under our control, and since determining our actual length limits gets tricky when nbd://host:port/export and nbd+unix:///export?socket=/path are themselves variable-length expansions beyond the export name but count towards the block layer name length). +++ b/block/nbd.c } else if (host && !s->export) { -snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s", host, port); +len = snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd://%s:%s", host, port); +} +if (len > sizeof(bs->exact_filename)) { Max pointed out off-list that this is off-by-one: by not using >=, I am failing to detect overflow when exactly one byte is truncated. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 1/2] nbd/server: Avoid long error message assertions CVE-2020-10761
On 6/10/20 11:37 AM, Eric Blake wrote: We may later want to further sanitize the user-supplied strings we place into our error messages, such as scrubbing out control characters, but that is less important to the CVE fix, so it can be a later patch to the new nbd_sanitize_name. +static char * +nbd_sanitize_name(const char *name) +{ +if (strnlen(name, 80) < 80) { +return g_strdup(name); +} +/* XXX Should we also try to sanitize any control characters? */ +return g_strdup_printf("%.80s...", name); Max pointed out off-list that this can take a valid UTF-8 name from the client and truncate it mid-character to make our reply NOT valid UTF-8, which is a (minor) violation of the NBD protocol. We have not yet implemented strict UTF-8 enforcement in qemu (neither our client nor server code takes pains to only send UTF-8, nor validates that incoming strings are valid UTF-8); and while the server would previously echo non-UTF-8 (where the client violated protocol first), this is now a case where the server can be coerced into violating protocol first. I guess I may end up doing a followup patch that adds incoming validation and in the process avoids chopping a multi-byte character, but that's just as easy to fold in with my question about sanitizing control characters. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH] block: file-posix: Fail unmap with NO_FALLBACK on block device
On Sat, Jun 13, 2020 at 8:08 PM Nir Soffer wrote: > > Punching holes on block device uses blkdev_issue_zeroout() with > BLKDEV_ZERO_NOFALLBACK but there is no guarantee that this is fast > enough for pre-zeroing an entire device. > > Zeroing block device can be as slow as writing zeroes or 100 times faster, > depending on the storage. There is no way to tell if zeroing is fast > enough. The kernel BLKDEV_ZERO_NOFALLBACK flag does not mean that the > operation is fast; it just means that the kernel will not fall back to > manual zeroing. > > Here is an example converting 10g image with 8g of data to block device: > > $ ./qemu-img info test.img > image: test.img > file format: raw > virtual size: 10 GiB (10737418240 bytes) > disk size: 8 GiB > > $ time ./qemu-img convert -f raw -O raw -t none -T none -W test.img > /dev/test/lv1 > > Before: > > real1m20.483s > user0m0.490s > sys 0m0.739s > > After: > > real0m55.831s > user0m0.610s > sys 0m0.956s I did more testing with real server and storage, and the results confirm what I reported here in my vm based environment and poor storage.
Testing this LUN: # multipath -ll 3600a098038304437415d4b6a59684a52 dm-3 NETAPP,LUN C-Mode size=5.0T features='3 queue_if_no_path pg_init_retries 50' hwhandler='1 alua' wp=rw |-+- policy='service-time 0' prio=50 status=active | |- 18:0:0:0 sdb 8:16 active ready running | `- 19:0:0:0 sdc 8:32 active ready running `-+- policy='service-time 0' prio=10 status=enabled |- 20:0:0:0 sdd 8:48 active ready running `- 21:0:0:0 sde 8:64 active ready running The destination is 100g logical volume on this LUN: # qemu-img info test-lv image: test-lv file format: raw virtual size: 100 GiB (107374182400 bytes) disk size: 0 B The source image is 100g image with 48g of data: # qemu-img info fedora-31-100g-50p.raw image: fedora-31-100g-50p.raw file format: raw virtual size: 100 GiB (107374182400 bytes) disk size: 48.4 GiB We can zero 2.3 g/s: # time blkdiscard -z test-lv real 0m43.902s user 0m0.002s sys 0m0.130s (I should really test with fallocate instead of blkdiscard, but the results look the same.) # iostat -xdm dm-3 5 Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-320.80 301.40 0.98 2323.31 0.00 0.00 0.00 0.00 26.56 854.50 257.9448.23 7893.41 0.73 23.58 dm-315.20 297.20 0.80 2321.67 0.00 0.00 0.00 0.00 26.43 836.06 248.7253.80 7999.30 0.78 24.22 We can write 445m/s: # dd if=/dev/zero bs=2M count=51200 of=test-lv oflag=direct conv=fsync 107374182400 bytes (107 GB, 100 GiB) copied, 241.257 s, 445 MB/s # iostat -xdm dm-3 5 Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-3 6.60 6910.00 0.39431.85 0.00 0.00 0.00 0.002.482.70 15.1960.7364.00 0.14 98.84 dm-340.80 6682.60 1.59417.61 0.00 0.00 0.00 0.001.712.73 14.9240.0063.99 0.15 97.60 dm-3 6.60 6887.40 0.39430.46 0.00 0.00 0.00 0.002.152.66 14.9260.7364.00 0.14 98.22 Testing latest qemu-img: # rpm -q qemu-img qemu-img-4.2.0-22.module+el8.2.1+6758+cb8d64c2.x86_64 # time qemu-img convert -p -f raw -O raw -t none -W 
fedora-31-100g-50p.raw test-lv (100.00/100%) real 2m2.337s user 0m2.708s sys 0m17.326s # iostat -xdm dm-3 5 pre zero phase: Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-324.00 265.40 1.00 2123.20 0.00 0.00 0.00 0.00 36.81 543.52 144.9942.48 8192.00 0.70 20.14 dm-3 9.60 283.60 0.59 2265.60 0.00 0.00 0.00 0.00 35.42 576.80 163.7862.50 8180.44 0.70 20.58 dm-324.00 272.00 1.00 2176.00 0.00 0.00 0.00 0.00 22.89 512.40 139.7742.48 8192.00 0.67 19.90 copy phase: Devicer/s w/s rMB/s wMB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util dm-327.20 10671.20 1.19655.84 0.00 0.00 0.00 0.002.70 10.99 111.9844.8362.93 0.09 96.74 dm-3 6.40 11537.00 0.39712.33 0.00 0.00 0.00 0.003.00 11.90 131.5262.5063.23 0.08 97.82 dm-327.20 12400.20 1.19765.47 0.00 0.00 0.00 0.003.60 11.16 132.3144.8363.21 0.08 95.50 dm-3 9.60 11312.60 0.59698.20 0.00 0.20 0.00 0.003.73 11.69 126.6463.0063.20 0.09 97.70 Testing latest qemu-img + this patch: # rpm -q qemu-img qemu-img-4.2.0-25.module+el8.2.1+6815+1c792dc8.nsoffer202006140516.x86_64 # time qemu-img convert
Re: [PATCH v8 0/8] block: enhance handling of size-related BlockConf properties
On Fri, May 29, 2020 at 01:55:08AM +0300, Roman Kagan wrote: > BlockConf includes several properties counted in bytes. > > Enhance their handling in some aspects, specifically > > - accept common size suffixes (k, m) > - perform consistency checks on the values > - lift the upper limit on physical_block_size and logical_block_size > > Also fix the accessor for opt_io_size in virtio-blk to make it consistent with > the size of the field. > > History: > v7 -> v8: > - replace stringify with %u in error messages [Eric] > - fix wording in logs [Eric] > > v6 -> v7: > - avoid overflow in min_io_size check [Eric] > - try again to perform the art form in patch splitting [Eric] > > v5 -> v6: > - fix forgotten xen-block and swim > - add prop_size32 instead of going with 64bit > > v4 -> v5: > - re-split the patches [Philippe] > - fix/reword error messages [Philippe, Kevin] > - do early return on failed consistency check [Philippe] > - use QEMU_IS_ALIGNED instead of open coding [Philippe] > - make all BlockConf size props support suffixes > - expand the log for virtio-blk opt_io_size [Michael] > > v3 -> v4: > - add patch to fix opt_io_size width in virtio-blk > - add patch to perform consistency checks [Kevin] > - check min_io_size against truncation [Kevin] > > v2 -> v3: > - mention qcow2 cluster size limit in the log and comment [Eric] > > v1 -> v2: > - cap the property at 2 MiB [Eric] > - accept size suffixes > > Roman Kagan (8): > virtio-blk: store opt_io_size with correct size > block: consolidate blocksize properties consistency checks > qdev-properties: blocksize: use same limits in code and description > qdev-properties: add size32 property type > qdev-properties: make blocksize accept size suffixes > block: make BlockConf size props 32bit and accept size suffixes > qdev-properties: add getter for size32 and blocksize > block: lift blocksize property limit to 2 MiB > > include/hw/block/block.h | 14 +- > include/hw/qdev-properties.h | 5 +- > hw/block/block.c | 40 ++- > 
hw/block/fdc.c | 5 +- > hw/block/nvme.c | 5 +- > hw/block/swim.c | 5 +- > hw/block/virtio-blk.c| 9 +- > hw/block/xen-block.c | 6 +- > hw/core/qdev-properties.c| 85 +- > hw/ide/qdev.c| 5 +- > hw/scsi/scsi-disk.c | 12 +- > hw/usb/dev-storage.c | 5 +- > tests/qemu-iotests/172.out | 532 +-- > 13 files changed, 419 insertions(+), 309 deletions(-) Ping?
Re: [PATCH v7 00/22] nvme: small fixes, refactoring and cleanups
Am 09.06.2020 um 21:03 hat Klaus Jensen geschrieben: > From: Klaus Jensen > > Hi all, > > As per our discussion about how to amend the bug I introduced in > "hw/block/nvme: allow use of any valid msix vector", this is a respin > without that patch. > > Kevin, it applies cleanly on top of your block tree with all current > hw/block/nvme patches removed. Thanks, applied to the block branch. Kevin
Re: [PATCH v3 11/11] hw/sd/sdcard: Simplify realize() a bit
On Fri, 5 Jun 2020 at 11:28, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > We don't need to check if sd->blk is set twice. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/hw/sd/sd.c b/hw/sd/sd.c > index a1b25ed36f..060ca9d993 100644 > --- a/hw/sd/sd.c > +++ b/hw/sd/sd.c > @@ -2123,12 +2123,12 @@ static void sd_realize(DeviceState *dev, Error **errp) > return; > } > > -if (sd->blk && blk_is_read_only(sd->blk)) { > -error_setg(errp, "Cannot use read-only drive as SD card"); > -return; > -} > - > if (sd->blk) { > +if (blk_is_read_only(sd->blk)) { > +error_setg(errp, "Cannot use read-only drive as SD card"); > +return; > +} > + > ret = blk_set_perm(sd->blk, BLK_PERM_CONSISTENT_READ | > BLK_PERM_WRITE, > BLK_PERM_ALL, errp); > if (ret < 0) { Originally written with the pattern of "check all the error cases first; then do actual work". But if you prefer this way around Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 09/11] hw/sd/sdcard: Correctly display the command name in trace events
On Fri, 5 Jun 2020 at 11:25, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Some ACMD were incorrectly displayed. Fix by remembering if we > are processing an ACMD (with current_cmd_is_acmd) and add the > sd_current_cmd_name() helper, which displays the correct name > regardless of whether it is a CMD or ACMD. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 17 ++--- > 1 file changed, 14 insertions(+), 3 deletions(-) > > diff --git a/hw/sd/sd.c b/hw/sd/sd.c > index 952be36399..fad34ab184 100644 > --- a/hw/sd/sd.c > +++ b/hw/sd/sd.c > @@ -114,6 +114,7 @@ struct SDState { > uint8_t pwd[16]; > uint32_t pwd_len; > uint8_t function_group[6]; > +bool current_cmd_is_acmd; This is extra state, so strictly speaking it needs to be migrated (though the only thing we would get wrong is a possible wrong trace message after a migration load). > uint8_t current_cmd; > /* True if we will handle the next command as an ACMD. Note that this > does > * *not* track the APP_CMD status bit! > @@ -1687,6 +1688,8 @@ int sd_do_command(SDState *sd, SDRequest *req, >req->cmd); > req->cmd &= 0x3f; > } > +sd->current_cmd = req->cmd; > +sd->current_cmd_is_acmd = sd->expecting_acmd; I'm not 100% sure about moving the update of sd->current_cmd down here -- if it's an illegal command that seems wrong. > if (sd->card_status & CARD_IS_LOCKED) { > if (!cmd_valid_while_locked(sd, req->cmd)) { > @@ -1714,7 +1717,6 @@ int sd_do_command(SDState *sd, SDRequest *req, > /* Valid command, we can update the 'state before command' bits. > * (Do this now so they appear in r1 responses.) > */ > -sd->current_cmd = req->cmd; > sd->card_status &= ~CURRENT_STATE; > sd->card_status |= (last_state << 9); > } > @@ -1775,6 +1777,15 @@ send_response: > return rsplen; > } thanks -- PMM
Re: [PATCH v3 10/11] hw/sd/sdcard: Display offset in read/write_data() trace events
On Fri, 5 Jun 2020 at 11:32, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Having 'base address' and 'relative offset' displayed > separately is more helpful than the absolute address. > > Signed-off-by: Philippe Mathieu-Daudé > --- Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 08/11] hw/sd/sdcard: Make iolen unsigned
On Fri, 5 Jun 2020 at 11:27, Philippe Mathieu-Daudé wrote: > > I/O request length can not be negative. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 2 +- > hw/sd/trace-events | 2 +- > 2 files changed, 2 insertions(+), 2 deletions(-) > > diff --git a/hw/sd/sd.c b/hw/sd/sd.c > index 9d51138b11..952be36399 100644 > --- a/hw/sd/sd.c > +++ b/hw/sd/sd.c > @@ -1950,7 +1950,7 @@ uint8_t sd_read_data(SDState *sd) > { > /* TODO: Append CRCs */ > uint8_t ret; > -int io_len; > +size_t io_len; size_t seems an odd choice -- we initialize it with io_len = (sd->ocr & (1 << 30)) ? 512 : sd->blk_len; where sd->blk_len is a uint32_t, and we use it mainly with BLK_READ_BLOCK(sd->data_start, io_len); where BLK_READ_BLOCK is a rather unnecessary macroization of sd_blk_read(), which takes a uint32_t. thanks -- PMM
Re: [PATCH] hw/scsi/megasas: Fix possible out-of-bounds array access in tracepoints
On 15/06/20 09:26, Thomas Huth wrote: > Some tracepoints in megasas.c use a guest-controlled value as an index > into the mfi_frame_desc[] array. Thus a malicious guest could cause an > out-of-bounds error here. Fortunately, the impact is very low since this > can only happen when the corresponding tracepoints have been enabled > before, but the problem should be fixed anyway with a proper check. > > Buglink: https://bugs.launchpad.net/qemu/+bug/1882065 > Signed-off-by: Thomas Huth > --- > hw/scsi/megasas.c | 36 +++- > 1 file changed, 23 insertions(+), 13 deletions(-) > > diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c > index af18c88b65..aa930226f8 100644 > --- a/hw/scsi/megasas.c > +++ b/hw/scsi/megasas.c > @@ -54,10 +54,6 @@ > #define MEGASAS_FLAG_USE_QUEUE64 1 > #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) > > -static const char *mfi_frame_desc[] = { > -"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > -"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop"}; > - > typedef struct MegasasCmd { > uint32_t index; > uint16_t flags; > @@ -183,6 +179,20 @@ static void megasas_frame_set_scsi_status(MegasasState > *s, > stb_pci_dma(pci, frame + offsetof(struct mfi_frame_header, scsi_status), > v); > } > > +static inline const char *mfi_frame_desc(unsigned int cmd) > +{ > +static const char *mfi_frame_descs[] = { > +"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > +"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop" > +}; > + > +if (cmd < ARRAY_SIZE(mfi_frame_descs)) { > +return mfi_frame_descs[cmd]; > +} > + > +return "Unknown"; > +} > + > /* > * Context is considered opaque, but the HBA firmware is running > * in little endian mode. So convert it to little endian, too. 
> @@ -1670,25 +1680,25 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > if (is_logical) { > if (target_id >= MFI_MAX_LD || lun_id != 0) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > } > sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); > > cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); > -trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, > +trace_megasas_handle_scsi(mfi_frame_desc(frame_cmd), is_logical, >target_id, lun_id, sdev, cmd->iov_size); > > if (!sdev || (megasas_is_jbod(s) && is_logical)) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], is_logical, > +mfi_frame_desc(frame_cmd), is_logical, > target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > @@ -1706,7 +1716,7 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); > if (!cmd->req) { > trace_megasas_scsi_req_alloc_failed( > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); > cmd->frame->header.scsi_status = BUSY; > s->event_count++; > @@ -1751,17 +1761,17 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) > } > > trace_megasas_handle_io(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id, > +mfi_frame_desc(frame_cmd), target_id, lun_id, > (unsigned long)lba_start, (unsigned > long)lba_count); > if (!sdev) { > 
trace_megasas_io_target_not_present(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); > +mfi_frame_desc(frame_cmd), 1, target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > s->event_count++; > @@ -1781,7 +1791,7 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) >
Re: [PATCH v3 06/11] hw/sd/sdcard: Simplify cmd_valid_while_locked()
On Fri, 5 Jun 2020 at 11:27, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > cmd_valid_while_locked() only needs to read SDRequest->cmd, > pass it directly and make it const. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 11 +-- > 1 file changed, 5 insertions(+), 6 deletions(-) Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 05/11] hw/sd/sdcard: Update the SDState documentation
On Fri, 5 Jun 2020 at 11:22, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Add more descriptive comments to keep a clear separation > between static property vs runtime changeable. > > Suggested-by: Peter Maydell > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 5 - > 1 file changed, 4 insertions(+), 1 deletion(-) Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 03/11] hw/sd/sdcard: Do not switch to ReceivingData if address is invalid
On Fri, 5 Jun 2020 at 11:25, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Only move the state machine to ReceivingData if there is no > pending error. This avoids later OOB access while processing > commands queued. > > "SD Specifications Part 1 Physical Layer Simplified Spec. v3.01" > > 4.3.3 Data Read > > Read command is rejected if BLOCK_LEN_ERROR or ADDRESS_ERROR > occurred and no data transfer is performed. > > 4.3.4 Data Write > > Write command is rejected if BLOCK_LEN_ERROR or ADDRESS_ERROR > occurred and no data transfer is performed. It's not clear from the spec that this should also apply to WP_VIOLATION errors. The text about WP_VIOLATION suggests that it is handled by aborting the data transfer (ie set the error bit, stay in receive-data state, wait for a stop command, but ignore all further data transfer), which is I think distinct from "rejecting" the command. If that theory is right then moving the check for the ADDRESS_ERROR in this patch is correct but the WP_VIOLATION tests should stay as they are, I think. NB: is the buffer overrun we're trying to protect against caused by passing sd_wp_addr() a bad address? Maybe we should assert in sd_addr_to_wpnum() that the address is in range, as a defence. thanks -- PMM
Re: [PATCH v3 02/11] hw/sd/sdcard: Update coding style to make checkpatch.pl happy
On Fri, 5 Jun 2020 at 11:24, Philippe Mathieu-Daudé wrote: > > To make the next commit easier to review, clean this code first. > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/sd/sd.c | 24 > 1 file changed, 16 insertions(+), 8 deletions(-) Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 04/11] hw/sd/sdcard: Restrict Class 6 commands to SCSD cards
On Fri, 5 Jun 2020 at 11:23, Philippe Mathieu-Daudé wrote: > > From: Philippe Mathieu-Daudé > > Only SCSD cards support Class 6 (Block Oriented Write Protection) > commands. > > "SD Specifications Part 1 Physical Layer Simplified Spec. v3.01" > > 4.3.14 Command Functional Difference in Card Capacity Types > > * Write Protected Group > > SDHC and SDXC do not support write-protected groups. Issuing > CMD28, CMD29 and CMD30 generates the ILLEGAL_COMMAND error. > > Signed-off-by: Philippe Mathieu-Daudé > --- Reviewed-by: Peter Maydell thanks -- PMM
Re: [PATCH v3 0/4] block: seriously improve savevm performance
* Denis V. Lunev (d...@openvz.org) wrote: > On 6/15/20 3:17 PM, Dr. David Alan Gilbert wrote: > > * Denis V. Lunev (d...@openvz.org) wrote: > >> This series do standard basic things: > >> - it creates intermediate buffer for all writes from QEMU migration code > >> to QCOW2 image, > >> - this buffer is sent to disk asynchronously, allowing several writes to > >> run in parallel. > >> > >> In general, migration code is fantastically inefficent (by observation), > >> buffers are not aligned and sent with arbitrary pieces, a lot of time > >> less than 100 bytes at a chunk, which results in read-modify-write > >> operations with non-cached operations. It should also be noted that all > >> operations are performed into unallocated image blocks, which also suffer > >> due to partial writes to such new clusters. > > It surprises me a little that you're not benefiting from the buffer > > internal to qemu-file.c > > > > Dave > There are a lot of problems with this buffer: > > Flushes to block driver state are performed in the abstract places, > pushing > Â a) small IO > Â b) non-aligned IO both to > Â Â Â Â Â Â 1) page size > Â Â Â Â Â Â 2) cluster size > It should also be noted that buffer in QEMU file is quite small and > all IO operations with it are synchronous. IO, like ethernet, wants > good queues. Yeh, for ethernet we immediately get the kernels buffer so it's not too bad; and I guess the async page writes are easier as well. Dave > The difference is on the table. > > Den > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v3 0/4] block: seriously improve savevm performance
On 6/15/20 3:17 PM, Dr. David Alan Gilbert wrote: > * Denis V. Lunev (d...@openvz.org) wrote: >> This series do standard basic things: >> - it creates intermediate buffer for all writes from QEMU migration code >> to QCOW2 image, >> - this buffer is sent to disk asynchronously, allowing several writes to >> run in parallel. >> >> In general, migration code is fantastically inefficient (by observation), >> buffers are not aligned and sent with arbitrary pieces, a lot of time >> less than 100 bytes at a chunk, which results in read-modify-write >> operations with non-cached operations. It should also be noted that all >> operations are performed into unallocated image blocks, which also suffer >> due to partial writes to such new clusters. > It surprises me a little that you're not benefiting from the buffer > internal to qemu-file.c > > Dave There are a lot of problems with this buffer: Flushes to block driver state are performed in the abstract places, pushing a) small IO b) non-aligned IO both to 1) page size 2) cluster size It should also be noted that buffer in QEMU file is quite small and all IO operations with it are synchronous. IO, like ethernet, wants good queues. The difference is on the table. Den
Re: [PATCH v3 0/4] block: seriously improve savevm performance
* Denis V. Lunev (d...@openvz.org) wrote: > This series do standard basic things: > - it creates intermediate buffer for all writes from QEMU migration code > to QCOW2 image, > - this buffer is sent to disk asynchronously, allowing several writes to > run in parallel. > > In general, migration code is fantastically inefficient (by observation), > buffers are not aligned and sent with arbitrary pieces, a lot of time > less than 100 bytes at a chunk, which results in read-modify-write > operations with non-cached operations. It should also be noted that all > operations are performed into unallocated image blocks, which also suffer > due to partial writes to such new clusters. It surprises me a little that you're not benefiting from the buffer internal to qemu-file.c Dave > This patch series is an implementation of idea discussed in the RFC > posted by Denis Plotnikov > https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg01925.html > Results with this series over NVME are better than original code > original rfc this > cached: 1.79s 2.38s 1.27s > non-cached: 3.29s 1.31s 0.81s > > Changes from v2: > - code moved from QCOW2 level to generic block level > - created bdrv_flush_vmstate helper to fix 022, 029 tests > - added recursive for bs->file in bdrv_co_flush_vmstate (fix 267) > - fixed blk_save_vmstate helper > - fixed coroutine wait as Vladimir suggested with waiting fixes from me > > Changes from v1: > - patchew warning fixed > - fixed validation that only 1 waiter is allowed in patch 1 > > Signed-off-by: Denis V. Lunev > CC: Kevin Wolf > CC: Max Reitz > CC: Stefan Hajnoczi > CC: Fam Zheng > CC: Juan Quintela > CC: "Dr. David Alan Gilbert" > CC: Vladimir Sementsov-Ogievskiy > CC: Denis Plotnikov > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH 1/4] migration/savevm: respect qemu_fclose() error code in save_snapshot()
* Denis V. Lunev (d...@openvz.org) wrote: > qemu_fclose() could return error, f.e. if bdrv_co_flush() will return > the error. > > This validation will become more important once we will start waiting of > asynchronous IO operations, started from bdrv_write_vmstate(), which are > coming soon. > > Signed-off-by: Denis V. Lunev > CC: Kevin Wolf > CC: Max Reitz > CC: Stefan Hajnoczi > CC: Fam Zheng > CC: Juan Quintela > CC: "Dr. David Alan Gilbert" > CC: Vladimir Sementsov-Ogievskiy > CC: Denis Plotnikov We check the return value in very few other places; I think in the migration case we do flushes and assume that if the flushes work we were OK; then most of the closes happen on error paths or after points we think we're done. Reviewed-by: Dr. David Alan Gilbert > --- > migration/savevm.c | 8 ++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/migration/savevm.c b/migration/savevm.c > index c00a6807d9..0ff5bb40ed 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -2628,7 +2628,7 @@ int save_snapshot(const char *name, Error **errp) > { > BlockDriverState *bs, *bs1; > QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1; > -int ret = -1; > +int ret = -1, ret2; > QEMUFile *f; > int saved_vm_running; > uint64_t vm_state_size; > @@ -2712,10 +2712,14 @@ int save_snapshot(const char *name, Error **errp) > } > ret = qemu_savevm_state(f, errp); > vm_state_size = qemu_ftell(f); > -qemu_fclose(f); > +ret2 = qemu_fclose(f); > if (ret < 0) { > goto the_end; > } > +if (ret2 < 0) { > +ret = ret2; > +goto the_end; > +} > > /* The bdrv_all_create_snapshot() call that follows acquires the > AioContext > * for itself. BDRV_POLL_WHILE() does not support nested locking because > -- > 2.17.1 > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH v7 4/9] qcow2_format.py: Dump bitmap directory information
12.06.2020 03:04, Andrey Shinkevich wrote: Read and dump entries from the bitmap directory of QCOW2 image. It extends the output in the test case #291. Header extension: magic 0x23852875 (Bitmaps) ... Bitmap name bitmap-1 flag auto table size8 (bytes) bitmap_table_offset 0x9 bitmap_table_size 1 flags 0 type 1 granularity_bits 16 name_size 8 extra_data_size 0 Suggested-by: Kevin Wolf Signed-off-by: Andrey Shinkevich Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Hmm I don't remember me gave it.. --- tests/qemu-iotests/291.out | 52 ++ tests/qemu-iotests/qcow2_format.py | 75 ++ 2 files changed, 127 insertions(+) diff --git a/tests/qemu-iotests/291.out b/tests/qemu-iotests/291.out index ccfcdc5..d847419 100644 --- a/tests/qemu-iotests/291.out +++ b/tests/qemu-iotests/291.out @@ -33,6 +33,27 @@ reserved320 bitmap_directory_size 0x40 bitmap_directory_offset 0x51 +Bitmap name b1 +table size8 (bytes) +bitmap_table_offset 0x4e +bitmap_table_size 1 +flags 0 +type 1 +granularity_bits 19 +name_size 2 +extra_data_size 0 + +Bitmap name b2 +flag auto +table size8 (bytes) +bitmap_table_offset 0x50 +bitmap_table_size 1 +flags 2 both having flags and flag in the output looks strange. If you want good human look of flags field, you should implement a special formatter-class for it, like Flags64. 
Maybe, something like this: flags 0x3 (in_use auto) +type 1 +granularity_bits 16 +name_size 2 +extra_data_size 0 + === Bitmap preservation not possible to non-qcow2 === @@ -98,6 +119,37 @@ reserved320 bitmap_directory_size 0x60 bitmap_directory_offset 0x52 +Bitmap name b1 +table size8 (bytes) +bitmap_table_offset 0x47 +bitmap_table_size 1 +flags 0 +type 1 +granularity_bits 19 +name_size 2 +extra_data_size 0 + +Bitmap name b2 +flag auto +table size8 (bytes) +bitmap_table_offset 0x49 +bitmap_table_size 1 +flags 2 +type 1 +granularity_bits 16 +name_size 2 +extra_data_size 0 + +Bitmap name b0 +table size8 (bytes) +bitmap_table_offset 0x51 +bitmap_table_size 1 +flags 0 +type 1 +granularity_bits 16 +name_size 2 +extra_data_size 0 + === Check bitmap contents === diff --git a/tests/qemu-iotests/qcow2_format.py b/tests/qemu-iotests/qcow2_format.py index d4f..90eff92 100644 --- a/tests/qemu-iotests/qcow2_format.py +++ b/tests/qemu-iotests/qcow2_format.py @@ -103,6 +103,10 @@ class Qcow2Struct(metaclass=Qcow2StructMeta): print('{:<25} {}'.format(f[2], value_str)) +# seek relative to the current position in the file +FROM_CURRENT = 1 Firstly, I though that it's a global variable to control class behavior. Only than I understood that you just decided to use a named constant instead of just 1... So, I'd prefer to use just 1. + + class Qcow2BitmapExt(Qcow2Struct): fields = ( @@ -112,6 +116,73 @@ class Qcow2BitmapExt(Qcow2Struct): ('u64', '{:#x}', 'bitmap_directory_offset') ) +def read_bitmap_directory(self, fd): +self.bitmaps = [] +fd.seek(self.bitmap_directory_offset) +buf_size = struct.calcsize(Qcow2BitmapDirEntry.fmt) + +for n in range(self.nb_bitmaps): +buf = fd.read(buf_size) +dir_entry = Qcow2BitmapDirEntry(data=buf) Base class of Qcow2BitmapDirEntry can load from fd. We should definitely utilize this possibility, instead of writing it again. 
+fd.seek(dir_entry.extra_data_size, FROM_CURRENT) +bitmap_name = fd.read(dir_entry.name_size) +dir_entry.name = bitmap_name.decode('ascii') You should initialize object in its constructor, not by hand here. Actually, the code here should look like this: self.bitmap_directory = [Qcow2BitmapDirEntry(fd) for _ in range(self.nb_bitmaps)] OK, you may leave a loop, and even calculating of final alignment here, but real fields should be read and initialized in Qcow2BitmapDirEntry constructor +self.bitmaps.append(dir_entry) +entry_raw_size = dir_entry.bitmap_dir_entry_raw_size() +shift = ((
Re: [PATCH v7 3/9] qcow2_format.py: make printable data an extension class member
12.06.2020 03:04, Andrey Shinkevich wrote: Let us differ binary data type from string one for the extension data variable and keep the string as the QcowHeaderExtension class member. Keep my r-b, but I just want to note, that I missed, what is the actual aim of this change.. Signed-off-by: Andrey Shinkevich Reviewed-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy --- tests/qemu-iotests/qcow2_format.py | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/qcow2_format.py b/tests/qemu-iotests/qcow2_format.py index 0f65fd1..d4f 100644 --- a/tests/qemu-iotests/qcow2_format.py +++ b/tests/qemu-iotests/qcow2_format.py @@ -164,6 +164,13 @@ class QcowHeaderExtension(Qcow2Struct): self.data = fd.read(padded) assert self.data is not None +data_str = self.data[:self.length] +if all(c in string.printable.encode('ascii') for c in data_str): +data_str = f"'{ data_str.decode('ascii') }'" +else: +data_str = '' +self.data_str = data_str + if self.magic == QCOW2_EXT_MAGIC_BITMAPS: self.obj = Qcow2BitmapExt(data=self.data) else: @@ -173,12 +180,7 @@ class QcowHeaderExtension(Qcow2Struct): super().dump() if self.obj is None: -data = self.data[:self.length] -if all(c in string.printable.encode('ascii') for c in data): -data = f"'{ data.decode('ascii') }'" -else: -data = '' -print(f'{"data":<25} {data}') +print(f'{"data":<25} {self.data_str}') else: self.obj.dump() -- Best regards, Vladimir
Re: [PATCH v7 2/9] qcow2: Fix capitalization of header extension constant.
12.06.2020 03:04, Andrey Shinkevich wrote: Make the capitalization of the hexadecimal numbers consistent for the QCOW2 header extension constants in docs/interop/qcow2.txt. Suggested-by: Eric Blake Signed-off-by: Andrey Shinkevich Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH v7 1/9] iotests: Fix for magic hexadecimal output in 291
12.06.2020 03:04, Andrey Shinkevich wrote: This issue was introduced in the earlier patch: "qcow2_format: refactor QcowHeaderExtension as a subclass of Qcow2Struct". Signed-off-by: Andrey Shinkevich This change was squashed to original commit --- tests/qemu-iotests/291.out | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/qemu-iotests/291.out b/tests/qemu-iotests/291.out index 1d4f9cd..ccfcdc5 100644 --- a/tests/qemu-iotests/291.out +++ b/tests/qemu-iotests/291.out @@ -16,17 +16,17 @@ wrote 1048576/1048576 bytes at offset 2097152 1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) Check resulting qcow2 header extensions: Header extension: -magic 3799591626 (Backing format) +magic 0xe2792aca (Backing format) length5 data 'qcow2' Header extension: -magic 1745090647 (Feature table) +magic 0x6803f857 (Feature table) length336 data Header extension: -magic 595929205 (Bitmaps) +magic 0x23852875 (Bitmaps) length24 nb_bitmaps2 reserved320 @@ -86,12 +86,12 @@ Format specific information: corrupt: false Check resulting qcow2 header extensions: Header extension: -magic 1745090647 (Feature table) +magic 0x6803f857 (Feature table) length336 data Header extension: -magic 595929205 (Bitmaps) +magic 0x23852875 (Bitmaps) length24 nb_bitmaps3 reserved320 -- Best regards, Vladimir
Re: [PATCH v7 0/9] iotests: Dump QCOW2 dirty bitmaps metadata
12.06.2020 03:04, Andrey Shinkevich wrote: Note: based on the Vladimir's series [v5 00/13] iotests: Dump QCOW2 dirty bitmaps metadata It's merged to master, so, based on master. (except for 01, which is not needed, thanks to Eric) Add dirty bitmap information to QCOW2 metadata dump in the qcow2_format.py. v7: 01: Fix for magic hexadecimal output in 291 02: Bitmap table output format improvement. 03: Incremental change in the test 291 output. v6: 01: Fixing capitalization of header extension constant. (Suggested by Eric) 02: The cluster size global variable discarded and passed as a parameter. 03: Re-based to Vladimir's v5 series. 04: The code of passing qcow2.py JSON format key moved to separate patch. 05: Making dict(s) for dumping in JSON format was substituted with a copy of __dict__. v5: The Vladimir's preliminary series v4: The Vladimir's preliminary series Andrey Shinkevich (9): iotests: Fix for magic hexadecimal output in 291 qcow2: Fix capitalization of header extension constant. qcow2_format.py: make printable data an extension class member qcow2_format.py: Dump bitmap directory information qcow2_format.py: pass cluster size to substructures qcow2_format.py: Dump bitmap table serialized entries qcow2.py: Introduce '-j' key to dump in JSON format qcow2_format.py: collect fields to dump in JSON format qcow2_format.py: support dumping metadata in JSON format block/qcow2.c | 2 +- docs/interop/qcow2.txt | 2 +- tests/qemu-iotests/291.out | 112 ++- tests/qemu-iotests/qcow2.py| 20 +++- tests/qemu-iotests/qcow2_format.py | 217 ++--- 5 files changed, 327 insertions(+), 26 deletions(-) -- Best regards, Vladimir
Re: [PATCH 2/4] block/aio_task: allow start/wait task from any coroutine
15.06.2020 10:47, Vladimir Sementsov-Ogievskiy wrote: 11.06.2020 20:11, Denis V. Lunev wrote: From: Vladimir Sementsov-Ogievskiy Currently, aio task pool assumes that there is a main coroutine, which creates tasks and wait for them. Let's remove the restriction by using CoQueue. Code becomes clearer, interface more obvious. Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/aio_task.c | 21 ++--- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/block/aio_task.c b/block/aio_task.c index 88989fa248..cf62e5c58b 100644 --- a/block/aio_task.c +++ b/block/aio_task.c @@ -27,11 +27,10 @@ #include "block/aio_task.h" struct AioTaskPool { - Coroutine *main_co; int status; int max_busy_tasks; int busy_tasks; - bool waiting; + CoQueue waiters; }; static void coroutine_fn aio_task_co(void *opaque) @@ -52,31 +51,23 @@ static void coroutine_fn aio_task_co(void *opaque) g_free(task); - if (pool->waiting) { - pool->waiting = false; - aio_co_wake(pool->main_co); - } + qemu_co_queue_restart_all(&pool->waiters); } void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool) { assert(pool->busy_tasks > 0); - assert(qemu_coroutine_self() == pool->main_co); - pool->waiting = true; - qemu_coroutine_yield(); + qemu_co_queue_wait(&pool->waiters, NULL); - assert(!pool->waiting); assert(pool->busy_tasks < pool->max_busy_tasks); As we wake up several coroutines now, I'm afraid this assertion may start to fire. And aio_task_pool_wait_one() becomes useless as a public API (still, it's used only locally, so we can make it static). I'll send updated patch after reviewing the rest of the series. Hm, OK, we have two kinds of waiters: waiting for a slot and for all tasks to finish. 
So, either we need two queues, or do like this patch (one queue, but wake-up all waiters, for them to check does their condition satisfied or not). I'm OK with this patch with the following squashed-in: diff --git a/include/block/aio_task.h b/include/block/aio_task.h index 50bc1e1817..50b1c036c5 100644 --- a/include/block/aio_task.h +++ b/include/block/aio_task.h @@ -48,7 +48,6 @@ bool aio_task_pool_empty(AioTaskPool *pool); void coroutine_fn aio_task_pool_start_task(AioTaskPool *pool, AioTask *task); void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool); -void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool); void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool); #endif /* BLOCK_AIO_TASK_H */ diff --git a/block/aio_task.c b/block/aio_task.c index cf62e5c58b..7ba15ff41f 100644 --- a/block/aio_task.c +++ b/block/aio_task.c @@ -54,26 +54,17 @@ static void coroutine_fn aio_task_co(void *opaque) qemu_co_queue_restart_all(&pool->waiters); } -void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool) -{ -assert(pool->busy_tasks > 0); - -qemu_co_queue_wait(&pool->waiters, NULL); - -assert(pool->busy_tasks < pool->max_busy_tasks); -} - void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool) { while (pool->busy_tasks >= pool->max_busy_tasks) { -aio_task_pool_wait_one(pool); +qemu_co_queue_wait(&pool->waiters, NULL); } } void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool) { while (pool->busy_tasks > 0) { -aio_task_pool_wait_one(pool); +qemu_co_queue_wait(&pool->waiters, NULL); } } -- Best regards, Vladimir
Re: [PATCH 4/4] block/io: improve savevm performance
11.06.2020 20:11, Denis V. Lunev wrote: This patch does 2 standard basic things: - it creates intermediate buffer for all writes from QEMU migration code to block driver, - this buffer is sent to disk asynchronously, allowing several writes to run in parallel. Thus bdrv_vmstate_write() is becoming asynchronous. All pending operations completion are performed in newly invented bdrv_flush_vmstate(). In general, migration code is fantastically inefficent (by observation), buffers are not aligned and sent with arbitrary pieces, a lot of time less than 100 bytes at a chunk, which results in read-modify-write operations if target file descriptor is opened with O_DIRECT. It should also be noted that all operations are performed into unallocated image blocks, which also suffer due to partial writes to such new clusters even on cached file descriptors. Snapshot creation time (2 GB Fedora-31 VM running over NVME storage): original fixed cached: 1.79s 1.27s non-cached: 3.29s 0.81s The difference over HDD would be more significant :) Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. 
David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/io.c| 121 +- include/block/block_int.h | 8 +++ 2 files changed, 127 insertions(+), 2 deletions(-) diff --git a/block/io.c b/block/io.c index fbf352f39d..698f1eef76 100644 --- a/block/io.c +++ b/block/io.c @@ -26,6 +26,7 @@ #include "trace.h" #include "sysemu/block-backend.h" #include "block/aio-wait.h" +#include "block/aio_task.h" #include "block/blockjob.h" #include "block/blockjob_int.h" #include "block/block_int.h" @@ -2633,6 +2634,102 @@ typedef struct BdrvVmstateCo { int ret; } BdrvVmstateCo; +typedef struct BdrvVMStateTask { +AioTask task; + +BlockDriverState *bs; +int64_t offset; +void *buf; +size_t bytes; +} BdrvVMStateTask; + +typedef struct BdrvSaveVMState { +AioTaskPool *pool; +BdrvVMStateTask *t; +} BdrvSaveVMState; + + +static coroutine_fn int bdrv_co_vmstate_save_task_entry(AioTask *task) +{ +int err = 0; +BdrvVMStateTask *t = container_of(task, BdrvVMStateTask, task); + +if (t->bytes != 0) { +QEMUIOVector local_qiov; +qemu_iovec_init_buf(&local_qiov, t->buf, t->bytes); Consider special oneliner for this case: QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, t->buf, t->bytes); + +bdrv_inc_in_flight(t->bs); +err = t->bs->drv->bdrv_save_vmstate(t->bs, &local_qiov, t->offset); +bdrv_dec_in_flight(t->bs); +} + +qemu_vfree(t->buf); +return err; +} + +static BdrvVMStateTask *bdrv_vmstate_task_create(BlockDriverState *bs, + int64_t pos, size_t size) +{ +BdrvVMStateTask *t = g_new(BdrvVMStateTask, 1); + +*t = (BdrvVMStateTask) { +.task.func = bdrv_co_vmstate_save_task_entry, +.buf = qemu_blockalign(bs, size), +.offset = pos, +.bs = bs, +}; + +return t; +} + +static int bdrv_co_do_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) +{ +BdrvSaveVMState *state = bs->savevm_state; +BdrvVMStateTask *t; +size_t buf_size = MAX(bdrv_get_cluster_size(bs), 1 * MiB); +size_t to_copy; +size_t off; + +if (state == NULL) { +state = g_new(BdrvSaveVMState, 1); +*state = 
(BdrvSaveVMState) { +.pool = aio_task_pool_new(BDRV_VMSTATE_WORKERS_MAX), +.t = bdrv_vmstate_task_create(bs, pos, buf_size), +}; + +bs->savevm_state = state; +} + +if (aio_task_pool_status(state->pool) < 0) { +return aio_task_pool_status(state->pool); So, on failure, we still need flush, to cleanup the state. I think better is to cleanup it immediately here (goto fail, etc.). Aha, actually in blk_save_vmstate(), you don't do bdrv_flush_vmstate() if bdrv_save_vmstate() failed. +} + +t = state->t; +if (t->offset + t->bytes != pos) { +/* Normally this branch is not reachable from migration */ Aha, like a cache-miss. OK +return bs->drv->bdrv_save_vmstate(bs, qiov, pos); +} + +off = 0; +while (1) { +to_copy = MIN(qiov->size - off, buf_size - t->bytes); +qemu_iovec_to_buf(qiov, off, t->buf + t->bytes, to_copy); +t->bytes += to_copy; +if (t->bytes < buf_size) { +return qiov->size; +} + +aio_task_pool_start_task(state->pool, &t->task); + +pos += to_copy; +off += to_copy; +state->t = t = bdrv_vmstate_task_create(bs, pos, buf_size); +} + +return qiov->size; this is unreachable actually. So, I'd drop it or do "break" in a loop instead of return. +} + static int coroutine_fn bdrv_co_rw_vmstate(Bloc
Re: [PATCH 3/4] block, migration: add bdrv_flush_vmstate helper
11.06.2020 20:11, Denis V. Lunev wrote: Right now bdrv_fclose() is just calling bdrv_flush(). The problem is that migration code is working inefficently from black layer terms and are frequently called for very small pieces of not properly aligned data. Block layer is capable to work this way, but this is very slow. This patch is a preparation for the introduction of the intermediate buffer at block driver state. It would be beneficial to separate conventional bdrv_flush() from closing QEMU file from migration code. The patch also forces bdrv_flush_vmstate() operation inside synchronous blk_save_vmstate() operation. This helper is used from qemu-io only. Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/block-backend.c | 6 +- block/io.c| 39 +++ include/block/block.h | 1 + migration/savevm.c| 3 +++ 4 files changed, 48 insertions(+), 1 deletion(-) diff --git a/block/block-backend.c b/block/block-backend.c index 9342a475cb..2107ace699 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -2177,7 +2177,7 @@ int blk_truncate(BlockBackend *blk, int64_t offset, bool exact, int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, int64_t pos, int size) { -int ret; +int ret, ret2; if (!blk_is_available(blk)) { return -ENOMEDIUM; @@ -2187,6 +2187,10 @@ int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, if (ret < 0) { return ret; } +ret2 = bdrv_flush_vmstate(blk_bs(blk)); +if (ret2 < 0) { +return ret; +} if (ret == size && !blk->enable_write_cache) { ret = bdrv_flush(blk_bs(blk)); diff --git a/block/io.c b/block/io.c index 121ce17a49..fbf352f39d 100644 --- a/block/io.c +++ b/block/io.c @@ -2725,6 +2725,45 @@ int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) return bdrv_rw_vmstate(bs, qiov, pos, true); } + +typedef struct FlushVMStateCo { +BlockDriverState *bs; +int ret; +} 
FlushVMStateCo; + +static int coroutine_fn bdrv_co_flush_vmstate(BlockDriverState *bs) +{ +return 0; +} + +static void coroutine_fn bdrv_flush_vmstate_co_entry(void *opaque) +{ +FlushVMStateCo *rwco = opaque; + +rwco->ret = bdrv_co_flush_vmstate(rwco->bs); +aio_wait_kick(); +} + +int bdrv_flush_vmstate(BlockDriverState *bs) +{ +Coroutine *co; +FlushVMStateCo flush_co = { +.bs = bs, +.ret = NOT_DONE, +}; + +if (qemu_in_coroutine()) { +/* Fast-path if already in coroutine context */ +bdrv_flush_vmstate_co_entry(&flush_co); +} else { +co = qemu_coroutine_create(bdrv_flush_vmstate_co_entry, &flush_co); +bdrv_coroutine_enter(bs, co); +BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE); +} + +return flush_co.ret; +} In block/io.c, since 7d2410cea154bf, these coroutine wrappers are using bdrv_run_co() instead. I hope, it's an intermediate state, and my series with auto-generated wrappers will be applied, still, now we should use bdrv_run_co()-based approach for consistency. Otherwise, patch looks OK for me, just add a new interface, doing nothing for now. Still, would be good to add a comment in block.h, that bdrv_flush_vmsate is needed after bdrv_save_vmstate. 
+ /**/ /* async I/Os */ diff --git a/include/block/block.h b/include/block/block.h index 25e299605e..024525b87d 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -572,6 +572,7 @@ int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size); +int bdrv_flush_vmstate(BlockDriverState *bs); void bdrv_img_create(const char *filename, const char *fmt, const char *base_filename, const char *base_fmt, diff --git a/migration/savevm.c b/migration/savevm.c index 0ff5bb40ed..9698c909d7 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -150,6 +150,9 @@ static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, static int bdrv_fclose(void *opaque, Error **errp) { +int err = bdrv_flush_vmstate(opaque); +if (err < 0) +return err; return bdrv_flush(opaque); } -- Best regards, Vladimir
Re: [PATCH 2/4] block/aio_task: allow start/wait task from any coroutine
11.06.2020 20:11, Denis V. Lunev wrote: From: Vladimir Sementsov-Ogievskiy Currently, aio task pool assumes that there is a main coroutine, which creates tasks and wait for them. Let's remove the restriction by using CoQueue. Code becomes clearer, interface more obvious. Signed-off-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov --- block/aio_task.c | 21 ++--- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/block/aio_task.c b/block/aio_task.c index 88989fa248..cf62e5c58b 100644 --- a/block/aio_task.c +++ b/block/aio_task.c @@ -27,11 +27,10 @@ #include "block/aio_task.h" struct AioTaskPool { -Coroutine *main_co; int status; int max_busy_tasks; int busy_tasks; -bool waiting; +CoQueue waiters; }; static void coroutine_fn aio_task_co(void *opaque) @@ -52,31 +51,23 @@ static void coroutine_fn aio_task_co(void *opaque) g_free(task); -if (pool->waiting) { -pool->waiting = false; -aio_co_wake(pool->main_co); -} +qemu_co_queue_restart_all(&pool->waiters); } void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool) { assert(pool->busy_tasks > 0); -assert(qemu_coroutine_self() == pool->main_co); -pool->waiting = true; -qemu_coroutine_yield(); +qemu_co_queue_wait(&pool->waiters, NULL); -assert(!pool->waiting); assert(pool->busy_tasks < pool->max_busy_tasks); As we wake up several coroutines now, I'm afraid this assertion may start to fire. And aio_task_pool_wait_one() becomes useless as a public API (still, it's used only locally, so we can make it static). I'll send updated patch after reviewing the rest of the series. 
} void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool) { -if (pool->busy_tasks < pool->max_busy_tasks) { -return; +while (pool->busy_tasks >= pool->max_busy_tasks) { +aio_task_pool_wait_one(pool); } - -aio_task_pool_wait_one(pool); } void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool) @@ -98,8 +89,8 @@ AioTaskPool *coroutine_fn aio_task_pool_new(int max_busy_tasks) { AioTaskPool *pool = g_new0(AioTaskPool, 1); -pool->main_co = qemu_coroutine_self(); pool->max_busy_tasks = max_busy_tasks; +qemu_co_queue_init(&pool->waiters); return pool; } -- Best regards, Vladimir
Re: [PATCH 1/4] migration/savevm: respect qemu_fclose() error code in save_snapshot()
11.06.2020 20:11, Denis V. Lunev wrote: qemu_fclose() could return error, f.e. if bdrv_co_flush() will return the error. This validation will become more important once we will start waiting of asynchronous IO operations, started from bdrv_write_vmstate(), which are coming soon. Signed-off-by: Denis V. Lunev CC: Kevin Wolf CC: Max Reitz CC: Stefan Hajnoczi CC: Fam Zheng CC: Juan Quintela CC: "Dr. David Alan Gilbert" CC: Vladimir Sementsov-Ogievskiy CC: Denis Plotnikov Reviewed-by: Vladimir Sementsov-Ogievskiy -- Best regards, Vladimir
Re: [PATCH] hw/scsi/megasas: Fix possible out-of-bounds array access in tracepoints
On 6/15/20 9:26 AM, Thomas Huth wrote: > Some tracepoints in megasas.c use a guest-controlled value as an index > into the mfi_frame_desc[] array. Thus a malicious guest could cause an > out-of-bounds error here. Fortunately, the impact is very low since this > can only happen when the corresponding tracepoints have been enabled > before, but the problem should be fixed anyway with a proper check. > > Buglink: https://bugs.launchpad.net/qemu/+bug/1882065 > Signed-off-by: Thomas Huth > --- > hw/scsi/megasas.c | 36 +++- > 1 file changed, 23 insertions(+), 13 deletions(-) > > diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c > index af18c88b65..aa930226f8 100644 > --- a/hw/scsi/megasas.c > +++ b/hw/scsi/megasas.c > @@ -54,10 +54,6 @@ > #define MEGASAS_FLAG_USE_QUEUE64 1 > #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) > > -static const char *mfi_frame_desc[] = { > -"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > -"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop"}; > - > typedef struct MegasasCmd { > uint32_t index; > uint16_t flags; > @@ -183,6 +179,20 @@ static void megasas_frame_set_scsi_status(MegasasState > *s, > stb_pci_dma(pci, frame + offsetof(struct mfi_frame_header, scsi_status), > v); > } > > +static inline const char *mfi_frame_desc(unsigned int cmd) > +{ > +static const char *mfi_frame_descs[] = { > +"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", > +"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop" > +}; > + > +if (cmd < ARRAY_SIZE(mfi_frame_descs)) { > +return mfi_frame_descs[cmd]; > +} > + > +return "Unknown"; > +} > + > /* > * Context is considered opaque, but the HBA firmware is running > * in little endian mode. So convert it to little endian, too. 
> @@ -1670,25 +1680,25 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > if (is_logical) { > if (target_id >= MFI_MAX_LD || lun_id != 0) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > } > sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); > > cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); > -trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, > +trace_megasas_handle_scsi(mfi_frame_desc(frame_cmd), is_logical, >target_id, lun_id, sdev, cmd->iov_size); > > if (!sdev || (megasas_is_jbod(s) && is_logical)) { > trace_megasas_scsi_target_not_present( > -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); > +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], is_logical, > +mfi_frame_desc(frame_cmd), is_logical, > target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > @@ -1706,7 +1716,7 @@ static int megasas_handle_scsi(MegasasState *s, > MegasasCmd *cmd, > cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); > if (!cmd->req) { > trace_megasas_scsi_req_alloc_failed( > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); > cmd->frame->header.scsi_status = BUSY; > s->event_count++; > @@ -1751,17 +1761,17 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) > } > > trace_megasas_handle_io(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id, > +mfi_frame_desc(frame_cmd), target_id, lun_id, > (unsigned long)lba_start, (unsigned > long)lba_count); > if (!sdev) { > 
trace_megasas_io_target_not_present(cmd->index, > -mfi_frame_desc[frame_cmd], target_id, lun_id); > +mfi_frame_desc(frame_cmd), target_id, lun_id); > return MFI_STAT_DEVICE_NOT_FOUND; > } > > if (cdb_len > 16) { > trace_megasas_scsi_invalid_cdb_len( > -mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); > +mfi_frame_desc(frame_cmd), 1, target_id, lun_id, cdb_len); > megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); > cmd->frame->header.scsi_status = CHECK_CONDITION; > s->event_count++; > @@ -1781,7 +1791,7 @@ static int megasas_handle_io(MegasasState *s, > MegasasCmd *cmd, int frame_cmd) >
Re: [PATCH v3 00/11] hw/sd/sdcard: Fix CVE-2020-13253 & cleanups
On 6/5/20 12:22 PM, Philippe Mathieu-Daudé wrote: > Patches 2 & 3 fix CVE-2020-13253. Ping for the CVE fix?... > The rest are (accumulated) cleanups. > > Supersedes: <20200604182502.24228-1-f4...@amsat.org> > > Philippe Mathieu-Daudé (11): > MAINTAINERS: Cc qemu-block mailing list > hw/sd/sdcard: Update coding style to make checkpatch.pl happy > hw/sd/sdcard: Do not switch to ReceivingData if address is invalid > hw/sd/sdcard: Restrict Class 6 commands to SCSD cards > hw/sd/sdcard: Update the SDState documentation > hw/sd/sdcard: Simplify cmd_valid_while_locked() > hw/sd/sdcard: Constify sd_crc*()'s message argument > hw/sd/sdcard: Make iolen unsigned > hw/sd/sdcard: Correctly display the command name in trace events > hw/sd/sdcard: Display offset in read/write_data() trace events > hw/sd/sdcard: Simplify realize() a bit > > hw/sd/sd.c | 122 + > MAINTAINERS | 1 + > hw/sd/trace-events | 4 +- > 3 files changed, 83 insertions(+), 44 deletions(-) >
[PATCH] hw/scsi/megasas: Fix possible out-of-bounds array access in tracepoints
Some tracepoints in megasas.c use a guest-controlled value as an index into the mfi_frame_desc[] array. Thus a malicious guest could cause an out-of-bounds error here. Fortunately, the impact is very low since this can only happen when the corresponding tracepoints have been enabled before, but the problem should be fixed anyway with a proper check. Buglink: https://bugs.launchpad.net/qemu/+bug/1882065 Signed-off-by: Thomas Huth --- hw/scsi/megasas.c | 36 +++- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c index af18c88b65..aa930226f8 100644 --- a/hw/scsi/megasas.c +++ b/hw/scsi/megasas.c @@ -54,10 +54,6 @@ #define MEGASAS_FLAG_USE_QUEUE64 1 #define MEGASAS_MASK_USE_QUEUE64 (1 << MEGASAS_FLAG_USE_QUEUE64) -static const char *mfi_frame_desc[] = { -"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", -"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop"}; - typedef struct MegasasCmd { uint32_t index; uint16_t flags; @@ -183,6 +179,20 @@ static void megasas_frame_set_scsi_status(MegasasState *s, stb_pci_dma(pci, frame + offsetof(struct mfi_frame_header, scsi_status), v); } +static inline const char *mfi_frame_desc(unsigned int cmd) +{ +static const char *mfi_frame_descs[] = { +"MFI init", "LD Read", "LD Write", "LD SCSI", "PD SCSI", +"MFI Doorbell", "MFI Abort", "MFI SMP", "MFI Stop" +}; + +if (cmd < ARRAY_SIZE(mfi_frame_descs)) { +return mfi_frame_descs[cmd]; +} + +return "Unknown"; +} + /* * Context is considered opaque, but the HBA firmware is running * in little endian mode. So convert it to little endian, too.
@@ -1670,25 +1680,25 @@ static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, if (is_logical) { if (target_id >= MFI_MAX_LD || lun_id != 0) { trace_megasas_scsi_target_not_present( -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } } sdev = scsi_device_find(&s->bus, 0, target_id, lun_id); cmd->iov_size = le32_to_cpu(cmd->frame->header.data_len); -trace_megasas_handle_scsi(mfi_frame_desc[frame_cmd], is_logical, +trace_megasas_handle_scsi(mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id, sdev, cmd->iov_size); if (!sdev || (megasas_is_jbod(s) && is_logical)) { trace_megasas_scsi_target_not_present( -mfi_frame_desc[frame_cmd], is_logical, target_id, lun_id); +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id); return MFI_STAT_DEVICE_NOT_FOUND; } if (cdb_len > 16) { trace_megasas_scsi_invalid_cdb_len( -mfi_frame_desc[frame_cmd], is_logical, +mfi_frame_desc(frame_cmd), is_logical, target_id, lun_id, cdb_len); megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); cmd->frame->header.scsi_status = CHECK_CONDITION; @@ -1706,7 +1716,7 @@ static int megasas_handle_scsi(MegasasState *s, MegasasCmd *cmd, cmd->req = scsi_req_new(sdev, cmd->index, lun_id, cdb, cmd); if (!cmd->req) { trace_megasas_scsi_req_alloc_failed( -mfi_frame_desc[frame_cmd], target_id, lun_id); +mfi_frame_desc(frame_cmd), target_id, lun_id); megasas_write_sense(cmd, SENSE_CODE(NO_SENSE)); cmd->frame->header.scsi_status = BUSY; s->event_count++; @@ -1751,17 +1761,17 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd) } trace_megasas_handle_io(cmd->index, -mfi_frame_desc[frame_cmd], target_id, lun_id, +mfi_frame_desc(frame_cmd), target_id, lun_id, (unsigned long)lba_start, (unsigned long)lba_count); if (!sdev) { trace_megasas_io_target_not_present(cmd->index, -mfi_frame_desc[frame_cmd], target_id, lun_id); +mfi_frame_desc(frame_cmd), target_id, lun_id);
return MFI_STAT_DEVICE_NOT_FOUND; } if (cdb_len > 16) { trace_megasas_scsi_invalid_cdb_len( -mfi_frame_desc[frame_cmd], 1, target_id, lun_id, cdb_len); +mfi_frame_desc(frame_cmd), 1, target_id, lun_id, cdb_len); megasas_write_sense(cmd, SENSE_CODE(INVALID_OPCODE)); cmd->frame->header.scsi_status = CHECK_CONDITION; s->event_count++; @@ -1781,7 +1791,7 @@ static int megasas_handle_io(MegasasState *s, MegasasCmd *cmd, int frame_cmd) lun_id, cdb, cmd); if (!cmd->req) { trace_megasas_scsi_req_alloc_failed( -mfi_frame_desc[frame_cmd], target_id, lun_id); +mfi_frame_desc(frame_cmd), target_id, lun_id); megasas_write_sense(cmd, SENSE_C