[PULL 3/6] hw/nvme: move format parameter parsing

2022-03-03 Thread Klaus Jensen
From: Klaus Jensen 

There is no need to extract the format command parameters for each
namespace. Move it to the entry point.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 71c60482c75f..d8701ebf2fa8 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5452,6 +5452,11 @@ typedef struct NvmeFormatAIOCB {
 uint32_t nsid;
 bool broadcast;
 int64_t offset;
+
+uint8_t lbaf;
+uint8_t mset;
+uint8_t pi;
+uint8_t pil;
 } NvmeFormatAIOCB;
 
 static void nvme_format_bh(void *opaque);
@@ -5471,14 +5476,9 @@ static const AIOCBInfo nvme_format_aiocb_info = {
 .get_aio_context = nvme_get_aio_context,
 };
 
-static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
+static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
+uint8_t pi, uint8_t pil)
 {
-uint32_t dw10 = le32_to_cpu(cmd->cdw10);
-uint8_t lbaf = dw10 & 0xf;
-uint8_t pi = (dw10 >> 5) & 0x7;
-uint8_t mset = (dw10 >> 4) & 0x1;
-uint8_t pil = (dw10 >> 8) & 0x1;
-
 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
 
 ns->id_ns.dps = (pil << 3) | pi;
@@ -5490,7 +5490,6 @@ static void nvme_format_set(NvmeNamespace *ns, NvmeCmd 
*cmd)
 static void nvme_format_ns_cb(void *opaque, int ret)
 {
 NvmeFormatAIOCB *iocb = opaque;
-NvmeRequest *req = iocb->req;
 NvmeNamespace *ns = iocb->ns;
 int bytes;
 
@@ -5512,7 +5511,7 @@ static void nvme_format_ns_cb(void *opaque, int ret)
 return;
 }
 
-nvme_format_set(ns, &req->cmd);
+nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
 ns->status = 0x0;
 iocb->ns = NULL;
 iocb->offset = 0;
@@ -5548,9 +5547,6 @@ static void nvme_format_bh(void *opaque)
 NvmeFormatAIOCB *iocb = opaque;
 NvmeRequest *req = iocb->req;
 NvmeCtrl *n = nvme_ctrl(req);
-uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
-uint8_t lbaf = dw10 & 0xf;
-uint8_t pi = (dw10 >> 5) & 0x7;
 uint16_t status;
 int i;
 
@@ -5572,7 +5568,7 @@ static void nvme_format_bh(void *opaque)
 goto done;
 }
 
-status = nvme_format_check(iocb->ns, lbaf, pi);
+status = nvme_format_check(iocb->ns, iocb->lbaf, iocb->pi);
 if (status) {
 req->status = status;
 goto done;
@@ -5595,6 +5591,11 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 {
 NvmeFormatAIOCB *iocb;
 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
+uint8_t lbaf = dw10 & 0xf;
+uint8_t mset = (dw10 >> 4) & 0x1;
+uint8_t pi = (dw10 >> 5) & 0x7;
+uint8_t pil = (dw10 >> 8) & 0x1;
 uint16_t status;
 
 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
@@ -5604,6 +5605,10 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 iocb->ret = 0;
 iocb->ns = NULL;
 iocb->nsid = 0;
+iocb->lbaf = lbaf;
+iocb->mset = mset;
+iocb->pi = pi;
+iocb->pil = pil;
 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
 iocb->offset = 0;
 
-- 
2.35.1




[PULL 6/6] hw/nvme: 64-bit pi support

2022-03-03 Thread Klaus Jensen
From: Naveen Nagar 

This adds support for one possible new protection information format
introduced in TP4068 (and integrated in NVMe 2.0): the 64-bit CRC guard
and 48-bit reference tag. This version does not support storage tags.

Like the CRC16 support already present, this uses a software
implementation of CRC64 (so it is naturally pretty slow). But it's good
enough for verification purposes.

This may go nicely hand-in-hand with the support that Keith submitted
for the Linux kernel[1].

  [1]: 
https://lore.kernel.org/linux-nvme/20220126165214.ga1782...@dhcp-10-100-145-180.wdc.com/T/

Reviewed-by: Keith Busch 
Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 163 +++
 hw/nvme/dif.c| 363 +--
 hw/nvme/dif.h| 143 -
 hw/nvme/ns.c |  35 -
 hw/nvme/nvme.h   |   3 +
 hw/nvme/trace-events |  12 +-
 include/block/nvme.h |  67 ++--
 7 files changed, 648 insertions(+), 138 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index f1683960b87e..03760ddeae8c 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -2050,9 +2050,12 @@ static void nvme_verify_cb(void *opaque, int ret)
 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
 uint16_t apptag = le16_to_cpu(rw->apptag);
 uint16_t appmask = le16_to_cpu(rw->appmask);
-uint32_t reftag = le32_to_cpu(rw->reftag);
+uint64_t reftag = le32_to_cpu(rw->reftag);
+uint64_t cdw3 = le32_to_cpu(rw->cdw3);
 uint16_t status;
 
+reftag |= cdw3 << 32;
+
 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
 
 if (ret) {
@@ -2141,7 +2144,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
 uint16_t apptag = le16_to_cpu(rw->apptag);
 uint16_t appmask = le16_to_cpu(rw->appmask);
-uint32_t reftag = le32_to_cpu(rw->reftag);
+uint64_t reftag = le32_to_cpu(rw->reftag);
+uint64_t cdw3 = le32_to_cpu(rw->cdw3);
 struct nvme_compare_ctx *ctx = req->opaque;
 g_autofree uint8_t *buf = NULL;
 BlockBackend *blk = ns->blkconf.blk;
@@ -2149,6 +2153,8 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
 BlockAcctStats *stats = blk_get_stats(blk);
 uint16_t status = NVME_SUCCESS;
 
+reftag |= cdw3 << 32;
+
 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
 
 if (ret) {
@@ -2527,7 +2533,8 @@ typedef struct NvmeCopyAIOCB {
 QEMUBH *bh;
 int ret;
 
-NvmeCopySourceRange *ranges;
+void *ranges;
+unsigned int format;
 int nr;
 int idx;
 
@@ -2538,7 +2545,7 @@ typedef struct NvmeCopyAIOCB {
 BlockAcctCookie write;
 } acct;
 
-uint32_t reftag;
+uint64_t reftag;
 uint64_t slba;
 
 NvmeZone *zone;
@@ -2592,13 +2599,101 @@ static void nvme_copy_bh(void *opaque)
 
 static void nvme_copy_cb(void *opaque, int ret);
 
+static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
+ uint64_t *slba, uint32_t *nlb,
+ uint16_t *apptag,
+ uint16_t *appmask,
+ uint64_t *reftag)
+{
+NvmeCopySourceRangeFormat0 *_ranges = ranges;
+
+if (slba) {
+*slba = le64_to_cpu(_ranges[idx].slba);
+}
+
+if (nlb) {
+*nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
+}
+
+if (apptag) {
+*apptag = le16_to_cpu(_ranges[idx].apptag);
+}
+
+if (appmask) {
+*appmask = le16_to_cpu(_ranges[idx].appmask);
+}
+
+if (reftag) {
+*reftag = le32_to_cpu(_ranges[idx].reftag);
+}
+}
+
+static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
+ uint64_t *slba, uint32_t *nlb,
+ uint16_t *apptag,
+ uint16_t *appmask,
+ uint64_t *reftag)
+{
+NvmeCopySourceRangeFormat1 *_ranges = ranges;
+
+if (slba) {
+*slba = le64_to_cpu(_ranges[idx].slba);
+}
+
+if (nlb) {
+*nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
+}
+
+if (apptag) {
+*apptag = le16_to_cpu(_ranges[idx].apptag);
+}
+
+if (appmask) {
+*appmask = le16_to_cpu(_ranges[idx].appmask);
+}
+
+if (reftag) {
+*reftag = 0;
+
+*reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
+*reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
+*reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
+*reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
+*reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
+*reftag |= (uint64_t)_ranges[idx].sr[9];
+}
+}
+
+static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
+   

[PULL 4/6] hw/nvme: add support for the lbafee hbs feature

2022-03-03 Thread Klaus Jensen
From: Naveen Nagar 

Add support for up to 64 LBA formats through the LBAFEE field of the
Host Behavior Support feature.

Reviewed-by: Keith Busch 
Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 34 +++---
 hw/nvme/ns.c | 15 +--
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h |  7 +--
 4 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d8701ebf2fa8..52ab3450b975 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5165,6 +5165,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 uint32_t nsid = le32_to_cpu(cmd->nsid);
 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
 uint8_t save = NVME_SETFEAT_SAVE(dw10);
+uint16_t status;
 int i;
 
 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
@@ -5287,8 +5288,26 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, req);
 case NVME_HOST_BEHAVIOR_SUPPORT:
-return nvme_h2c(n, (uint8_t *)&n->features.hbs,
-sizeof(n->features.hbs), req);
+status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
+  sizeof(n->features.hbs), req);
+if (status) {
+return status;
+}
+
+for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ns = nvme_ns(n, i);
+
+if (!ns) {
+continue;
+}
+
+ns->id_ns.nlbaf = ns->nlbaf - 1;
+if (!n->features.hbs.lbafee) {
+ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
+}
+}
+
+return status;
 case NVME_COMMAND_SET_PROFILE:
 if (dw11 & 0x1ff) {
 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
@@ -5479,10 +5498,13 @@ static const AIOCBInfo nvme_format_aiocb_info = {
 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
 uint8_t pi, uint8_t pil)
 {
+uint8_t lbafl = lbaf & 0xf;
+uint8_t lbafu = lbaf >> 4;
+
 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
 
 ns->id_ns.dps = (pil << 3) | pi;
-ns->id_ns.flbas = lbaf | (mset << 4);
+ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
 
 nvme_ns_init_format(ns);
 }
@@ -5596,6 +5618,7 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
 uint8_t mset = (dw10 >> 4) & 0x1;
 uint8_t pi = (dw10 >> 5) & 0x7;
 uint8_t pil = (dw10 >> 8) & 0x1;
+uint8_t lbafu = (dw10 >> 12) & 0x3;
 uint16_t status;
 
 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
@@ -5612,6 +5635,10 @@ static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest 
*req)
 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
 iocb->offset = 0;
 
+if (n->features.hbs.lbafee) {
+iocb->lbaf |= lbafu << 4;
+}
+
 if (!iocb->broadcast) {
 if (!nvme_nsid_valid(n, nsid)) {
 status = NVME_INVALID_NSID | NVME_DNR;
@@ -6587,6 +6614,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->cntlid = cpu_to_le16(n->cntlid);
 
 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
+id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
 
 id->rab = 6;
 
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index ee673f1a5bef..8dfb55130beb 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -112,10 +112,11 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 [7] = { .ds = 12, .ms = 64 },
 };
 
+ns->nlbaf = 8;
+
 memcpy(&id_ns->lbaf, &lbaf, sizeof(lbaf));
-id_ns->nlbaf = 7;
 
-for (i = 0; i <= id_ns->nlbaf; i++) {
+for (i = 0; i < ns->nlbaf; i++) {
 NvmeLBAF *lbaf = &id_ns->lbaf[i];
 if (lbaf->ds == ds) {
 if (lbaf->ms == ms) {
@@ -126,12 +127,14 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 }
 
 /* add non-standard lba format */
-id_ns->nlbaf++;
-id_ns->lbaf[id_ns->nlbaf].ds = ds;
-id_ns->lbaf[id_ns->nlbaf].ms = ms;
-id_ns->flbas |= id_ns->nlbaf;
+id_ns->lbaf[ns->nlbaf].ds = ds;
+id_ns->lbaf[ns->nlbaf].ms = ms;
+ns->nlbaf++;
+
+id_ns->flbas |= i;
 
 lbaf_found:
+id_ns->nlbaf = ns->nlbaf - 1;
 nvme_ns_init_format(ns);
 
 return 0;
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 103407038e74..e715c3255a29 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -128,6 +128,7 @@ typedef struct NvmeNamespace {
 int64_t  moff;
 NvmeIdNs id_ns;
 NvmeLBAF lbaf;
+unsigned int nlbaf;
 size_t   lbasz;
 const uint32_t *iocs;
 uint8_t  csi;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e527c728f975..37afc9be9b18 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -,6 +,10 @@ enum NvmeIdCtrlOaes {
 NVME_OAES_NS_ATTR   = 1 << 8,
 };
 
+enum NvmeIdCtrlCtratt {
+NVME_CTRATT_ELBAS   = 1 << 15,
+};
+
 enum 

[PULL 5/6] hw/nvme: add pi tuple size helper

2022-03-03 Thread Klaus Jensen
From: Klaus Jensen 

A subsequent patch will introduce a new tuple size; so add a helper and
use that instead of sizeof() and magic numbers.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c | 14 --
 hw/nvme/dif.c  | 16 
 hw/nvme/dif.h  |  5 +
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 52ab3450b975..f1683960b87e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1068,7 +1068,8 @@ static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, 
NvmeRequest *req)
 size_t len = nvme_l2b(ns, nlb);
 uint16_t status;
 
-if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
+if (nvme_ns_ext(ns) &&
+!(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
 NvmeSg sg;
 
 len += nvme_m2b(ns, nlb);
@@ -1247,7 +1248,8 @@ uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, 
uint32_t len,
 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
 
-if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
+if (nvme_ns_ext(ns) &&
+!(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
ns->lbaf.ms, 0, dir);
 }
@@ -2184,7 +2186,7 @@ static void nvme_compare_mdata_cb(void *opaque, int ret)
  * tuple.
  */
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += 
ns->lbaf.ms) {
@@ -3167,7 +3169,7 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
 bool pract = prinfo & NVME_PRINFO_PRACT;
 
-if (pract && ns->lbaf.ms == 8) {
+if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 mapped_size = data_size;
 }
 }
@@ -3244,7 +3246,7 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,
 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
 bool pract = prinfo & NVME_PRINFO_PRACT;
 
-if (pract && ns->lbaf.ms == 8) {
+if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 mapped_size -= nvme_m2b(ns, nlb);
 }
 }
@@ -5553,7 +,7 @@ static uint16_t nvme_format_check(NvmeNamespace *ns, 
uint8_t lbaf, uint8_t pi)
 return NVME_INVALID_FORMAT | NVME_DNR;
 }
 
-if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
+if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
 return NVME_INVALID_FORMAT | NVME_DNR;
 }
 
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index cd0cea2b5ebd..891385f33f20 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -48,7 +48,7 @@ void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t 
*buf, size_t len,
 int16_t pil = 0;
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 trace_pci_nvme_dif_pract_generate_dif(len, ns->lbasz, ns->lbasz + pil,
@@ -145,7 +145,7 @@ uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, 
size_t len,
 }
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 trace_pci_nvme_dif_check(prinfo, ns->lbasz + pil);
@@ -184,7 +184,7 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t 
*mbuf, size_t mlen,
 
 
 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
-pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 }
 
 do {
@@ -210,7 +210,7 @@ uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t 
*mbuf, size_t mlen,
 end = mbufp + mlen;
 
 for (; mbufp < end; mbufp += ns->lbaf.ms) {
-memset(mbufp + pil, 0xff, sizeof(NvmeDifTuple));
+memset(mbufp + pil, 0xff, nvme_pi_tuple_size(ns));
 }
 }
 
@@ -284,7 +284,7 @@ static void nvme_dif_rw_check_cb(void *opaque, int ret)
 goto out;
 }
 
-if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == 8) {
+if (prinfo & NVME_PRINFO_PRACT && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
 goto out;
 }
 
@@ -388,7 +388,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req)
 
 if (pract) {
 uint8_t *mbuf, *end;
-int16_t pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
+int16_t pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
 
 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
 if (status) {
@@ -428,7 +428,7 @@ uint16_t nvme_dif_rw(NvmeCtrl *n, 

[PULL 2/6] hw/nvme: add host behavior support feature

2022-03-03 Thread Klaus Jensen
From: Naveen Nagar 

Add support for getting and setting the Host Behavior Support feature.

Reviewed-by: Keith Busch 
Signed-off-by: Naveen Nagar 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 8 
 hw/nvme/nvme.h   | 4 +++-
 include/block/nvme.h | 9 +
 3 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d08af3bdc1a2..71c60482c75f 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -196,6 +196,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 [NVME_WRITE_ATOMICITY]  = true,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
 [NVME_TIMESTAMP]= true,
+[NVME_HOST_BEHAVIOR_SUPPORT]= true,
 [NVME_COMMAND_SET_PROFILE]  = true,
 };
 
@@ -206,6 +207,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
+[NVME_HOST_BEHAVIOR_SUPPORT]= NVME_FEAT_CAP_CHANGE,
 [NVME_COMMAND_SET_PROFILE]  = NVME_FEAT_CAP_CHANGE,
 };
 
@@ -5091,6 +5093,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 goto out;
 case NVME_TIMESTAMP:
 return nvme_get_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_c2h(n, (uint8_t *)&n->features.hbs,
+sizeof(n->features.hbs), req);
 default:
 break;
 }
@@ -5281,6 +5286,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 break;
 case NVME_TIMESTAMP:
 return nvme_set_feature_timestamp(n, req);
+case NVME_HOST_BEHAVIOR_SUPPORT:
+return nvme_h2c(n, (uint8_t *)&n->features.hbs,
+sizeof(n->features.hbs), req);
 case NVME_COMMAND_SET_PROFILE:
 if (dw11 & 0x1ff) {
 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 801176a2bd5e..103407038e74 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -468,7 +468,9 @@ typedef struct NvmeCtrl {
 uint16_t temp_thresh_hi;
 uint16_t temp_thresh_low;
 };
-uint32_tasync_config;
+
+uint32_tasync_config;
+NvmeHostBehaviorSupport hbs;
 } features;
 } NvmeCtrl;
 
diff --git a/include/block/nvme.h b/include/block/nvme.h
index cd068ac89142..e527c728f975 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1216,6 +1216,7 @@ enum NvmeFeatureIds {
 NVME_WRITE_ATOMICITY= 0xa,
 NVME_ASYNCHRONOUS_EVENT_CONF= 0xb,
 NVME_TIMESTAMP  = 0xe,
+NVME_HOST_BEHAVIOR_SUPPORT  = 0x16,
 NVME_COMMAND_SET_PROFILE= 0x19,
 NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
 NVME_FID_MAX= 0x100,
@@ -1257,6 +1258,13 @@ typedef struct QEMU_PACKED NvmeRangeType {
 uint8_t rsvd48[16];
 } NvmeRangeType;
 
+typedef struct NvmeHostBehaviorSupport {
+uint8_t acre;
+uint8_t etdas;
+uint8_t lbafee;
+uint8_t rsvd3[509];
+} NvmeHostBehaviorSupport;
+
 typedef struct QEMU_PACKED NvmeLBAF {
 uint16_tms;
 uint8_t ds;
@@ -1520,6 +1528,7 @@ static inline void _nvme_check_size(void)
 QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
+QEMU_BUILD_BUG_ON(sizeof(NvmeHostBehaviorSupport) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
 QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
 QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
-- 
2.35.1




[PULL 1/6] hw/nvme: move dif/pi prototypes into dif.h

2022-03-03 Thread Klaus Jensen
From: Klaus Jensen 

Move dif/pi data structures and inlines to dif.h.

Reviewed-by: Keith Busch 
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c |  1 +
 hw/nvme/dif.c  |  1 +
 hw/nvme/dif.h  | 53 ++
 hw/nvme/nvme.h | 50 ---
 4 files changed, 55 insertions(+), 50 deletions(-)
 create mode 100644 hw/nvme/dif.h

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 98aac98bef5f..d08af3bdc1a2 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -163,6 +163,7 @@
 #include "migration/vmstate.h"
 
 #include "nvme.h"
+#include "dif.h"
 #include "trace.h"
 
 #define NVME_MAX_IOQPAIRS 0xffff
diff --git a/hw/nvme/dif.c b/hw/nvme/dif.c
index 5dbd18b2a4a5..cd0cea2b5ebd 100644
--- a/hw/nvme/dif.c
+++ b/hw/nvme/dif.c
@@ -13,6 +13,7 @@
 #include "sysemu/block-backend.h"
 
 #include "nvme.h"
+#include "dif.h"
 #include "trace.h"
 
 uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
diff --git a/hw/nvme/dif.h b/hw/nvme/dif.h
new file mode 100644
index 000000000000..e36fea30e71e
--- /dev/null
+++ b/hw/nvme/dif.h
@@ -0,0 +1,53 @@
+#ifndef HW_NVME_DIF_H
+#define HW_NVME_DIF_H
+
+/* from Linux kernel (crypto/crct10dif_common.c) */
+static const uint16_t t10_dif_crc_table[256] = {
+0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
+0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
+0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
+0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
+0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1,
+0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C,
+0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C,
+0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781,
+0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8,
+0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255,
+0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925,
+0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698,
+0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472,
+0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF,
+0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF,
+0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02,
+0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA,
+0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067,
+0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17,
+0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA,
+0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640,
+0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD,
+0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D,
+0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30,
+0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759,
+0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4,
+0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394,
+0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29,
+0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3,
+0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E,
+0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E,
+0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3
+};
+
+uint16_t nvme_check_prinfo(NvmeNamespace *ns, uint8_t prinfo, uint64_t slba,
+   uint32_t reftag);
+uint16_t nvme_dif_mangle_mdata(NvmeNamespace *ns, uint8_t *mbuf, size_t mlen,
+   uint64_t slba);
+void nvme_dif_pract_generate_dif(NvmeNamespace *ns, uint8_t *buf, size_t len,
+ uint8_t *mbuf, size_t mlen, uint16_t apptag,
+ uint32_t *reftag);
+uint16_t nvme_dif_check(NvmeNamespace *ns, uint8_t *buf, size_t len,
+uint8_t *mbuf, size_t mlen, uint8_t prinfo,
+uint64_t slba, uint16_t apptag,
+uint16_t appmask, uint32_t *reftag);
+uint16_t nvme_dif_rw(NvmeCtrl *n, NvmeRequest *req);
+
+#endif /* HW_NVME_DIF_H */
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 90c0bb7ce236..801176a2bd5e 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -513,54 +513,4 @@ void nvme_rw_complete_cb(void *opaque, int ret);
 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
NvmeCmd *cmd);
 
-/* from Linux kernel (crypto/crct10dif_common.c) */
-static const uint16_t t10_dif_crc_table[256] = {
-0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B,
-0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6,
-0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6,
-0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B,
-

[PULL 0/6] hw/nvme updates

2022-03-03 Thread Klaus Jensen
From: Klaus Jensen 

Hi Peter,

Last round of hw/nvme updates for v7.0.

The following changes since commit 64ada298b98a51eb2512607f6e6180cb330c47b1:

  Merge remote-tracking branch 'remotes/legoater/tags/pull-ppc-20220302' into 
staging (2022-03-02 12:38:46 +0000)

are available in the Git repository at:

  git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request

for you to fetch changes up to 44219b6029fc52d5e967a963be91a9cf33f9f185:

  hw/nvme: 64-bit pi support (2022-03-03 09:30:21 +0100)


hw/nvme updates

- add enhanced protection information (64-bit guard)



Klaus Jensen (3):
  hw/nvme: move dif/pi prototypes into dif.h
  hw/nvme: move format parameter parsing
  hw/nvme: add pi tuple size helper

Naveen Nagar (3):
  hw/nvme: add host behavior support feature
  hw/nvme: add support for the lbafee hbs feature
  hw/nvme: 64-bit pi support

 hw/nvme/ctrl.c   | 235 +--
 hw/nvme/dif.c| 378 +--
 hw/nvme/dif.h| 191 ++
 hw/nvme/ns.c |  50 --
 hw/nvme/nvme.h   |  58 +--
 hw/nvme/trace-events |  12 +-
 include/block/nvme.h |  81 --
 7 files changed, 793 insertions(+), 212 deletions(-)
 create mode 100644 hw/nvme/dif.h

-- 
2.35.1




Re: [PATCH 5/5] iotests: fortify compare_images() against crashes

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 03:59:02PM -0500, John Snow wrote:
> Fority compare_images() to be more discerning about the status codes it

Fortify

> receives. If qemu_img() returns an exit code that implies it didn't
> actually perform the comparison, treat that as an exceptional
> circumstance and force the caller to be aware of the peril.
> 
> If a negative test is desired (Perhaps to test how qemu_img compare

perhaps

> behaves on malformed images, for instance), it is still possible to
> catch the exception in the test and deal with that circumstance
> manually.
> 
> Signed-off-by: John Snow 
> ---
>  tests/qemu-iotests/iotests.py | 21 -
>  1 file changed, 16 insertions(+), 5 deletions(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 4/5] iotests: make qemu_img raise on non-zero rc by default

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 03:59:01PM -0500, John Snow wrote:
> re-write qemu_img() as a function that will by default raise a
> VerboseProcessException (extended from CalledProcessException) on
> non-zero return codes. This will produce a stack trace that will show
> the command line arguments and return code from the failed process run.
> 
> Users that want something more flexible (there appears to be only one)
> can use check=False and manage the return themselves. However, when the
> return code is negative, the Exception will be raised no matter what.
> This is done under the belief that there's no legitimate reason, even in
> negative tests, to see a crash from qemu-img.
> 
> Signed-off-by: John Snow 
> ---
>  tests/qemu-iotests/257|  8 --
>  tests/qemu-iotests/iotests.py | 54 +++
>  2 files changed, 53 insertions(+), 9 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 2/5] python/utils: add VerboseProcessError

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 03:58:59PM -0500, John Snow wrote:
> This adds an Exception that extends the Python stdlib
> subprocess.CalledProcessError.
> 
> The difference is that the str() method of this exception also adds the
> stdout/stderr logs. In effect, if this exception goes unhandled, Python
> will print the output in a visually distinct wrapper to the terminal so
> that it's easy to spot in a sea of traceback information.
> 
> Signed-off-by: John Snow 
> ---
>  python/qemu/utils/__init__.py | 36 +++
>  1 file changed, 36 insertions(+)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 4/9] util/oslib-win32: Return NULL on qemu_try_memalign() with zero size

2022-03-03 Thread Richard Henderson

On 3/3/22 06:55, Peter Maydell wrote:

Alternately, force size == 1, so that we always get a non-NULL value that can 
be freed.
That's a change on the POSIX side as well, of course.


Yes, I had a look at what actual malloc() implementations tend
to do, and the answer seems to be that forcing size to 1 gives
less weird behaviour for the application. So here that would be

if (size == 0) {
size++;
}
ptr = _aligned_malloc(size, alignment);

We don't need to do anything on the POSIX side (unless we want to
enforce consistency of handling the size==0 case).


I would do this unconditionally.  The POSIX manpage says that either NULL or a unique 
pointer is a valid return value into *memptr here for size == 0.  What we want in our 
caller is NULL if and only if error.



I'd quite like to get this series in before softfreeze (though mostly
just for my personal convenience so it's not hanging around as a
loose end I have to come back to after we reopen for 7.1). Does anybody
object if I squash in that change and put this in a pullrequest,
or would you prefer to see a v2 series first?


I'm happy with a squash and PR.


r~



Re: [PATCH 04/12] qemu-nbd: add --tls-hostname option for TLS certificate validation

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 04:03:22PM +0000, Daniel P. Berrangé wrote:
> When using the --list option, qemu-nbd acts as an NBD client rather
> than a server. As such when using TLS, it has a need to validate
> the server certificate. This adds a --tls-hostname option which can
> be used to override the default hostname used for certificate
> validation.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  docs/tools/qemu-nbd.rst | 14 ++
>  qemu-nbd.c  | 17 -
>  2 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst
> index 6031f96893..acce54a39d 100644
> --- a/docs/tools/qemu-nbd.rst
> +++ b/docs/tools/qemu-nbd.rst
> @@ -169,6 +169,20 @@ driver options if ``--image-opts`` is specified.
>option; or provide the credentials needed for connecting as a client
>in list mode.
>  
> +.. option:: --tls-hostname=hostname
> +
> +  When validating an x509 certificate received over a TLS connection,
> +  the hostname that the NBD client used to connect will be checked
> +  against information in the server provided certificate. Sometimes
> +  it might be required to override the hostname used to perform this
> +  check. For example if the NBD client is using a tunnel from localhost
> +  to connect to the remote server. In this case the `--tls-hostname`

For example, if the ... to the remote server, the `--tls-hostname`


> +  option should be used to set the officially expected hostname of
> +  the remote NBD server. This can also be used if accessing NBD over
> +  a UNIX socket where there is no inherant hostname available. This

inherent

> +  only is only permitted when acting as a NBD client with the `--list`

s/only is/is/

> +  option.
> +
>  .. option:: --fork
>  

> @@ -835,6 +841,10 @@ int main(int argc, char **argv)
>  error_report("TLS authorization is incompatible with export 
> list");
>  exit(EXIT_FAILURE);
>  }
> +if (tlshostname && !list) {
> +error_report("TLS hostname is only required with export list");

maybe s/required/supported/

> +exit(EXIT_FAILURE);
> +}
>  tlscreds = nbd_get_tls_creds(tlscredsid, list, &local_err);
>  if (local_err) {
>  error_reportf_err(local_err, "Failed to get TLS creds: ");
> @@ -845,6 +855,10 @@ int main(int argc, char **argv)
>  error_report("--tls-authz is not permitted without --tls-creds");
>  exit(EXIT_FAILURE);
>  }
> +if (tlshostname) {
> +error_report("--tls-hostname is not permitted without 
> --tls-creds");
> +exit(EXIT_FAILURE);
> +}
>  }
>  
>  if (selinux_label) {
> @@ -861,7 +875,8 @@ int main(int argc, char **argv)
>  
>  if (list) {
>  saddr = nbd_build_socket_address(sockpath, bindto, port);
> -return qemu_nbd_client_list(saddr, tlscreds, bindto);
> +return qemu_nbd_client_list(saddr, tlscreds,
> +tlshostname ? tlshostname : bindto);

With the grammar fixes,

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 1/5] python/utils: add add_visual_margin() text decoration utility

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 03:58:58PM -0500, John Snow wrote:
> >>> print(add_visual_margin(msg, width=72, name="Commit Message"))
> ┏━ Commit Message ━━
> ┃ add_visual_margin() takes a chunk of text and wraps it in a visual
> ┃ container that force-wraps to a specified width. An optional title
> ┃ label may be given, and any of the individual glyphs used to draw the
> ┃ box may be replaced or specified as well.
> ┗━━━

I see you dropped the right margin compared to earlier versions, but
agree that this is still a nice visual indicator, and probably easier
to maintain in this form.  And it got rid of the weird spacing on the
left when the wrap point hit at the wrong time.

> +Decorate and wrap some text with a visual decoration around it.
> +
> +This function assumes that the text decoration characters are single
> +characters that display using a single monospace column.
> +
> +┏━ Example ━
> +┃ This is what this function looks like with text content that's
> +┃ wrapped to 72 characters. The right-hand margin is left open to
> +┃ accommodate the occasional unicode character that might make
> +┃ predicting the total "visual" width of a line difficult. This
> +┃ provides a visual distinction that's good-enough, though.
> +┗━━━

Yep - hand-waving away Unicode messiness is certainly easiest ;)

Reviewed-by: Eric Blake 

[take with a grain of salt - my python is weak. But as you said in the
cover letter, it's fairly straightforward to reproduce an environment
where you can see it in action for hands-on testing]

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 4/4] iotests/185: Add post-READY quit tests

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 05:48:14PM +0100, Hanna Reitz wrote:
> 185 tests quitting qemu while a block job is active.  It does not
> specifically test quitting qemu while a mirror or active commit job is
> in its READY phase.
> 
> Add two test cases for this, where we respectively mirror or commit to
> an external QSD instance, which provides a throttled block device.  qemu
> is supposed to cancel the job so that it can quit as soon as possible
> instead of waiting for the job to complete (which it did before 6.2).
> 
> Signed-off-by: Hanna Reitz 
> ---
>  tests/qemu-iotests/185 | 190 -
>  tests/qemu-iotests/185.out |  48 ++
>  2 files changed, 237 insertions(+), 1 deletion(-)
> 
> diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185
> index f2ec5c5ceb..8b1143dc16 100755
> --- a/tests/qemu-iotests/185
> +++ b/tests/qemu-iotests/185
> @@ -33,6 +33,12 @@ _cleanup()
>  _rm_test_img "${TEST_IMG}.copy"
>  _cleanup_test_img
>  _cleanup_qemu
> +
> +if [ -f "$TEST_DIR/qsd.pid" ]; then
> +kill -SIGKILL "$(cat "$TEST_DIR/qsd.pid")"
> +rm -f "$TEST_DIR/qsd.pid"
> +fi
> +rm -f "$SOCK_DIR/qsd.sock"
>  }
>  trap "_cleanup; exit \$status" 0 1 2 3 15
>  
> @@ -45,7 +51,7 @@ _supported_fmt qcow2
>  _supported_proto file
>  _supported_os Linux
>  
> -size=64M
> +size=$((64 * 1048576))

I tend to write $((64 * 1024 * 1024)) rather than remembering all the
digits of 2^20, but your way is fine.

Nice test addition!

Reviewed-by: Eric Blake 

I'm happy to queue this series through my NBD tree in time for
softfreeze, if no one else speaks for it first.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 3/4] qsd: Add --daemonize

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 05:48:13PM +0100, Hanna Reitz wrote:
> To implement this, we reuse the existing daemonizing functions from the
> system emulator, which mainly do the following:
> - Fork off a child process, and set up a pipe between parent and child
> - The parent process waits until the child sends a status byte over the
>   pipe (0 means that the child was set up successfully; anything else
>   (including errors or EOF) means that the child was not set up
>   successfully), and then exits with an appropriate exit status
> - The child process enters a new session (forking off again), changes
>   the umask, and will ignore terminal signals from then on
> - Once set-up is complete, the child will chdir to /, redirect all
>   standard I/O streams to /dev/null, and tell the parent that set-up has
>   been completed successfully
> 
> In contrast to qemu-nbd's --fork implementation, during the set up
> phase, error messages are not piped through the parent process.
> qemu-nbd mainly does this to detect errors, though (while os_daemonize()
> has the child explicitly signal success after set up); because we do not
> redirect stderr after forking, error messages continue to appear on
> whatever the parent's stderr was (until set up is complete).
> 
> Signed-off-by: Hanna Reitz 
> ---
>  docs/tools/qemu-storage-daemon.rst   |  7 +++
>  storage-daemon/qemu-storage-daemon.c | 15 +++
>  2 files changed, 22 insertions(+)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 2/4] qsd: Add pre-init argument parsing pass

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 05:48:12PM +0100, Hanna Reitz wrote:
> In contrast to qemu-nbd (where it is called --fork) and the system
> emulator, QSD does not have a --daemonize switch yet.  Just like them,
> QSD allows setting up block devices and exports on the command line.
> When doing so, it is often necessary for whoever invoked the QSD to wait
> until these exports are fully set up.  A --daemonize switch allows
> precisely this, by virtue of the parent process exiting once everything
> is set up.
> 
> Note that there are alternative ways of waiting for all exports to be
> set up, for example:
> - Passing the --pidfile option and waiting until the respective file
>   exists (but I do not know if there is a way of implementing this
>   without a busy wait loop)

Non-portably, you could use inotify or similar, to get a true
event-driven wakeup when the file is created.  And here's the python
glue that libnbd uses, instead of --pidfile:

https://gitlab.com/nbdkit/libnbd/-/blob/master/interop/interop-qemu-storage-daemon.sh#L58

> - Set up some network server (e.g. on a Unix socket) and have the QSD
>   connect to it after all arguments have been processed by appending
>   corresponding --chardev and --monitor options to the command line,
>   and then wait until the QSD connects
> 
> Having a --daemonize option would make this simpler, though, without
> having to rely on additional tools (to set up a network server) or busy
> waiting.
> 
> Implementing a --daemonize switch means having to fork the QSD process.
> Ideally, we should do this as early as possible: All the parent process
> has to do is to wait for the child process to signal completion of its
> set-up phase, and therefore there is basically no initialization that
> needs to be done before the fork.  On the other hand, forking after
> initialization steps means having to consider how those steps (like
> setting up the block layer or QMP) interact with a later fork, which is
> often not trivial.
> 
> In order to fork this early, we must scan the command line for
> --daemonize long before our current process_options() call.  Instead of
> adding custom new code to do so, just reuse process_options() and give
> it a @pre_init_pass argument to distinguish the two passes.  I believe
> there are some other switches but --daemonize that deserve parsing in

s/but/beyond/

> the first pass:
> 
> - --help and --version are supposed to only print some text and then
>   immediately exit (so any initialization we do would be for naught).
>   This changes behavior, because now "--blockdev inv-drv --help" will
>   print a help text instead of complaining about the --blockdev
>   argument.
>   Note that this is similar in behavior to other tools, though: "--help"
>   is generally immediately acted upon when finding it in the argument
>   list, potentially before other arguments (even ones before it) are
>   acted on.  For example, "ls /does-not-exist --help" prints a help text
>   and does not complain about ENOENT.

Well, GNU ls does that, but only if POSIXLY_CORRECT is not set (a
strict POSIX ls must give you two ENOENT).

> 
> - --pidfile does not need initialization, and is already exempted from
>   the sequential order that process_options() claims to strictly follow
>   (the PID file is only created after all arguments are processed, not
>   at the time the --pidfile argument appears), so it makes sense to
>   include it in the same category as --daemonize.
> 
> - Invalid arguments should always be reported as soon as possible.  (The
>   same caveat with --help applies: That means that "--blockdev inv-drv
>   --inv-arg" will now complain about --inv-arg, not inv-drv.)
> 
> This patch does make some references to --daemonize without having
> implemented it yet, but that will happen in the next patch.
> 
> Signed-off-by: Hanna Reitz 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> ---
>  storage-daemon/qemu-storage-daemon.c | 43 
>  1 file changed, 38 insertions(+), 5 deletions(-)
>

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 03/12] block/nbd: support override of hostname for TLS certificate validation

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 04:03:21PM +, Daniel P. Berrangé wrote:
> When connecting to an NBD server with TLS and x509 credentials,
> the client must validate the hostname it uses for the connection,
> against that published in the server's certificate. If the client
> is tunnelling its connection over some other channel, however, the
> hostname it uses may not match the info reported in the server's
> certificate. In such a case, the user needs to explicitly set an
> override for the hostname to use for certificate validation.
> 
> This is achieved by adding a 'tls-hostname' property to the NBD
> block driver.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  block/nbd.c  | 18 +++---
>  qapi/block-core.json |  3 +++
>  2 files changed, 18 insertions(+), 3 deletions(-)
> 
> +++ b/qapi/block-core.json
> @@ -4078,6 +4078,8 @@
>  #
>  # @tls-creds: TLS credentials ID
>  #
> +# @tls-hostname: TLS hostname override for certificate validation

Add the tag '(since 7.0)' (in the interest of soft freeze deadlines, I
can do that as part of queuing through my NBD tree if nothing else
major turns up in the series), and you can have:

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




[PATCH 5/5] iotests: fortify compare_images() against crashes

2022-03-03 Thread John Snow
Fortify compare_images() to be more discerning about the status codes it
receives. If qemu_img() returns an exit code that implies it didn't
actually perform the comparison, treat that as an exceptional
circumstance and force the caller to be aware of the peril.

If a negative test is desired (perhaps to test how qemu_img compare
behaves on malformed images, for instance), it is still possible to
catch the exception in the test and deal with that circumstance
manually.

Signed-off-by: John Snow 
---
 tests/qemu-iotests/iotests.py | 21 -
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index f6c0f1b0a0..654ce834e6 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -474,11 +474,22 @@ def qemu_nbd_popen(*args):
 p.kill()
 p.wait()
 
-def compare_images(img1, img2, fmt1=imgfmt, fmt2=imgfmt):
-'''Return True if two image files are identical'''
-res = qemu_img('compare', '-f', fmt1,
-   '-F', fmt2, img1, img2, check=False)
-return res.returncode == 0
+def compare_images(img1: str, img2: str,
+   fmt1: str = imgfmt, fmt2: str = imgfmt) -> bool:
+"""
+Compare two images with QEMU_IMG; return True if they are identical.
+
+:raise CalledProcessError:
+when qemu-img crashes or returns a status code of anything other
+than 0 (identical) or 1 (different).
+"""
+try:
+qemu_img('compare', '-f', fmt1, '-F', fmt2, img1, img2)
+return True
+except subprocess.CalledProcessError as exc:
+if exc.returncode == 1:
+return False
+raise
 
 def create_image(name, size):
 '''Create a fully-allocated raw image with sector markers'''
-- 
2.34.1




[PATCH 3/5] iotests: Remove explicit checks for qemu_img() == 0

2022-03-03 Thread John Snow
qemu_img() returning zero ought to be the rule, not the
exception. Remove all explicit checks against the condition in
preparation for making non-zero returns an Exception.

Signed-off-by: John Snow 
Reviewed-by: Eric Blake 
---
 tests/qemu-iotests/163 |  9 +++--
 tests/qemu-iotests/216 |  6 +++---
 tests/qemu-iotests/218 |  2 +-
 tests/qemu-iotests/224 | 11 +--
 tests/qemu-iotests/228 | 12 ++--
 tests/qemu-iotests/257 |  3 +--
 tests/qemu-iotests/258 |  4 ++--
 tests/qemu-iotests/310 | 14 +++---
 tests/qemu-iotests/tests/block-status-cache|  3 +--
 tests/qemu-iotests/tests/image-fleecing|  4 ++--
 tests/qemu-iotests/tests/mirror-ready-cancel-error |  6 ++
 tests/qemu-iotests/tests/mirror-top-perms  |  3 +--
 .../qemu-iotests/tests/remove-bitmap-from-backing  |  8 
 tests/qemu-iotests/tests/stream-error-on-reset |  4 ++--
 14 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/tests/qemu-iotests/163 b/tests/qemu-iotests/163
index b8bfc95358..e4cd4b230f 100755
--- a/tests/qemu-iotests/163
+++ b/tests/qemu-iotests/163
@@ -107,8 +107,7 @@ class ShrinkBaseClass(iotests.QMPTestCase):
 
 if iotests.imgfmt == 'raw':
 return
-self.assertEqual(qemu_img('check', test_img), 0,
- "Verifying image corruption")
+qemu_img('check', test_img)
 
 def test_empty_image(self):
 qemu_img('resize',  '-f', iotests.imgfmt, '--shrink', test_img,
@@ -130,8 +129,7 @@ class ShrinkBaseClass(iotests.QMPTestCase):
 qemu_img('resize',  '-f', iotests.imgfmt, '--shrink', test_img,
  self.shrink_size)
 
-self.assertEqual(qemu_img("compare", test_img, check_img), 0,
- "Verifying image content")
+qemu_img("compare", test_img, check_img)
 
 self.image_verify()
 
@@ -146,8 +144,7 @@ class ShrinkBaseClass(iotests.QMPTestCase):
 qemu_img('resize',  '-f', iotests.imgfmt, '--shrink', test_img,
  self.shrink_size)
 
-self.assertEqual(qemu_img("compare", test_img, check_img), 0,
- "Verifying image content")
+qemu_img("compare", test_img, check_img)
 
 self.image_verify()
 
diff --git a/tests/qemu-iotests/216 b/tests/qemu-iotests/216
index c02f8d2880..88b385afa3 100755
--- a/tests/qemu-iotests/216
+++ b/tests/qemu-iotests/216
@@ -51,10 +51,10 @@ with iotests.FilePath('base.img') as base_img_path, \
 log('--- Setting up images ---')
 log('')
 
-assert qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M') == 0
+qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M')
 assert qemu_io_silent(base_img_path, '-c', 'write -P 1 0M 1M') == 0
-assert qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path,
-'-F', iotests.imgfmt, top_img_path) == 0
+qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path,
+ '-F', iotests.imgfmt, top_img_path)
 assert qemu_io_silent(top_img_path,  '-c', 'write -P 2 1M 1M') == 0
 
 log('Done')
diff --git a/tests/qemu-iotests/218 b/tests/qemu-iotests/218
index 4922b4d3b6..853ed52b34 100755
--- a/tests/qemu-iotests/218
+++ b/tests/qemu-iotests/218
@@ -145,7 +145,7 @@ log('')
 with iotests.VM() as vm, \
  iotests.FilePath('src.img') as src_img_path:
 
-assert qemu_img('create', '-f', iotests.imgfmt, src_img_path, '64M') == 0
+qemu_img('create', '-f', iotests.imgfmt, src_img_path, '64M')
 assert qemu_io_silent('-f', iotests.imgfmt, src_img_path,
   '-c', 'write -P 42 0M 64M') == 0
 
diff --git a/tests/qemu-iotests/224 b/tests/qemu-iotests/224
index 38dd153625..c31c55b49d 100755
--- a/tests/qemu-iotests/224
+++ b/tests/qemu-iotests/224
@@ -47,12 +47,11 @@ for filter_node_name in False, True:
  iotests.FilePath('top.img') as top_img_path, \
  iotests.VM() as vm:
 
-assert qemu_img('create', '-f', iotests.imgfmt,
-base_img_path, '64M') == 0
-assert qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path,
-'-F', iotests.imgfmt, mid_img_path) == 0
-assert qemu_img('create', '-f', iotests.imgfmt, '-b', mid_img_path,
-'-F', iotests.imgfmt, top_img_path) == 0
+qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M')
+qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path,
+ '-F', iotests.imgfmt, mid_img_path)
+qemu_img('create', '-f', iotests.imgfmt, '-b', mid_img_path,
+ '-F', iotests.imgfmt, top_img_path)
 
 # Something to commit
 assert qemu_io_silent(mid_img_path, '-c', 'write -P 1 0 1M') == 0
diff 

[PATCH 2/5] python/utils: add VerboseProcessError

2022-03-03 Thread John Snow
This adds an Exception that extends the Python stdlib
subprocess.CalledProcessError.

The difference is that the str() method of this exception also adds the
stdout/stderr logs. In effect, if this exception goes unhandled, Python
will print the output in a visually distinct wrapper to the terminal so
that it's easy to spot in a sea of traceback information.

Signed-off-by: John Snow 
---
 python/qemu/utils/__init__.py | 36 +++
 1 file changed, 36 insertions(+)

diff --git a/python/qemu/utils/__init__.py b/python/qemu/utils/__init__.py
index 5babf40df2..355ac550bc 100644
--- a/python/qemu/utils/__init__.py
+++ b/python/qemu/utils/__init__.py
@@ -18,6 +18,7 @@
 import os
 import re
 import shutil
+from subprocess import CalledProcessError
 import textwrap
 from typing import Optional
 
@@ -26,6 +27,7 @@
 
 
 __all__ = (
+'VerboseProcessError',
 'add_visual_margin',
 'get_info_usernet_hostfwd_port',
 'kvm_available',
@@ -121,3 +123,37 @@ def _wrap(line: str) -> str:
 os.linesep.join(_wrap(line) for line in content.splitlines()),
 _bar(None, top=False),
 ))
+
+
+class VerboseProcessError(CalledProcessError):
+"""
+The same as CalledProcessError, but more verbose.
+
+This is useful for debugging failed calls during test executions.
+The return code, signal (if any), and terminal output will be displayed
+on unhandled exceptions.
+"""
+def summary(self) -> str:
+"""Return the normal CalledProcessError str() output."""
+return super().__str__()
+
+def __str__(self) -> str:
+lmargin = '  '
+width = -len(lmargin)
+sections = []
+
+name = 'output' if self.stderr is None else 'stdout'
+if self.stdout:
+sections.append(add_visual_margin(self.stdout, width, name))
+else:
+sections.append(f"{name}: N/A")
+
+if self.stderr:
+sections.append(add_visual_margin(self.stderr, width, 'stderr'))
+elif self.stderr is not None:
+sections.append("stderr: N/A")
+
+return os.linesep.join((
+self.summary(),
+textwrap.indent(os.linesep.join(sections), prefix=lmargin),
+))
-- 
2.34.1




[PATCH 1/5] python/utils: add add_visual_margin() text decoration utility

2022-03-03 Thread John Snow
>>> print(add_visual_margin(msg, width=72, name="Commit Message"))
┏━ Commit Message ━━
┃ add_visual_margin() takes a chunk of text and wraps it in a visual
┃ container that force-wraps to a specified width. An optional title
┃ label may be given, and any of the individual glyphs used to draw the
┃ box may be replaced or specified as well.
┗━━━

Signed-off-by: John Snow 
---
 python/qemu/utils/__init__.py | 78 +++
 1 file changed, 78 insertions(+)

diff --git a/python/qemu/utils/__init__.py b/python/qemu/utils/__init__.py
index 7f1a5138c4..5babf40df2 100644
--- a/python/qemu/utils/__init__.py
+++ b/python/qemu/utils/__init__.py
@@ -15,7 +15,10 @@
 # the COPYING file in the top-level directory.
 #
 
+import os
 import re
+import shutil
+import textwrap
 from typing import Optional
 
 # pylint: disable=import-error
@@ -23,6 +26,7 @@
 
 
 __all__ = (
+'add_visual_margin',
 'get_info_usernet_hostfwd_port',
 'kvm_available',
 'list_accel',
@@ -43,3 +47,77 @@ def get_info_usernet_hostfwd_port(info_usernet_output: str) 
-> Optional[int]:
 if match is not None:
 return int(match[1])
 return None
+
+
+# pylint: disable=too-many-arguments
+def add_visual_margin(
+content: str = '',
+width: Optional[int] = None,
+name: Optional[str] = None,
+padding: int = 1,
+upper_left: str = '┏',
+lower_left: str = '┗',
+horizontal: str = '━',
+vertical: str = '┃',
+) -> str:
+"""
+Decorate and wrap some text with a visual decoration around it.
+
+This function assumes that the text decoration characters are single
+characters that display using a single monospace column.
+
+┏━ Example ━
+┃ This is what this function looks like with text content that's
+┃ wrapped to 72 characters. The right-hand margin is left open to
+┃ accommodate the occasional unicode character that might make
+┃ predicting the total "visual" width of a line difficult. This
+┃ provides a visual distinction that's good-enough, though.
+┗━━━
+
+:param content: The text to wrap and decorate.
+:param width:
+The number of columns to use, including for the decoration
+itself. The default (None) uses the available width of the
+current terminal, or a fallback of 72 columns. A negative number
+subtracts a fixed-width from the default size. The default obeys
+the COLUMNS environment variable, if set.
+:param name: A label to apply to the upper-left of the box.
+:param padding: How many columns of padding to apply inside.
+:param upper_left: Upper-left single-width text decoration character.
+:param lower_left: Lower-left single-width text decoration character.
+:param horizontal: Horizontal single-width text decoration character.
+:param vertical: Vertical single-width text decoration character.
+"""
+if width is None or width < 0:
+avail = shutil.get_terminal_size(fallback=(72, 24))[0]
+if width is None:
+_width = avail
+else:
+_width = avail + width
+else:
+_width = width
+
+prefix = vertical + (' ' * padding)
+
+def _bar(name: Optional[str], top: bool = True) -> str:
+ret = upper_left if top else lower_left
+if name is not None:
+ret += f"{horizontal} {name} "
+
+filler_len = _width - len(ret)
+ret += f"{horizontal * filler_len}"
+return ret
+
+def _wrap(line: str) -> str:
+return os.linesep.join(
+textwrap.wrap(
+line, width=_width - padding, initial_indent=prefix,
+subsequent_indent=prefix, replace_whitespace=False,
+drop_whitespace=True, break_on_hyphens=False)
+)
+
+return os.linesep.join((
+_bar(name, top=True),
+os.linesep.join(_wrap(line) for line in content.splitlines()),
+_bar(None, top=False),
+))
-- 
2.34.1




[PATCH 4/5] iotests: make qemu_img raise on non-zero rc by default

2022-03-03 Thread John Snow
Rewrite qemu_img() as a function that will by default raise a
VerboseProcessError (extended from CalledProcessError) on
non-zero return codes. This will produce a stack trace that will show
the command line arguments and return code from the failed process run.

Users that want something more flexible (there appears to be only one)
can use check=False and manage the return themselves. However, when the
return code is negative, the Exception will be raised no matter what.
This is done under the belief that there's no legitimate reason, even in
negative tests, to see a crash from qemu-img.

Signed-off-by: John Snow 
---
 tests/qemu-iotests/257|  8 --
 tests/qemu-iotests/iotests.py | 54 +++
 2 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/tests/qemu-iotests/257 b/tests/qemu-iotests/257
index fb5359c581..e7e7a2317e 100755
--- a/tests/qemu-iotests/257
+++ b/tests/qemu-iotests/257
@@ -241,11 +241,13 @@ def compare_images(image, reference, baseimg=None, 
expected_match=True):
 expected_ret = 0 if expected_match else 1
 if baseimg:
 qemu_img("rebase", "-u", "-b", baseimg, '-F', iotests.imgfmt, image)
-ret = qemu_img("compare", image, reference)
+
+sub = qemu_img("compare", image, reference, check=False)
+
 log('qemu_img compare "{:s}" "{:s}" ==> {:s}, {:s}'.format(
 image, reference,
-"Identical" if ret == 0 else "Mismatch",
-"OK!" if ret == expected_ret else "ERROR!"),
+"Identical" if sub.returncode == 0 else "Mismatch",
+"OK!" if sub.returncode == expected_ret else "ERROR!"),
 filters=[iotests.filter_testfiles])
 
 def test_bitmap_sync(bsync_mode, msync_mode='bitmap', failure=None):
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 6ba65eb1ff..f6c0f1b0a0 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -39,6 +39,7 @@
 
 from qemu.machine import qtest
 from qemu.qmp import QMPMessage
+from qemu.utils import VerboseProcessError
 
 # Use this logger for logging messages directly from the iotests module
 logger = logging.getLogger('qemu.iotests')
@@ -215,9 +216,49 @@ def qemu_img_pipe_and_status(*args: str) -> Tuple[str, 
int]:
 return qemu_tool_pipe_and_status('qemu-img', full_args,
  drop_successful_output=is_create)
 
-def qemu_img(*args: str) -> int:
-'''Run qemu-img and return the exit code'''
-return qemu_img_pipe_and_status(*args)[1]
+def qemu_img(*args: str, check: bool = True, combine_stdio: bool = True
+ ) -> subprocess.CompletedProcess[str]:
+"""
+Run qemu_img and return the status code and console output.
+
+This function always prepends QEMU_IMG_OPTIONS and may further alter
+the args for 'create' commands.
+
+:param args: command-line arguments to qemu-img.
+:param check: Enforce a return code of zero.
+:param combine_stdio: set to False to keep stdout/stderr separated.
+
+:raise VerboseProcessError:
+When the return code is negative, or on any non-zero exit code
+when 'check=True' was provided (the default). This exception has
+'stdout', 'stderr', and 'returncode' properties that may be
+inspected to show greater detail. If this exception is not
+handled, the command-line, return code, and all console output
+will be included at the bottom of the stack trace.
+
+:return: a CompletedProcess. This object has args, returncode, and
+stdout properties. If streams are not combined, it will also
+have a stderr property.
+"""
+full_args = qemu_img_args + qemu_img_create_prepare_args(list(args))
+
+subp = subprocess.run(
+full_args,
+stdout=subprocess.PIPE,
+stderr=subprocess.STDOUT if combine_stdio else subprocess.PIPE,
+universal_newlines=True,
+check=False
+)
+
+if check and subp.returncode or (subp.returncode < 0):
+raise VerboseProcessError(
+subp.returncode, full_args,
+output=subp.stdout,
+stderr=subp.stderr,
+)
+
+return subp
+
 
 def ordered_qmp(qmsg, conv_keys=True):
 # Dictionaries are not ordered prior to 3.6, therefore:
@@ -232,7 +273,7 @@ def ordered_qmp(qmsg, conv_keys=True):
 return od
 return qmsg
 
-def qemu_img_create(*args):
+def qemu_img_create(*args: str) -> subprocess.CompletedProcess[str]:
 return qemu_img('create', *args)
 
 def qemu_img_measure(*args):
@@ -435,8 +476,9 @@ def qemu_nbd_popen(*args):
 
 def compare_images(img1, img2, fmt1=imgfmt, fmt2=imgfmt):
 '''Return True if two image files are identical'''
-return qemu_img('compare', '-f', fmt1,
-'-F', fmt2, img1, img2) == 0
+res = qemu_img('compare', '-f', fmt1,
+   '-F', fmt2, img1, img2, check=False)
+return res.returncode == 0
 
 def create_image(name, size):
 '''Create a 

[PATCH 0/5] iotests: add enhanced debugging info to qemu-img failures

2022-03-03 Thread John Snow
This is kinda-sorta V3-ish of a series I started in response to Thomas
Huth's encountering a failure in qemu-img because of missing zstd
support. This series changes the qemu_img() function in iotests.py to
one that raises an Exception on non-zero return code by default.

Alongside this, the Exception object itself is also augmented so that it
prints the stdout/stderr logs to screen if the exception goes unhandled
so that failure cases are very obvious and easy to spot in the middle of
python tracebacks.

(Test this out yourself: Disable zstd support and then run qcow2 iotest
065 before and after this patchset. It makes a real difference!)

NOTE: I have another 13-ish patches that go the rest of the way and
ensure that *every* call to qemu-img goes through this new qemu_img()
function, but for the sake of doing the most good in the shortest amount
of time, I am sending just the first 5 patches, and the rest will be
sent later. I think this is a very good series to get in before freeze
so that we have it during the heavy testing season.

John Snow (5):
  python/utils: add add_visual_margin() text decoration utility
  python/utils: add VerboseProcessError
  iotests: Remove explicit checks for qemu_img() == 0
  iotests: make qemu_img raise on non-zero rc by default
  iotests: fortify compare_images() against crashes

 python/qemu/utils/__init__.py | 114 ++
 tests/qemu-iotests/163|   9 +-
 tests/qemu-iotests/216|   6 +-
 tests/qemu-iotests/218|   2 +-
 tests/qemu-iotests/224|  11 +-
 tests/qemu-iotests/228|  12 +-
 tests/qemu-iotests/257|  11 +-
 tests/qemu-iotests/258|   4 +-
 tests/qemu-iotests/310|  14 +--
 tests/qemu-iotests/iotests.py |  69 +--
 tests/qemu-iotests/tests/block-status-cache   |   3 +-
 tests/qemu-iotests/tests/image-fleecing   |   4 +-
 .../tests/mirror-ready-cancel-error   |   6 +-
 tests/qemu-iotests/tests/mirror-top-perms |   3 +-
 .../tests/remove-bitmap-from-backing  |   8 +-
 .../qemu-iotests/tests/stream-error-on-reset  |   4 +-
 16 files changed, 220 insertions(+), 60 deletions(-)

-- 
2.34.1





Re: [PATCH v2 1/4] os-posix: Add os_set_daemonize()

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 05:48:11PM +0100, Hanna Reitz wrote:
> The daemonizing functions in os-posix (os_daemonize() and
> os_setup_post()) only daemonize the process if the static `daemonize`
> variable is set.  Right now, it can only be set by os_parse_cmd_args().
> 
> In order to use os_daemonize() and os_setup_post() from the storage
> daemon to have it be daemonized, we need some other way to set this
> `daemonize` variable, because I would rather not tap into the system
> emulator's arg-parsing code.  Therefore, this patch adds an
> os_set_daemonize() function, which will return an error on os-win32
> (because daemonizing is not supported there).
> 
> Signed-off-by: Hanna Reitz 
> ---

> +++ b/include/sysemu/os-win32.h
> @@ -77,6 +77,11 @@ typedef struct {
>  } qemu_timeval;
>  int qemu_gettimeofday(qemu_timeval *tp);
>  
> +static inline int os_set_daemonize(bool d)
> +{
> +return -ENOTSUP;

Should this fail only if d is true?  Or will all callers only ever
pass true, in which case why do we need the parameter?

> +}
> +
>  static inline bool is_daemonized(void)
>  {
>  return false;
> diff --git a/os-posix.c b/os-posix.c
> index ae6c9f2a5e..24692c8593 100644
> --- a/os-posix.c
> +++ b/os-posix.c
> @@ -317,6 +317,12 @@ bool is_daemonized(void)
>  return daemonize;
>  }
>  
> +int os_set_daemonize(bool d)
> +{
> +daemonize = d;
> +return 0;
> +}
> +
>  int os_mlock(void)
>  {
>  #ifdef HAVE_MLOCKALL
> -- 
> 2.34.1
> 
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 02/12] block: pass desired TLS hostname through from block driver client

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 04:03:20PM +, Daniel P. Berrangé wrote:
> In
> 
>   commit a71d597b989fd701b923f09b3c20ac4fcaa55e81
>   Author: Vladimir Sementsov-Ogievskiy 
>   Date:   Thu Jun 10 13:08:00 2021 +0300
> 
> block/nbd: reuse nbd_co_do_establish_connection() in nbd_open()
> 
> the use of the 'hostname' field from the BDRVNBDState struct was
> lost, and 'nbd_connect' just hardcoded it to match the IP socket
> address. This was a harmless bug at the time since we block use
> with anything other than IP sockets.
> 
> Shortly though, We want to allow the caller to override the hostname

s/We/we/

> used in the TLS certificate checks. This is to allow for TLS
> when doing port forwarding or tunneling. Thus we need to reinstate
> the passing along of the 'hostname'.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  block/nbd.c |  7 ---
>  include/block/nbd.h |  3 ++-
>  nbd/client-connection.c | 12 +---
>  3 files changed, 15 insertions(+), 7 deletions(-)

Nice - this is a great step towards fixing a longstanding annoyance of
mine that libnbd and nbdkit support TLS over Unix sockets, but qemu
didn't.

> @@ -1875,7 +1875,8 @@ static int nbd_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  }
>  
>  s->conn = nbd_client_connection_new(s->saddr, true, s->export,
> -s->x_dirty_bitmap, s->tlscreds);
> +s->x_dirty_bitmap, s->tlscreds,
> +s->tlshostname);
>  
>  if (s->open_timeout) {
>  nbd_client_connection_enable_retry(s->conn);
> diff --git a/include/block/nbd.h b/include/block/nbd.h
> index 78d101b774..a98eb665da 100644
> --- a/include/block/nbd.h
> +++ b/include/block/nbd.h
> @@ -415,7 +415,8 @@ NBDClientConnection *nbd_client_connection_new(const 
> SocketAddress *saddr,
> bool do_negotiation,
> const char *export_name,
> const char *x_dirty_bitmap,
> -   QCryptoTLSCreds *tlscreds);
> +   QCryptoTLSCreds *tlscreds,
> +   const char *tlshostname);

We already have a lot of parameters; does it make sense to bundle
tlshostname into the QCryptoTLSCreds struct at all?  But that would
change the QAPI (or maybe you do it later in the series); it is not a
show-stopper for this patch.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 01/12] crypto: mandate a hostname when checking x509 creds on a client

2022-03-03 Thread Eric Blake
On Thu, Mar 03, 2022 at 04:03:19PM +, Daniel P. Berrangé wrote:
> Currently the TLS session object assumes that the caller will always
> provide a hostname when using x509 creds on a client endpoint. This
> relies on the caller to detect and report an error if the user has
> configured QEMU with x509 credentials on a UNIX socket. The migration
> code has such a check, but it is too broad, reporting an error when
> the user has configured QEMU with PSK credentials on a UNIX socket,
> where hostnames are irrelevant.
> 
> Putting the check into the TLS session object credentials validation
> code ensures we report errors in only the scenario that matters.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  crypto/tlssession.c | 6 ++
>  1 file changed, 6 insertions(+)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




[PATCH v6 16/16] iotests/image-fleecing: test push backup with fleecing

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/tests/image-fleecing | 125 +++-
 tests/qemu-iotests/tests/image-fleecing.out |  63 ++
 2 files changed, 156 insertions(+), 32 deletions(-)

diff --git a/tests/qemu-iotests/tests/image-fleecing 
b/tests/qemu-iotests/tests/image-fleecing
index c9ffa6647e..c56278639c 100755
--- a/tests/qemu-iotests/tests/image-fleecing
+++ b/tests/qemu-iotests/tests/image-fleecing
@@ -51,9 +51,15 @@ remainder = [('0xd5', '0x108000',  '32k'), # Right-end of 
partial-left [1]
  ('0xdc', '32M',   '32k'), # Left-end of partial-right [2]
  ('0xcd', '0x3ff', '64k')] # patterns[3]
 
-def do_test(use_cbw, use_snapshot_access_filter, base_img_path,
-fleece_img_path, nbd_sock_path, vm,
+def do_test(vm, use_cbw, use_snapshot_access_filter, base_img_path,
+fleece_img_path, nbd_sock_path=None,
+target_img_path=None,
 bitmap=False):
+push_backup = target_img_path is not None
+assert (nbd_sock_path is not None) != push_backup
+if push_backup:
+assert use_cbw
+
 log('--- Setting up images ---')
 log('')
 
@@ -67,6 +73,9 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 else:
 assert qemu_img('create', '-f', 'qcow2', fleece_img_path, '64M') == 0
 
+if push_backup:
+assert qemu_img('create', '-f', 'qcow2', target_img_path, '64M') == 0
+
 for p in patterns:
 qemu_io('-f', iotests.imgfmt,
 '-c', 'write -P%s %s %s' % p, base_img_path)
@@ -141,28 +150,45 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 
 export_node = 'fl-access' if use_snapshot_access_filter else tmp_node
 
-log('')
-log('--- Setting up NBD Export ---')
-log('')
+if push_backup:
+log('')
+log('--- Starting actual backup ---')
+log('')
 
-nbd_uri = 'nbd+unix:///%s?socket=%s' % (export_node, nbd_sock_path)
-log(vm.qmp('nbd-server-start',
-   {'addr': {'type': 'unix',
- 'data': {'path': nbd_sock_path}}}))
+log(vm.qmp('blockdev-add', **{
+'driver': iotests.imgfmt,
+'node-name': 'target',
+'file': {
+'driver': 'file',
+'filename': target_img_path
+}
+}))
+log(vm.qmp('blockdev-backup', device=export_node,
+   sync='full', target='target',
+   job_id='push-backup', speed=1))
+else:
+log('')
+log('--- Setting up NBD Export ---')
+log('')
 
-log(vm.qmp('nbd-server-add', device=export_node))
+nbd_uri = 'nbd+unix:///%s?socket=%s' % (export_node, nbd_sock_path)
+log(vm.qmp('nbd-server-start',
+   {'addr': { 'type': 'unix',
+  'data': { 'path': nbd_sock_path } } }))
 
-log('')
-log('--- Sanity Check ---')
-log('')
+log(vm.qmp('nbd-server-add', device=export_node))
 
-for p in patterns + zeroes:
-cmd = 'read -P%s %s %s' % p
-log(cmd)
-out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd,
-   nbd_uri)
-if ret != 0:
-print(out)
+log('')
+log('--- Sanity Check ---')
+log('')
+
+for p in patterns + zeroes:
+cmd = 'read -P%s %s %s' % p
+log(cmd)
+out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd,
+   nbd_uri)
+if ret != 0:
+print(out)
 
 log('')
 log('--- Testing COW ---')
@@ -173,6 +199,23 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 log(cmd)
 log(vm.hmp_qemu_io(qom_path, cmd, qdev=True))
 
+if push_backup:
+# Check that previous operations were done during backup, not after
+# If backup is already finished, it's possible that it was finished
+# even before hmp qemu_io write, and we didn't actually test
+# copy-before-write operation. This should not happen, as we use
+# speed=1. But worth checking.
+result = vm.qmp('query-block-jobs')
+assert len(result['return']) == 1
+
+result = vm.qmp('block-job-set-speed', device='push-backup', speed=0)
+assert result == {'return': {}}
+
+log(vm.event_wait(name='BLOCK_JOB_COMPLETED',
+  match={'data': {'device': 'push-backup'}}),
+filters=[iotests.filter_qmp_event])
+log(vm.qmp('blockdev-del', node_name='target'))
+
 log('')
 log('--- Verifying Data ---')
 log('')
@@ -180,8 +223,12 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 for p in patterns + zeroes:
 cmd = 'read -P%s %s %s' % p
 log(cmd)
-out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', 

[PATCH v6 11/16] block: introduce snapshot-access block driver

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
The new block driver simply utilizes snapshot-access API of underlying
block node.

In further patches we want to use it like this:

[guest]   [NBD export]
   ||
   | root   | root
   v file   v
[copy-before-write]<--[snapshot-access]
   |   |
   | file  | target
   v   v
[active-disk] [temp.img]

This way, the NBD client will be able to read the snapshotted state of the
active disk while the active disk continues to be written by the guest.
This is known as "fleecing", and it currently uses another scheme based on
a temporary qcow2 image whose backing file is the active disk. The new
scheme comes with benefits - see the next commit.

The other possible application is exporting internal snapshots of
qcow2, like this:

[guest]  [NBD export]
   |  |
   | root | root
   v   file   v
[qcow2]<-[snapshot-access]

For this, we'll need to implement snapshot-access API handlers in the
qcow2 driver, and improve the snapshot-access block driver (and API) to
make it possible to select a snapshot by name. Another thing to improve
is the size of the snapshot. For now, for simplicity, we just use the size
of bs->file, which is OK for backup, but for exporting qcow2 snapshots
we'll need to improve the snapshot-access API to get the snapshot's size.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json  |   4 +-
 include/block/block_int.h |   6 ++
 block/snapshot-access.c   | 132 ++
 MAINTAINERS   |   1 +
 block/meson.build |   1 +
 5 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 block/snapshot-access.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index ffb7aea2a5..f13b5ff942 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2914,13 +2914,14 @@
 # @blkreplay: Since 4.2
 # @compress: Since 5.0
 # @copy-before-write: Since 6.2
+# @snapshot-access: Since 7.0
 #
 # Since: 2.9
 ##
 { 'enum': 'BlockdevDriver',
   'data': [ 'blkdebug', 'blklogwrites', 'blkreplay', 'blkverify', 'bochs',
 'cloop', 'compress', 'copy-before-write', 'copy-on-read', 'dmg',
-'file', 'ftp', 'ftps', 'gluster',
+'file', 'snapshot-access', 'ftp', 'ftps', 'gluster',
 {'name': 'host_cdrom', 'if': 'HAVE_HOST_BLOCK_DEVICE' },
 {'name': 'host_device', 'if': 'HAVE_HOST_BLOCK_DEVICE' },
 'http', 'https', 'iscsi',
@@ -4267,6 +4268,7 @@
   'rbd':'BlockdevOptionsRbd',
   'replication': { 'type': 'BlockdevOptionsReplication',
'if': 'CONFIG_REPLICATION' },
+  'snapshot-access': 'BlockdevOptionsGenericFormat',
   'ssh':'BlockdevOptionsSsh',
   'throttle':   'BlockdevOptionsThrottle',
   'vdi':'BlockdevOptionsGenericFormat',
diff --git a/include/block/block_int.h b/include/block/block_int.h
index c43315ae6e..5c8ad9ed78 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -385,6 +385,12 @@ struct BlockDriver {
  * in generic block-layer: no serializing, no alignment, no tracked
  * requests. So, block-driver that realizes these APIs is fully responsible
  * for synchronization between snapshot-access API and normal IO requests.
+ *
+ * TODO: To be able to support qcow2's internal snapshots, this API will
+ * need to be extended to:
+ * - be able to select a specific snapshot
+ * - receive the snapshot's actual length (which may differ from bs's
+ *   length)
  */
 int coroutine_fn (*bdrv_co_preadv_snapshot)(BlockDriverState *bs,
 int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset);
diff --git a/block/snapshot-access.c b/block/snapshot-access.c
new file mode 100644
index 00..77b87c1946
--- /dev/null
+++ b/block/snapshot-access.c
@@ -0,0 +1,132 @@
+/*
+ * snapshot_access block driver
+ *
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "sysemu/block-backend.h"
+#include "qemu/cutils.h"
+#include "block/block_int.h"
+
+static coroutine_fn int
+snapshot_access_co_preadv_part(BlockDriverState *bs,
+   int64_t offset, int64_t bytes,
+

[PATCH v6 06/16] block: intoduce reqlist

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Split intersecting-requests functionality out of block-copy to be
reused in copy-before-write filter.

Note: while being here, fix tiny typo in MAINTAINERS.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 include/block/reqlist.h |  67 +++
 block/block-copy.c  | 116 +---
 block/reqlist.c |  76 ++
 MAINTAINERS |   4 +-
 block/meson.build   |   1 +
 5 files changed, 184 insertions(+), 80 deletions(-)
 create mode 100644 include/block/reqlist.h
 create mode 100644 block/reqlist.c

diff --git a/include/block/reqlist.h b/include/block/reqlist.h
new file mode 100644
index 00..0fa1eef259
--- /dev/null
+++ b/include/block/reqlist.h
@@ -0,0 +1,67 @@
+/*
+ * reqlist API
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Authors:
+ *  Dietmar Maurer (diet...@proxmox.com)
+ *  Vladimir Sementsov-Ogievskiy 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef REQLIST_H
+#define REQLIST_H
+
+#include "qemu/coroutine.h"
+
+/*
+ * The API is not thread-safe and shouldn't be. The struct is public to be part
+ * of other structures and protected by third-party locks, see
+ * block/block-copy.c for example.
+ */
+
+typedef struct BlockReq {
+int64_t offset;
+int64_t bytes;
+
+CoQueue wait_queue; /* coroutines blocked on this req */
+QLIST_ENTRY(BlockReq) list;
+} BlockReq;
+
+typedef QLIST_HEAD(, BlockReq) BlockReqList;
+
+/*
+ * Initialize new request and add it to the list. Caller must be sure that
+ * there are no conflicting requests in the list.
+ */
+void reqlist_init_req(BlockReqList *reqs, BlockReq *req, int64_t offset,
+  int64_t bytes);
+/* Search for request in the list intersecting with @offset/@bytes area. */
+BlockReq *reqlist_find_conflict(BlockReqList *reqs, int64_t offset,
+int64_t bytes);
+
+/*
+ * If there are no intersecting requests return false. Otherwise, wait for the
+ * first found intersecting request to finish and return true.
+ *
+ * @lock is passed to qemu_co_queue_wait()
+ * False return value proves that lock was released at no point.
+ */
+bool coroutine_fn reqlist_wait_one(BlockReqList *reqs, int64_t offset,
+   int64_t bytes, CoMutex *lock);
+
+/*
+ * Shrink request and wake all waiting coroutines (maybe some of them are not
+ * intersecting with shrunk request).
+ */
+void coroutine_fn reqlist_shrink_req(BlockReq *req, int64_t new_bytes);
+
+/*
+ * Remove request and wake all waiting coroutines. Do not release any memory.
+ */
+void coroutine_fn reqlist_remove_req(BlockReq *req);
+
+#endif /* REQLIST_H */
diff --git a/block/block-copy.c b/block/block-copy.c
index 0834e29b6e..ef948dccec 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -17,6 +17,7 @@
 #include "trace.h"
 #include "qapi/error.h"
 #include "block/block-copy.h"
+#include "block/reqlist.h"
 #include "sysemu/block-backend.h"
 #include "qemu/units.h"
 #include "qemu/coroutine.h"
@@ -83,7 +84,6 @@ typedef struct BlockCopyTask {
  */
 BlockCopyState *s;
 BlockCopyCallState *call_state;
-int64_t offset;
 /*
  * @method can also be set again in the while loop of
  * block_copy_dirty_clusters(), but it is never accessed concurrently
@@ -94,21 +94,17 @@ typedef struct BlockCopyTask {
 BlockCopyMethod method;
 
 /*
- * Fields whose state changes throughout the execution
- * Protected by lock in BlockCopyState.
+ * Generally, req is protected by lock in BlockCopyState, Still req.offset
+ * is only set on task creation, so may be read concurrently after 
creation.
+ * req.bytes is changed at most once, and need only protecting the case of
+ * parallel read while updating @bytes value in block_copy_task_shrink().
  */
-CoQueue wait_queue; /* coroutines blocked on this task */
-/*
- * Only protect the case of parallel read while updating @bytes
- * value in block_copy_task_shrink().
- */
-int64_t bytes;
-QLIST_ENTRY(BlockCopyTask) list;
+BlockReq req;
 } BlockCopyTask;
 
 static int64_t task_end(BlockCopyTask *task)
 {
-return task->offset + task->bytes;
+return task->req.offset + task->req.bytes;
 }
 
 typedef struct BlockCopyState {
@@ -136,7 +132,7 @@ typedef struct BlockCopyState {
 CoMutex lock;
 int64_t in_flight_bytes;
 BlockCopyMethod method;
-QLIST_HEAD(, BlockCopyTask) tasks; /* All tasks from all block-copy calls 
*/
+BlockReqList reqs;
 QLIST_HEAD(, BlockCopyCallState) calls;
 /*
  * skip_unallocated:
@@ -160,42 +156,6 @@ typedef struct BlockCopyState {
 RateLimit rate_limit;
 } BlockCopyState;
 
-/* Called with lock held */
-static BlockCopyTask *find_conflicting_task(BlockCopyState 

[PATCH v6 15/16] iotests/image-fleecing: add test case with bitmap

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Note that reading zero areas (not dirty in the bitmap) fails; that is
the expected behavior.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/tests/image-fleecing | 38 +++---
 tests/qemu-iotests/tests/image-fleecing.out | 84 +
 2 files changed, 113 insertions(+), 9 deletions(-)

diff --git a/tests/qemu-iotests/tests/image-fleecing 
b/tests/qemu-iotests/tests/image-fleecing
index 909fc0a7ad..c9ffa6647e 100755
--- a/tests/qemu-iotests/tests/image-fleecing
+++ b/tests/qemu-iotests/tests/image-fleecing
@@ -23,12 +23,14 @@
 # Creator/Owner: John Snow 
 
 import iotests
-from iotests import log, qemu_img, qemu_io, qemu_io_silent
+from iotests import log, qemu_img, qemu_io, qemu_io_silent, \
+qemu_io_pipe_and_status
 
 iotests.script_initialize(
-supported_fmts=['qcow2', 'qcow', 'qed', 'vmdk', 'vhdx', 'raw'],
+supported_fmts=['qcow2'],
 supported_platforms=['linux'],
 required_fmts=['copy-before-write'],
+unsupported_imgopts=['compat']
 )
 
 patterns = [('0x5d', '0', '64k'),
@@ -50,11 +52,15 @@ remainder = [('0xd5', '0x108000',  '32k'), # Right-end of 
partial-left [1]
  ('0xcd', '0x3ff', '64k')] # patterns[3]
 
 def do_test(use_cbw, use_snapshot_access_filter, base_img_path,
-fleece_img_path, nbd_sock_path, vm):
+fleece_img_path, nbd_sock_path, vm,
+bitmap=False):
 log('--- Setting up images ---')
 log('')
 
 assert qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M') == 0
+if bitmap:
+assert qemu_img('bitmap', '--add', base_img_path, 'bitmap0') == 0
+
 if use_snapshot_access_filter:
 assert use_cbw
 assert qemu_img('create', '-f', 'raw', fleece_img_path, '64M') == 0
@@ -106,12 +112,17 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 
 # Establish CBW from source to fleecing node
 if use_cbw:
-log(vm.qmp('blockdev-add', {
+fl_cbw = {
 'driver': 'copy-before-write',
 'node-name': 'fl-cbw',
 'file': src_node,
 'target': tmp_node
-}))
+}
+
+if bitmap:
+fl_cbw['bitmap'] = {'node': src_node, 'name': 'bitmap0'}
+
+log(vm.qmp('blockdev-add', fl_cbw))
 
 log(vm.qmp('qom-set', path=qom_path, property='drive', value='fl-cbw'))
 
@@ -148,7 +159,10 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 for p in patterns + zeroes:
 cmd = 'read -P%s %s %s' % p
 log(cmd)
-assert qemu_io_silent('-r', '-f', 'raw', '-c', cmd, nbd_uri) == 0
+out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd,
+   nbd_uri)
+if ret != 0:
+print(out)
 
 log('')
 log('--- Testing COW ---')
@@ -166,7 +180,10 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 for p in patterns + zeroes:
 cmd = 'read -P%s %s %s' % p
 log(cmd)
-assert qemu_io_silent('-r', '-f', 'raw', '-c', cmd, nbd_uri) == 0
+out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd,
+   nbd_uri)
+if ret != 0:
+print(out)
 
 log('')
 log('--- Cleanup ---')
@@ -201,14 +218,14 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
 log('Done')
 
 
-def test(use_cbw, use_snapshot_access_filter):
+def test(use_cbw, use_snapshot_access_filter, bitmap=False):
 with iotests.FilePath('base.img') as base_img_path, \
  iotests.FilePath('fleece.img') as fleece_img_path, \
  iotests.FilePath('nbd.sock',
   base_dir=iotests.sock_dir) as nbd_sock_path, \
  iotests.VM() as vm:
 do_test(use_cbw, use_snapshot_access_filter, base_img_path,
-fleece_img_path, nbd_sock_path, vm)
+fleece_img_path, nbd_sock_path, vm, bitmap=bitmap)
 
 
 log('=== Test backup(sync=none) based fleecing ===\n')
@@ -219,3 +236,6 @@ test(True, False)
 
 log('=== Test fleecing-format based fleecing ===\n')
 test(True, True)
+
+log('=== Test fleecing-format based fleecing with bitmap ===\n')
+test(True, True, bitmap=True)
diff --git a/tests/qemu-iotests/tests/image-fleecing.out 
b/tests/qemu-iotests/tests/image-fleecing.out
index da0af93388..62e1c1fe42 100644
--- a/tests/qemu-iotests/tests/image-fleecing.out
+++ b/tests/qemu-iotests/tests/image-fleecing.out
@@ -190,6 +190,90 @@ read -P0 0x00f8000 32k
 read -P0 0x201 32k
 read -P0 0x3fe 64k
 
+--- Cleanup ---
+
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
+{"return": {}}
+
+--- Confirming writes ---
+
+read -P0xab 0 64k
+read -P0xad 0x00f8000 64k
+read -P0x1d 0x2008000 64k
+read -P0xea 0x3fe 64k
+read -P0xd5 0x108000 32k
+read -P0xdc 32M 32k
+read -P0xcd 0x3ff 64k
+
+Done
+=== Test fleecing-format based fleecing with bitmap ===
+
+--- Setting up images ---
+
+Done
+
+--- Launching 

[PATCH v6 10/16] block/io: introduce block driver snapshot-access API

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Add new block driver handlers and corresponding generic wrappers.
They will be used to allow the copy-before-write filter to provide a
rich fleecing interface in a further commit.

In future this approach may be used to allow reading qcow2 internal
snapshots, for example to export them through NBD.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 include/block/block_int.h | 27 +++
 block/io.c| 69 +++
 2 files changed, 96 insertions(+)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 27008cfb22..c43315ae6e 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -376,6 +376,24 @@ struct BlockDriver {
  */
 void (*bdrv_cancel_in_flight)(BlockDriverState *bs);
 
+/*
+ * Snapshot-access API.
+ *
+ * Block-driver may provide snapshot-access API: special functions to 
access
+ * some internal "snapshot". The functions are similar with normal
+ * read/block_status/discard handler, but don't have any specific handling
+ * in generic block-layer: no serializing, no alignment, no tracked
+ * requests. So, block-driver that realizes these APIs is fully responsible
+ * for synchronization between snapshot-access API and normal IO requests.
+ */
+int coroutine_fn (*bdrv_co_preadv_snapshot)(BlockDriverState *bs,
+int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset);
+int coroutine_fn (*bdrv_co_snapshot_block_status)(BlockDriverState *bs,
+bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum,
+int64_t *map, BlockDriverState **file);
+int coroutine_fn (*bdrv_co_pdiscard_snapshot)(BlockDriverState *bs,
+int64_t offset, int64_t bytes);
+
 /*
  * Invalidate any cached meta-data.
  */
@@ -1078,6 +1096,15 @@ extern BlockDriver bdrv_file;
 extern BlockDriver bdrv_raw;
 extern BlockDriver bdrv_qcow2;
 
+int coroutine_fn bdrv_co_preadv_snapshot(BdrvChild *child,
+int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset);
+int coroutine_fn bdrv_co_snapshot_block_status(BlockDriverState *bs,
+bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum,
+int64_t *map, BlockDriverState **file);
+int coroutine_fn bdrv_co_pdiscard_snapshot(BlockDriverState *bs,
+int64_t offset, int64_t bytes);
+
+
 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
 int64_t offset, int64_t bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags);
diff --git a/block/io.c b/block/io.c
index 4e4cb556c5..0bcf09a491 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3587,3 +3587,72 @@ void bdrv_cancel_in_flight(BlockDriverState *bs)
 bs->drv->bdrv_cancel_in_flight(bs);
 }
 }
+
+int coroutine_fn
+bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
+QEMUIOVector *qiov, size_t qiov_offset)
+{
+BlockDriverState *bs = child->bs;
+BlockDriver *drv = bs->drv;
+int ret;
+
+if (!drv) {
+return -ENOMEDIUM;
+}
+
+if (!drv->bdrv_co_preadv_snapshot) {
+return -ENOTSUP;
+}
+
+bdrv_inc_in_flight(bs);
+ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
+bdrv_dec_in_flight(bs);
+
+return ret;
+}
+
+int coroutine_fn
+bdrv_co_snapshot_block_status(BlockDriverState *bs,
+  bool want_zero, int64_t offset, int64_t bytes,
+  int64_t *pnum, int64_t *map,
+  BlockDriverState **file)
+{
+BlockDriver *drv = bs->drv;
+int ret;
+
+if (!drv) {
+return -ENOMEDIUM;
+}
+
+if (!drv->bdrv_co_snapshot_block_status) {
+return -ENOTSUP;
+}
+
+bdrv_inc_in_flight(bs);
+ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
+ pnum, map, file);
+bdrv_dec_in_flight(bs);
+
+return ret;
+}
+
+int coroutine_fn
+bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
+{
+BlockDriver *drv = bs->drv;
+int ret;
+
+if (!drv) {
+return -ENOMEDIUM;
+}
+
+if (!drv->bdrv_co_pdiscard_snapshot) {
+return -ENOTSUP;
+}
+
+bdrv_inc_in_flight(bs);
+ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
+bdrv_dec_in_flight(bs);
+
+return ret;
+}
-- 
2.31.1




[PATCH v6 05/16] block/block-copy: add block_copy_reset()

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Split block_copy_reset() out of block_copy_reset_unallocated() to be
used separately later.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 include/block/block-copy.h |  1 +
 block/block-copy.c | 21 +
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index b80ad02299..68bbd344b2 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -35,6 +35,7 @@ void block_copy_set_progress_meter(BlockCopyState *s, 
ProgressMeter *pm);
 
 void block_copy_state_free(BlockCopyState *s);
 
+void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes);
 int64_t block_copy_reset_unallocated(BlockCopyState *s,
  int64_t offset, int64_t *count);
 
diff --git a/block/block-copy.c b/block/block-copy.c
index 8aa6ee6a5c..0834e29b6e 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -692,6 +692,18 @@ static int block_copy_is_cluster_allocated(BlockCopyState 
*s, int64_t offset,
 }
 }
 
+void block_copy_reset(BlockCopyState *s, int64_t offset, int64_t bytes)
+{
+QEMU_LOCK_GUARD(&s->lock); /* guards the bitmap and progress update */
+
+bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
+if (s->progress) {
+progress_set_remaining(s->progress,
+   bdrv_get_dirty_count(s->copy_bitmap) +
+   s->in_flight_bytes);
+}
+}
+
 /*
  * Reset bits in copy_bitmap starting at offset if they represent unallocated
  * data in the image. May reset subsequent contiguous bits.
@@ -712,14 +724,7 @@ int64_t block_copy_reset_unallocated(BlockCopyState *s,
 bytes = clusters * s->cluster_size;
 
 if (!ret) {
-qemu_co_mutex_lock(&s->lock);
-bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
-if (s->progress) {
-progress_set_remaining(s->progress,
-   bdrv_get_dirty_count(s->copy_bitmap) +
-   s->in_flight_bytes);
-}
-qemu_co_mutex_unlock(&s->lock);
+block_copy_reset(s, offset, bytes);
 }
 
 *count = bytes;
-- 
2.31.1




[PATCH v6 07/16] block/reqlist: reqlist_find_conflict(): use ranges_overlap()

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Let's reuse convenient helper.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 block/reqlist.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/reqlist.c b/block/reqlist.c
index 5e320ba649..09fecbd48c 100644
--- a/block/reqlist.c
+++ b/block/reqlist.c
@@ -13,6 +13,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/range.h"
 
 #include "block/reqlist.h"
 
@@ -35,7 +36,7 @@ BlockReq *reqlist_find_conflict(BlockReqList *reqs, int64_t 
offset,
 BlockReq *r;
 
 QLIST_FOREACH(r, reqs, list) {
-if (offset + bytes > r->offset && offset < r->offset + r->bytes) {
+if (ranges_overlap(offset, bytes, r->offset, r->bytes)) {
 return r;
 }
 }
-- 
2.31.1




[PATCH v6 14/16] iotests.py: add qemu_io_pipe_and_status()

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Add helper that returns both status and output, to be used in the
following commit

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/iotests.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 6ba65eb1ff..bb071e5669 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -278,6 +278,9 @@ def qemu_io(*args):
 '''Run qemu-io and return the stdout data'''
 return qemu_tool_pipe_and_status('qemu-io', qemu_io_wrap_args(args))[0]
 
+def qemu_io_pipe_and_status(*args):
+return qemu_tool_pipe_and_status('qemu-io', qemu_io_wrap_args(args))
+
 def qemu_io_log(*args):
 result = qemu_io(*args)
 log(result, filters=[filter_testfiles, filter_qemu_io])
-- 
2.31.1




[PATCH v6 08/16] block/dirty-bitmap: introduce bdrv_dirty_bitmap_status()

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Add a convenient function, similar to bdrv_block_status(), to get the
status of a dirty bitmap.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 include/block/dirty-bitmap.h |  2 ++
 include/qemu/hbitmap.h   | 12 
 block/dirty-bitmap.c |  6 ++
 util/hbitmap.c   | 33 +
 4 files changed, 53 insertions(+)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index f95d350b70..6528336c4c 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -115,6 +115,8 @@ int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap 
*bitmap, int64_t offset,
 bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap *bitmap,
 int64_t start, int64_t end, int64_t max_dirty_count,
 int64_t *dirty_start, int64_t *dirty_count);
+bool bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap, int64_t offset,
+  int64_t bytes, int64_t *count);
 BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
   Error **errp);
 
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 5e71b6d6f7..5bd986aa44 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -340,6 +340,18 @@ bool hbitmap_next_dirty_area(const HBitmap *hb, int64_t 
start, int64_t end,
  int64_t max_dirty_count,
  int64_t *dirty_start, int64_t *dirty_count);
 
+/*
+ * hbitmap_status:
+ * @hb: The HBitmap to operate on
+ * @start: The bit to start from
+ * @count: Number of bits to process
+ * @pnum: Out-parameter. How many bits have the same value starting from @start
+ *
+ * Returns true if bitmap is dirty at @start, false otherwise.
+ */
+bool hbitmap_status(const HBitmap *hb, int64_t start, int64_t count,
+int64_t *pnum);
+
 /**
  * hbitmap_iter_next:
  * @hbi: HBitmapIter to operate on.
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 94a0276833..08d56845ad 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -875,6 +875,12 @@ bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap 
*bitmap,
dirty_start, dirty_count);
 }
 
+bool bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap, int64_t offset,
+  int64_t bytes, int64_t *count)
+{
+return hbitmap_status(bitmap->bitmap, offset, bytes, count);
+}
+
 /**
  * bdrv_merge_dirty_bitmap: merge src into dest.
  * Ensures permissions on bitmaps are reasonable; use for public API.
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 305b894a63..dd0501d9a7 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -301,6 +301,39 @@ bool hbitmap_next_dirty_area(const HBitmap *hb, int64_t 
start, int64_t end,
 return true;
 }
 
+bool hbitmap_status(const HBitmap *hb, int64_t start, int64_t count,
+int64_t *pnum)
+{
+int64_t next_dirty, next_zero;
+
+assert(start >= 0);
+assert(count > 0);
+assert(start + count <= hb->orig_size);
+
+next_dirty = hbitmap_next_dirty(hb, start, count);
+if (next_dirty == -1) {
+*pnum = count; /* no dirty bit in the whole range */
+return false;
+}
+
+if (next_dirty > start) {
+*pnum = next_dirty - start; /* clean run up to the first dirty bit */
+return false;
+}
+
+assert(next_dirty == start);
+
+next_zero = hbitmap_next_zero(hb, start, count);
+if (next_zero == -1) {
+*pnum = count; /* the whole range is dirty */
+return true;
+}
+
+assert(next_zero > start);
+*pnum = next_zero - start; /* dirty run up to the first clean bit */
+return true; /* @start is dirty here, so report dirty, not clean */
+}
+
 bool hbitmap_empty(const HBitmap *hb)
 {
 return hb->count == 0;
-- 
2.31.1




[PATCH v6 12/16] block: copy-before-write: realize snapshot-access API

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Current scheme of image fleecing looks like this:

[guest][NBD export]
  |  |
  |root  | root
  v  v
[copy-before-write] -> [temp.qcow2]
  | target  |
  |file |backing
  v |
[active disk] <-+

 - On guest writes, the copy-before-write filter copies old data from the
   active disk to temp.qcow2. So the fleecing client (NBD export) reads
   changed regions from the temp.qcow2 image and unchanged regions from
   the active disk through the backing link.

This patch makes possible new image fleecing scheme:

[guest]   [NBD export]
   ||
   | root   | root
   v file   v
[copy-before-write]<--[snapshot-access]
   |   |
   | file  | target
   v   v
[active-disk] [temp.img]

 - copy-before-write does CBW operations and also provides
   snapshot-access API. The API may be accessed through
   snapshot-access driver.

Benefits of new scheme:

1. Access control: if a remote client tries to read data that is not
   covered by the original dirty bitmap used on copy-before-write open,
   the client gets -EACCES.

2. Discard support: if a remote client does a DISCARD, then in addition to
   discarding data in temp.img this informs the block-copy process not to
   copy these clusters. The next read from a discarded area will return
   -EACCES. This is significant: once the fleecing user has read data that
   was not yet copied to temp.img, we can avoid copying it on a further
   guest write.

3. Synchronisation between client reads and block-copy write is more
   efficient. In old scheme we just rely on BDRV_REQ_SERIALISING flag
   used for writes to temp.qcow2. New scheme is less blocking:
 - fleecing reads are never blocked: if data region is untouched or
   in-flight, we just read from active-disk, otherwise we read from
   temp.img
 - writes to temp.img are not blocked by fleecing reads
 - still, guest writes of-course are blocked by in-flight fleecing
   reads, that currently read from active-disk - it's the minimum
   necessary blocking

4. Temporary image may be of any format, as we don't rely on backing
   feature.

5. Permission relations are simplified. With the old scheme we have to
   share write permission on the target child of copy-before-write,
   otherwise the backing link conflicts with the write permissions of the
   copy-before-write file child. With the new scheme we don't have a
   backing link, and the copy-before-write node may have unshared access
   to the temporary node. (Not realized in this commit, will be in future).

6. Having control over fleecing reads, we'll be able to implement
   alternative behavior on failed copy-before-write operations.
   Currently we just break the guest request (that's the historical
   behavior of backup). But in some scenarios that is bad behavior: it is
   better to drop the backup as failed but not break the guest request.
   With the new scheme we can simply unset some bits in a bitmap on CBW
   failure, and further fleecing reads will fail with -EACCES, or
   something like this. (Not implemented in this commit, will be in future)
   An additional application for this is implementing a timeout for CBW
   operations.

Iotest 257 output is updated, as two more bitmaps now live in
copy-before-write filter.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-before-write.c  | 212 ++-
 tests/qemu-iotests/257.out | 224 +
 2 files changed, 435 insertions(+), 1 deletion(-)

diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 91a2288b66..0b6d26605c 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -33,12 +33,37 @@
 #include "block/block-copy.h"
 
 #include "block/copy-before-write.h"
+#include "block/reqlist.h"
 
 #include "qapi/qapi-visit-block-core.h"
 
 typedef struct BDRVCopyBeforeWriteState {
 BlockCopyState *bcs;
 BdrvChild *target;
+
+/*
+ * @lock: protects access to @access_bitmap, @done_bitmap and
+ * @frozen_read_reqs
+ */
+CoMutex lock;
+
+/*
+ * @access_bitmap: represents areas allowed for reading by fleecing user.
+ * Reading from non-dirty areas leads to -EACCES.
+ */
+BdrvDirtyBitmap *access_bitmap;
+
+/*
+ * @done_bitmap: represents areas that was successfully copied to @target 
by
+ * copy-before-write operations.
+ */
+BdrvDirtyBitmap *done_bitmap;
+
+/*
+ * @frozen_read_reqs: current read requests for fleecing user in bs->file
+ * node. These areas must not be rewritten by guest.
+ */
+BlockReqList frozen_read_reqs;
 } BDRVCopyBeforeWriteState;
 
 static coroutine_fn int cbw_co_preadv(
@@ -48,10 +73,20 @@ static coroutine_fn int cbw_co_preadv(
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
+/*
+ * Do copy-before-write operation.
+ *
+ * On failure 

[PATCH v6 13/16] iotests/image-fleecing: add test-case for fleecing format node

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 tests/qemu-iotests/tests/image-fleecing | 64 -
 tests/qemu-iotests/tests/image-fleecing.out | 76 -
 2 files changed, 120 insertions(+), 20 deletions(-)

diff --git a/tests/qemu-iotests/tests/image-fleecing 
b/tests/qemu-iotests/tests/image-fleecing
index a58b5a1781..909fc0a7ad 100755
--- a/tests/qemu-iotests/tests/image-fleecing
+++ b/tests/qemu-iotests/tests/image-fleecing
@@ -49,12 +49,17 @@ remainder = [('0xd5', '0x108000',  '32k'), # Right-end of 
partial-left [1]
  ('0xdc', '32M',   '32k'), # Left-end of partial-right [2]
  ('0xcd', '0x3ff', '64k')] # patterns[3]
 
-def do_test(use_cbw, base_img_path, fleece_img_path, nbd_sock_path, vm):
+def do_test(use_cbw, use_snapshot_access_filter, base_img_path,
+fleece_img_path, nbd_sock_path, vm):
 log('--- Setting up images ---')
 log('')
 
 assert qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M') == 0
-assert qemu_img('create', '-f', 'qcow2', fleece_img_path, '64M') == 0
+if use_snapshot_access_filter:
+assert use_cbw
+assert qemu_img('create', '-f', 'raw', fleece_img_path, '64M') == 0
+else:
+assert qemu_img('create', '-f', 'qcow2', fleece_img_path, '64M') == 0
 
 for p in patterns:
 qemu_io('-f', iotests.imgfmt,
@@ -81,16 +86,23 @@ def do_test(use_cbw, base_img_path, fleece_img_path, 
nbd_sock_path, vm):
 log('')
 
 
-# create tmp_node backed by src_node
-log(vm.qmp('blockdev-add', {
-'driver': 'qcow2',
-'node-name': tmp_node,
-'file': {
+if use_snapshot_access_filter:
+log(vm.qmp('blockdev-add', {
+'node-name': tmp_node,
 'driver': 'file',
 'filename': fleece_img_path,
-},
-'backing': src_node,
-}))
+}))
+else:
+# create tmp_node backed by src_node
+log(vm.qmp('blockdev-add', {
+'driver': 'qcow2',
+'node-name': tmp_node,
+'file': {
+'driver': 'file',
+'filename': fleece_img_path,
+},
+'backing': src_node,
+}))
 
 # Establish CBW from source to fleecing node
 if use_cbw:
@@ -102,6 +114,13 @@ def do_test(use_cbw, base_img_path, fleece_img_path, 
nbd_sock_path, vm):
 }))
 
 log(vm.qmp('qom-set', path=qom_path, property='drive', value='fl-cbw'))
+
+if use_snapshot_access_filter:
+log(vm.qmp('blockdev-add', {
+'driver': 'snapshot-access',
+'node-name': 'fl-access',
+'file': 'fl-cbw',
+}))
 else:
 log(vm.qmp('blockdev-backup',
job_id='fleecing',
@@ -109,16 +128,18 @@ def do_test(use_cbw, base_img_path, fleece_img_path, 
nbd_sock_path, vm):
target=tmp_node,
sync='none'))
 
+export_node = 'fl-access' if use_snapshot_access_filter else tmp_node
+
 log('')
 log('--- Setting up NBD Export ---')
 log('')
 
-nbd_uri = 'nbd+unix:///%s?socket=%s' % (tmp_node, nbd_sock_path)
+nbd_uri = 'nbd+unix:///%s?socket=%s' % (export_node, nbd_sock_path)
 log(vm.qmp('nbd-server-start',
{'addr': {'type': 'unix',
  'data': {'path': nbd_sock_path}}}))
 
-log(vm.qmp('nbd-server-add', device=tmp_node))
+log(vm.qmp('nbd-server-add', device=export_node))
 
 log('')
 log('--- Sanity Check ---')
@@ -151,7 +172,11 @@ def do_test(use_cbw, base_img_path, fleece_img_path, 
nbd_sock_path, vm):
 log('--- Cleanup ---')
 log('')
 
+log(vm.qmp('nbd-server-stop'))
+
 if use_cbw:
+if use_snapshot_access_filter:
+log(vm.qmp('blockdev-del', node_name='fl-access'))
 log(vm.qmp('qom-set', path=qom_path, property='drive', value=src_node))
 log(vm.qmp('blockdev-del', node_name='fl-cbw'))
 else:
@@ -160,7 +185,6 @@ def do_test(use_cbw, base_img_path, fleece_img_path, 
nbd_sock_path, vm):
 assert e is not None
 log(e, filters=[iotests.filter_qmp_event])
 
-log(vm.qmp('nbd-server-stop'))
 log(vm.qmp('blockdev-del', node_name=tmp_node))
 vm.shutdown()
 
@@ -177,17 +201,21 @@ def do_test(use_cbw, base_img_path, fleece_img_path, 
nbd_sock_path, vm):
 log('Done')
 
 
-def test(use_cbw):
+def test(use_cbw, use_snapshot_access_filter):
 with iotests.FilePath('base.img') as base_img_path, \
  iotests.FilePath('fleece.img') as fleece_img_path, \
  iotests.FilePath('nbd.sock',
   base_dir=iotests.sock_dir) as nbd_sock_path, \
  iotests.VM() as vm:
-do_test(use_cbw, base_img_path, fleece_img_path, nbd_sock_path, vm)
+do_test(use_cbw, use_snapshot_access_filter, base_img_path,
+fleece_img_path, nbd_sock_path, vm)
 
 
 log('=== Test 

[PATCH v6 04/16] block/copy-before-write: add bitmap open parameter

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
This brings "incremental" mode to copy-before-write filter: user can
specify bitmap so that filter will copy only "dirty" areas.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json  | 10 +++-
 block/copy-before-write.c | 51 ++-
 2 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 9a5a3641d0..ffb7aea2a5 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4171,11 +4171,19 @@
 #
 # @target: The target for copy-before-write operations.
 #
+# @bitmap: If specified, copy-before-write filter will do
+#  copy-before-write operations only for dirty regions of the
+#  bitmap. Bitmap size must be equal to length of file and
+#  target child of the filter. Note also, that bitmap is used
+#  only to initialize internal bitmap of the process, so further
+#  modifications (or removing) of specified bitmap doesn't
+#  influence the filter. (Since 7.0)
+#
 # Since: 6.2
 ##
 { 'struct': 'BlockdevOptionsCbw',
   'base': 'BlockdevOptionsGenericFormat',
-  'data': { 'target': 'BlockdevRef' } }
+  'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap' } }
 
 ##
 # @BlockdevOptions:
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 799223e3fb..91a2288b66 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -34,6 +34,8 @@
 
 #include "block/copy-before-write.h"
 
+#include "qapi/qapi-visit-block-core.h"
+
 typedef struct BDRVCopyBeforeWriteState {
 BlockCopyState *bcs;
 BdrvChild *target;
@@ -145,10 +147,53 @@ static void cbw_child_perm(BlockDriverState *bs, 
BdrvChild *c,
 }
 }
 
+static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap,
+Error **errp)
+{
+QDict *bitmap_qdict = NULL;
+BlockDirtyBitmap *bmp_param = NULL;
+Visitor *v = NULL;
+bool ret = false;
+
+*bitmap = NULL;
+
+qdict_extract_subqdict(options, &bitmap_qdict, "bitmap.");
+if (!qdict_size(bitmap_qdict)) {
+ret = true;
+goto out;
+}
+
+v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp);
+if (!v) {
+goto out;
+}
+
+visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp);
+if (!bmp_param) {
+goto out;
+}
+
+*bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL,
+errp);
+if (!*bitmap) {
+goto out;
+}
+
+ret = true;
+
+out:
+qapi_free_BlockDirtyBitmap(bmp_param);
+visit_free(v);
+qobject_unref(bitmap_qdict);
+
+return ret;
+}
+
 static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
 Error **errp)
 {
 BDRVCopyBeforeWriteState *s = bs->opaque;
+BdrvDirtyBitmap *bitmap = NULL;
 
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
@@ -163,6 +208,10 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 return -EINVAL;
 }
 
+if (!cbw_parse_bitmap_option(options, &bitmap, errp)) {
+return -EINVAL;
+}
+
 bs->total_sectors = bs->file->bs->total_sectors;
 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
@@ -170,7 +219,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
  bs->file->bs->supported_zero_flags);
 
-s->bcs = block_copy_state_new(bs->file, s->target, NULL, errp);
+s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
 if (!s->bcs) {
 error_prepend(errp, "Cannot create block-copy-state: ");
 return -EINVAL;
-- 
2.31.1




[PATCH v6 03/16] block/block-copy: block_copy_state_new(): add bitmap parameter

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
This will be used in the following commit to bring "incremental" mode
to copy-before-write filter.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 include/block/block-copy.h |  1 +
 block/block-copy.c | 14 +-
 block/copy-before-write.c  |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/block/block-copy.h b/include/block/block-copy.h
index 99370fa38b..b80ad02299 100644
--- a/include/block/block-copy.h
+++ b/include/block/block-copy.h
@@ -25,6 +25,7 @@ typedef struct BlockCopyState BlockCopyState;
 typedef struct BlockCopyCallState BlockCopyCallState;
 
 BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ const BdrvDirtyBitmap *bitmap,
  Error **errp);
 
 /* Function should be called prior any actual copy request */
diff --git a/block/block-copy.c b/block/block-copy.c
index abda7a80bd..8aa6ee6a5c 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -384,8 +384,10 @@ static int64_t 
block_copy_calculate_cluster_size(BlockDriverState *target,
 }
 
 BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ const BdrvDirtyBitmap *bitmap,
  Error **errp)
 {
+ERRP_GUARD();
 BlockCopyState *s;
 int64_t cluster_size;
 BdrvDirtyBitmap *copy_bitmap;
@@ -402,7 +404,17 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, 
BdrvChild *target,
 return NULL;
 }
 bdrv_disable_dirty_bitmap(copy_bitmap);
-bdrv_set_dirty_bitmap(copy_bitmap, 0, bdrv_dirty_bitmap_size(copy_bitmap));
+if (bitmap) {
+if (!bdrv_merge_dirty_bitmap(copy_bitmap, bitmap, NULL, errp)) {
+error_prepend(errp, "Failed to merge bitmap '%s' to internal "
+  "copy-bitmap: ", bdrv_dirty_bitmap_name(bitmap));
+bdrv_release_dirty_bitmap(copy_bitmap);
+return NULL;
+}
+} else {
+bdrv_set_dirty_bitmap(copy_bitmap, 0,
+  bdrv_dirty_bitmap_size(copy_bitmap));
+}
 
 /*
  * If source is in backing chain of target assume that target is going to 
be
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index 5bdaf0a9d9..799223e3fb 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -170,7 +170,7 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
  bs->file->bs->supported_zero_flags);
 
-s->bcs = block_copy_state_new(bs->file, s->target, errp);
+s->bcs = block_copy_state_new(bs->file, s->target, NULL, errp);
 if (!s->bcs) {
 error_prepend(errp, "Cannot create block-copy-state: ");
 return -EINVAL;
-- 
2.31.1




[PATCH v6 02/16] block/dirty-bitmap: bdrv_merge_dirty_bitmap(): add return value

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
That simplifies handling failure in existing code and in further new
usage of bdrv_merge_dirty_bitmap().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 include/block/dirty-bitmap.h| 2 +-
 block/dirty-bitmap.c| 9 +++--
 block/monitor/bitmap-qmp-cmds.c | 5 +
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 40950ae3d5..f95d350b70 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -77,7 +77,7 @@ void bdrv_dirty_bitmap_set_persistence(BdrvDirtyBitmap 
*bitmap,
bool persistent);
 void bdrv_dirty_bitmap_set_inconsistent(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_set_busy(BdrvDirtyBitmap *bitmap, bool busy);
-void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
+bool bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
  HBitmap **backup, Error **errp);
 void bdrv_dirty_bitmap_skip_store(BdrvDirtyBitmap *bitmap, bool skip);
 bool bdrv_dirty_bitmap_get(BdrvDirtyBitmap *bitmap, int64_t offset);
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 0ef46163e3..94a0276833 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -880,11 +880,14 @@ bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap 
*bitmap,
  * Ensures permissions on bitmaps are reasonable; use for public API.
  *
  * @backup: If provided, make a copy of dest here prior to merge.
+ *
+ * Returns true on success, false on failure. In case of failure bitmaps are
+ * untouched.
  */
-void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
+bool bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
  HBitmap **backup, Error **errp)
 {
-bool ret;
+bool ret = false;
 
 bdrv_dirty_bitmaps_lock(dest->bs);
 if (src->bs != dest->bs) {
@@ -912,6 +915,8 @@ out:
 if (src->bs != dest->bs) {
 bdrv_dirty_bitmaps_unlock(src->bs);
 }
+
+return ret;
 }
 
 /**
diff --git a/block/monitor/bitmap-qmp-cmds.c b/block/monitor/bitmap-qmp-cmds.c
index 9f11deec64..83970b22fa 100644
--- a/block/monitor/bitmap-qmp-cmds.c
+++ b/block/monitor/bitmap-qmp-cmds.c
@@ -259,7 +259,6 @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, 
const char *target,
 BlockDriverState *bs;
 BdrvDirtyBitmap *dst, *src, *anon;
 BlockDirtyBitmapMergeSourceList *lst;
-Error *local_err = NULL;
 
 dst = block_dirty_bitmap_lookup(node, target, &bs, errp);
 if (!dst) {
@@ -297,9 +296,7 @@ BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, 
const char *target,
 abort();
 }
 
-bdrv_merge_dirty_bitmap(anon, src, NULL, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
+if (!bdrv_merge_dirty_bitmap(anon, src, NULL, errp)) {
 dst = NULL;
 goto out;
 }
-- 
2.31.1




[PATCH v6 09/16] block/reqlist: add reqlist_wait_all()

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
Add function to wait for all intersecting requests.
To be used in the further commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Nikita Lapshin 
Reviewed-by: Hanna Reitz 
---
 include/block/reqlist.h | 8 
 block/reqlist.c | 8 
 2 files changed, 16 insertions(+)

diff --git a/include/block/reqlist.h b/include/block/reqlist.h
index 0fa1eef259..5253497bae 100644
--- a/include/block/reqlist.h
+++ b/include/block/reqlist.h
@@ -53,6 +53,14 @@ BlockReq *reqlist_find_conflict(BlockReqList *reqs, int64_t 
offset,
 bool coroutine_fn reqlist_wait_one(BlockReqList *reqs, int64_t offset,
int64_t bytes, CoMutex *lock);
 
+/*
+ * Wait for all intersecting requests. It just calls reqlist_wait_one() in a
+ * loop, caller is responsible to stop producing new requests in this region
+ * in parallel, otherwise reqlist_wait_all() may never return.
+ */
+void coroutine_fn reqlist_wait_all(BlockReqList *reqs, int64_t offset,
+   int64_t bytes, CoMutex *lock);
+
 /*
  * Shrink request and wake all waiting coroutines (maybe some of them are not
  * intersecting with shrunk request).
diff --git a/block/reqlist.c b/block/reqlist.c
index 09fecbd48c..08cb57cfa4 100644
--- a/block/reqlist.c
+++ b/block/reqlist.c
@@ -58,6 +58,14 @@ bool coroutine_fn reqlist_wait_one(BlockReqList *reqs, 
int64_t offset,
 return true;
 }
 
+void coroutine_fn reqlist_wait_all(BlockReqList *reqs, int64_t offset,
+   int64_t bytes, CoMutex *lock)
+{
+while (reqlist_wait_one(reqs, offset, bytes, lock)) {
+/* continue */
+}
+}
+
 void coroutine_fn reqlist_shrink_req(BlockReq *req, int64_t new_bytes)
 {
 if (new_bytes == req->bytes) {
-- 
2.31.1




[PATCH v6 01/16] block/block-copy: move copy_bitmap initialization to block_copy_state_new()

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
We are going to complicate bitmap initialization in the further
commit. And in future, backup job will be able to work without filter
(when source is immutable), so we'll need same bitmap initialization in
copy-before-write filter and in backup job. So, it's reasonable to do
it in block-copy.

Note that for now cbw_open() is the only caller of
block_copy_state_new().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Hanna Reitz 
---
 block/block-copy.c| 1 +
 block/copy-before-write.c | 4 
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index ce116318b5..abda7a80bd 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -402,6 +402,7 @@ BlockCopyState *block_copy_state_new(BdrvChild *source, 
BdrvChild *target,
 return NULL;
 }
 bdrv_disable_dirty_bitmap(copy_bitmap);
+bdrv_set_dirty_bitmap(copy_bitmap, 0, bdrv_dirty_bitmap_size(copy_bitmap));
 
 /*
  * If source is in backing chain of target assume that target is going to 
be
diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index c30a5ff8de..5bdaf0a9d9 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -149,7 +149,6 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 Error **errp)
 {
 BDRVCopyBeforeWriteState *s = bs->opaque;
-BdrvDirtyBitmap *copy_bitmap;
 
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
@@ -177,9 +176,6 @@ static int cbw_open(BlockDriverState *bs, QDict *options, 
int flags,
 return -EINVAL;
 }
 
-copy_bitmap = block_copy_dirty_bitmap(s->bcs);
-bdrv_set_dirty_bitmap(copy_bitmap, 0, bdrv_dirty_bitmap_size(copy_bitmap));
-
 return 0;
 }
 
-- 
2.31.1




[PATCH v6 00/16] Make image fleecing more usable

2022-03-03 Thread Vladimir Sementsov-Ogievskiy
v6:
11: add comment
15: limit to qcow2 with unsupported compat
fix style
16: fix style
change log('Backup finished ...') to assertion and comment

Vladimir Sementsov-Ogievskiy (16):
  block/block-copy: move copy_bitmap initialization to
block_copy_state_new()
  block/dirty-bitmap: bdrv_merge_dirty_bitmap(): add return value
  block/block-copy: block_copy_state_new(): add bitmap parameter
  block/copy-before-write: add bitmap open parameter
  block/block-copy: add block_copy_reset()
  block: intoduce reqlist
  block/reqlist: reqlist_find_conflict(): use ranges_overlap()
  block/dirty-bitmap: introduce bdrv_dirty_bitmap_status()
  block/reqlist: add reqlist_wait_all()
  block/io: introduce block driver snapshot-access API
  block: introduce snapshot-access block driver
  block: copy-before-write: realize snapshot-access API
  iotests/image-fleecing: add test-case for fleecing format node
  iotests.py: add qemu_io_pipe_and_status()
  iotests/image-fleecing: add test case with bitmap
  iotests/image-fleecing: test push backup with fleecing

 qapi/block-core.json|  14 +-
 include/block/block-copy.h  |   2 +
 include/block/block_int.h   |  33 +++
 include/block/dirty-bitmap.h|   4 +-
 include/block/reqlist.h |  75 ++
 include/qemu/hbitmap.h  |  12 +
 block/block-copy.c  | 150 +--
 block/copy-before-write.c   | 265 +++-
 block/dirty-bitmap.c|  15 +-
 block/io.c  |  69 +
 block/monitor/bitmap-qmp-cmds.c |   5 +-
 block/reqlist.c |  85 +++
 block/snapshot-access.c | 132 ++
 util/hbitmap.c  |  33 +++
 MAINTAINERS |   5 +-
 block/meson.build   |   2 +
 tests/qemu-iotests/257.out  | 224 +
 tests/qemu-iotests/iotests.py   |   3 +
 tests/qemu-iotests/tests/image-fleecing | 185 +++---
 tests/qemu-iotests/tests/image-fleecing.out | 223 +++-
 20 files changed, 1394 insertions(+), 142 deletions(-)
 create mode 100644 include/block/reqlist.h
 create mode 100644 block/reqlist.c
 create mode 100644 block/snapshot-access.c

-- 
2.31.1




Re: [PATCH v5 16/16] iotests/image-fleecing: test push backup with fleecing

2022-03-03 Thread Vladimir Sementsov-Ogievskiy

03.03.2022 13:58, Hanna Reitz wrote:

On 28.02.22 12:39, Vladimir Sementsov-Ogievskiy wrote:

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  tests/qemu-iotests/tests/image-fleecing | 120 ++--
  tests/qemu-iotests/tests/image-fleecing.out |  63 ++
  2 files changed, 151 insertions(+), 32 deletions(-)

diff --git a/tests/qemu-iotests/tests/image-fleecing 
b/tests/qemu-iotests/tests/image-fleecing
index 33995612be..89c79af698 100755
--- a/tests/qemu-iotests/tests/image-fleecing
+++ b/tests/qemu-iotests/tests/image-fleecing


[...]


@@ -170,6 +196,20 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
  log(cmd)
  log(vm.hmp_qemu_io(qom_path, cmd, qdev=True))
+    if push_backup:
+    # Check that previous operations were done during backup, not after
+    result = vm.qmp('query-block-jobs')
+    if len(result['return']) != 1:
+    log('Backup finished too fast, COW is not tested')


I don’t understand why this log is here, its message sounds like “case not 
run”, but first this logged message will make the whole test fail...


This log means that test doesn't test what it should. If that happens, we'll 
need to adjust disk size, backup speed, or something like this. I hope, that 
will not happen, at least it works for me )




+
+    result = vm.qmp('block-job-set-speed', device='push-backup', speed=0)
+    assert result == {'return': {}}


...and then this will fail, too.

Either this is a hard failure, then the log shouldn’t include “COW is not 
tested” (because it is tested, and the case has failed); or it’s a casenotrun, 
and then nothing should be logged (the message should be appended to 
.casenotrun), and the block-job-set-speed call and waiting for 
BLOCK_JOB_COMPLETED should only be done when the job is still in the job list.


OK, I understand. What about this:

# Check that backup is not finished yet. If it is, it's possible that backup
# finished even before guest write, and we didn't actually test
# copy-before-write operation. If this happens, we'll need to adjust storage
# size or backup speed or something like this.
assert len(result['return']) == 1




+
+    log(vm.event_wait(name='BLOCK_JOB_COMPLETED',
+  match={'data': {'device': 'push-backup'}}),
+  filters=[iotests.filter_qmp_event])
+    log(vm.qmp('blockdev-del', node_name='target'))
+
  log('')
  log('--- Verifying Data ---')
  log('')
@@ -177,15 +217,19 @@ def do_test(use_cbw, use_snapshot_access_filter, 
base_img_path,
  for p in patterns + zeroes:
  cmd = 'read -P%s %s %s' % p
  log(cmd)
-    out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd, 
nbd_uri)
-    if ret != 0:
-    print(out)
+    if push_backup:
+    assert qemu_io_silent('-r', '-c', cmd, target_img_path) == 0
+    else:
+    out, ret = qemu_io_pipe_and_status('-r', '-f', 'raw', '-c', cmd, 
nbd_uri)
+    if ret != 0:
+    print(out)


The existing principle of “print qemu-io’s output on error” seemed perfectly 
fine to me.  Why not continue using it?

(e.g. like

args = ['-r', '-c', cmd]
if push_backup:
     args += [target_img_path]
else:
     args += ['-f', 'raw', nbd_uri]
out, ret = qemu_io_pipe_and_status(*args)

)


I don't remember why I changed it. Your variant seems good.




  log('')
  log('--- Cleanup ---')
  log('')
-    log(vm.qmp('nbd-server-stop'))
+    if not push_backup:
+    log(vm.qmp('nbd-server-stop'))
  if use_cbw:
  if use_snapshot_access_filter:
+read -P0xcd 0x3ff 64k
+
+Done





--
Best regards,
Vladimir



Re: [PATCH v5 11/16] block: introduce snapshot-access block driver

2022-03-03 Thread Vladimir Sementsov-Ogievskiy

03.03.2022 14:11, Hanna Reitz wrote:

On 03.03.22 12:05, Hanna Reitz wrote:

On 28.02.22 12:39, Vladimir Sementsov-Ogievskiy wrote:

The new block driver simply utilizes snapshot-access API of underlying
block node.

In further patches we want to use it like this:

[guest]                   [NBD export]
   |                            |
   | root                       | root
   v                 file       v
[copy-before-write]<------[snapshot-access]
   |           |
   | file      | target
   v           v
[active-disk] [temp.img]

This way, the NBD client will be able to read the snapshotted state of the
active disk while the active disk continues to be written by the guest. This
is known as "fleecing", and currently uses another scheme based on a qcow2
temporary image whose backing file is active-disk. The new scheme comes
with benefits - see next commit.

The other possible application is exporting internal snapshots of
qcow2, like this:

[guest]          [NBD export]
   |                  |
   | root             | root
   v       file       v
[qcow2]<---------[snapshot-access]

For this, we'll need to implement snapshot-access API handlers in
qcow2 driver, and improve snapshot-access block driver (and API) to
make it possible to select snapshot by name. Another thing to improve
is size of snapshot. Now for simplicity we just use size of bs->file,
which is OK for backup, but for qcow2 snapshots export we'll need to
improve snapshot-access API to get size of snapshot.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  qapi/block-core.json    |   4 +-
  block/snapshot-access.c | 132 
  MAINTAINERS |   1 +
  block/meson.build   |   1 +
  4 files changed, 137 insertions(+), 1 deletion(-)
  create mode 100644 block/snapshot-access.c


[...]


diff --git a/block/snapshot-access.c b/block/snapshot-access.c
new file mode 100644
index 00..77b87c1946
--- /dev/null
+++ b/block/snapshot-access.c


[...]


+static int snapshot_access_open(BlockDriverState *bs, QDict *options, int 
flags,
+    Error **errp)
+{
+    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
+   BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY,
+   false, errp);
+    if (!bs->file) {
+    return -EINVAL;
+    }
+
+    bs->total_sectors = bs->file->bs->total_sectors;


(If I hadn’t commented on patch 16, I wouldn’t’ve here, but now I might as 
well...)

Instead of just a comment in the commit message (which no one will really read 
later on), I prefer a TODO or FIXME comment directly here in the code, or even 
better in the API added in the previous patch (i.e. as part of the comment in 
the BlockDriver struct), that this will not work for qcow2, i.e. that we will 
need to inquire the snapshot size from the snapshot-providing node.

It’s OK not to implement that now, but I don’t think having a note just in the 
commit message will help us remember.


Considering softfreeze is next week, I’d propose I just add the following to 
patch 10, would that be OK for you?

(In case it is, I’ll hold off on applying patch 16 for now; it’s a test, so we 
can easily add it during freeze)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index c43315ae6e..5c8ad9ed78 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -385,6 +385,12 @@ struct BlockDriver {
   * in generic block-layer: no serializing, no alignment, no tracked
   * requests. So, block-driver that realizes these APIs is fully 
responsible
   * for synchronization between snapshot-access API and normal IO requests.
+ *
+ * TODO: To be able to support qcow2's internal snapshots, this API will
+ * need to be extended to:
+ * - be able to select a specific snapshot
+ * - receive the snapshot's actual length (which may differ from bs's
+ *   length)


Yes, that sounds good


   */
  int coroutine_fn (*bdrv_co_preadv_snapshot)(BlockDriverState *bs,
  int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t 
qiov_offset);




--
Best regards,
Vladimir



Re: [PATCH v2 1/3] block: Make bdrv_refresh_limits() non-recursive

2022-03-03 Thread Kevin Wolf
Am 16.02.2022 um 11:53 hat Hanna Reitz geschrieben:
> bdrv_refresh_limits() recurses down to the node's children.  That does
> not seem necessary: When we refresh limits on some node, and then
> recurse down and were to change one of its children's BlockLimits, then
> that would mean we noticed the changed limits by pure chance.  The fact
> that we refresh the parent's limits has nothing to do with it, so the
> reason for the change probably happened before this point in time, and
> we should have refreshed the limits then.
> 
> On the other hand, we do not have infrastructure for noticing that block
> limits change after they have been initialized for the first time (this
> would require propagating the change upwards to the respective node's
> parents), and so evidently we consider this case impossible.

I like your optimistic approach, but my interpretation would have been
that this is simply a bug. ;-)

blockdev-reopen allows changing options that affect the block limits
(most importantly probably request_alignment), so this should be
propagated to the parents. I think we'll actually not see failures if we
forget to do this, but parents can either advertise excessive alignment
requirements or they may run into RMW when accessing the child, so this
would only affect performance. This is probably why nobody reported it
yet.

> If this case is impossible, then we will not need to recurse down in
> bdrv_refresh_limits().  Every node's limits are initialized in
> bdrv_open_driver(), and are refreshed whenever its children change.
> We want to use the childrens' limits to get some initial default, but
> we can just take them, we do not need to refresh them.

I think even if we need to propagate to the parents, we still don't need
to propagate to the children because the children have already been
refreshed by whatever changed their options (like bdrv_reopen_commit()).
And parent limits don't influence the child limits at all.

So this patch looks good to me, just not the reasoning.

Kevin

> The problem with recursing is that bdrv_refresh_limits() is not atomic.
> It begins with zeroing BDS.bl, and only then sets proper, valid limits.
> If we do not drain all nodes whose limits are refreshed, then concurrent
> I/O requests can encounter invalid request_alignment values and crash
> qemu.  Therefore, a recursing bdrv_refresh_limits() requires the whole
> subtree to be drained, which is currently not ensured by most callers.
> 
> A non-recursive bdrv_refresh_limits() only requires the node in question
> to not receive I/O requests, and this is done by most callers in some
> way or another:
> - bdrv_open_driver() deals with a new node with no parents yet
> - bdrv_set_file_or_backing_noperm() acts on a drained node
> - bdrv_reopen_commit() acts only on drained nodes
> - bdrv_append() should in theory require the node to be drained; in
>   practice most callers just lock the AioContext, which should at least
>   be enough to prevent concurrent I/O requests from accessing invalid
>   limits
> 
> So we can resolve the bug by making bdrv_refresh_limits() non-recursive.
> 
> Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1879437
> Signed-off-by: Hanna Reitz 
> Reviewed-by: Eric Blake 
> ---
>  block/io.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/block/io.c b/block/io.c
> index 4e4cb556c5..c3e7301613 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -189,10 +189,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, 
> Transaction *tran, Error **errp)
>  QLIST_FOREACH(c, &bs->children, next) {
>  if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | 
> BDRV_CHILD_COW))
>  {
> -bdrv_refresh_limits(c->bs, tran, errp);
> -if (*errp) {
> -return;
> -}
>  bdrv_merge_limits(&bs->bl, &c->bs->bl);
>  have_limits = true;
>  }
> -- 
> 2.34.1
> 




Re: [PATCH 4/9] util/oslib-win32: Return NULL on qemu_try_memalign() with zero size

2022-03-03 Thread Peter Maydell
On Sun, 27 Feb 2022 at 18:36, Richard Henderson
 wrote:
>
> On 2/27/22 02:54, Peter Maydell wrote:
> >>> +if (size) {
> >>> +ptr = _aligned_malloc(size, alignment);
> >>> +} else {
> >>> +ptr = NULL;
> >>> +}
> >>
> >> Oh, should we set errno to something here?
> >> Otherwise a random value will be used by qemu_memalign.
> >
> > Yeah, I guess so, though the errno to use isn't obvious. Maybe EINVAL?
> >
> > The alternative would be to try to audit all the callsites to
> > confirm they don't ever try to allocate 0 bytes and then have
> > the assert for both Windows and POSIX versions...
>
> Alternately, force size == 1, so that we always get a non-NULL value that can 
> be freed.
> That's a change on the POSIX side as well, of course.

Yes, I had a look at what actual malloc() implementations tend
to do, and the answer seems to be that forcing size to 1 gives
less weird behaviour for the application. So here that would be

   if (size == 0) {
   size++;
   }
   ptr = _aligned_malloc(size, alignment);

We don't need to do anything on the POSIX side (unless we want to
enforce consistency of handling the size==0 case).

I'd quite like to get this series in before softfreeze (though mostly
just for my personal convenience so it's not hanging around as a
loose end I have to come back to after we reopen for 7.1). Does anybody
object if I squash in that change and put this in a pullrequest,
or would you prefer to see a v2 series first?

thanks
-- PMM



[PATCH v2 4/4] iotests/185: Add post-READY quit tests

2022-03-03 Thread Hanna Reitz
185 tests quitting qemu while a block job is active.  It does not
specifically test quitting qemu while a mirror or active commit job is
in its READY phase.

Add two test cases for this, where we respectively mirror or commit to
an external QSD instance, which provides a throttled block device.  qemu
is supposed to cancel the job so that it can quit as soon as possible
instead of waiting for the job to complete (which it did before 6.2).

Signed-off-by: Hanna Reitz 
---
 tests/qemu-iotests/185 | 190 -
 tests/qemu-iotests/185.out |  48 ++
 2 files changed, 237 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/185 b/tests/qemu-iotests/185
index f2ec5c5ceb..8b1143dc16 100755
--- a/tests/qemu-iotests/185
+++ b/tests/qemu-iotests/185
@@ -33,6 +33,12 @@ _cleanup()
 _rm_test_img "${TEST_IMG}.copy"
 _cleanup_test_img
 _cleanup_qemu
+
+if [ -f "$TEST_DIR/qsd.pid" ]; then
+kill -SIGKILL "$(cat "$TEST_DIR/qsd.pid")"
+rm -f "$TEST_DIR/qsd.pid"
+fi
+rm -f "$SOCK_DIR/qsd.sock"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -45,7 +51,7 @@ _supported_fmt qcow2
 _supported_proto file
 _supported_os Linux
 
-size=64M
+size=$((64 * 1048576))
 TEST_IMG="${TEST_IMG}.base" _make_test_img $size
 
 echo
@@ -216,6 +222,188 @@ wait=1 _cleanup_qemu | grep -v 'JOB_STATUS_CHANGE'
 
 _check_test_img
 
+echo
+echo === Start mirror to throttled QSD and exit qemu ===
+echo
+
+# Mirror to a throttled QSD instance (so that qemu cannot drain the
+# throttling), wait for READY, then write some data to the device,
+# and then quit qemu.
+# (qemu should force-cancel the job and not wait for the data to be
+# written to the target.)
+
+_make_test_img $size
+
+# Will be used by this and the next case
+set_up_throttled_qsd() {
+$QSD \
+--object throttle-group,id=thrgr,limits.bps-total=1048576 \
+--blockdev null-co,node-name=null,size=$size \
+--blockdev throttle,node-name=throttled,throttle-group=thrgr,file=null 
\
+--nbd-server addr.type=unix,addr.path="$SOCK_DIR/qsd.sock" \
+--export nbd,id=exp,node-name=throttled,name=target,writable=true \
+--pidfile "$TEST_DIR/qsd.pid" \
+--daemonize
+}
+
+set_up_throttled_qsd
+
+# Need a virtio-blk device so that qemu-io writes will not block the monitor
+_launch_qemu \
+--blockdev file,node-name=source-proto,filename="$TEST_IMG" \
+--blockdev qcow2,node-name=source-fmt,file=source-proto \
+--device virtio-blk,id=vblk,drive=source-fmt \
+--blockdev "{\"driver\": \"nbd\",
+ \"node-name\": \"target\",
+ \"server\": {
+ \"type\": \"unix\",
+ \"path\": \"$SOCK_DIR/qsd.sock\"
+ },
+ \"export\": \"target\"}"
+
+h=$QEMU_HANDLE
+_send_qemu_cmd $h '{"execute": "qmp_capabilities"}' 'return'
+
+# Use sync=top, so the first pass will not copy the whole image
+_send_qemu_cmd $h \
+'{"execute": "blockdev-mirror",
+  "arguments": {
+  "job-id": "mirror",
+  "device": "source-fmt",
+  "target": "target",
+  "sync": "top"
+  }}' \
+'return' \
+| grep -v JOB_STATUS_CHANGE # Ignore these events during creation
+
+# This too will be used by this and the next case
+# $1: QEMU handle
+# $2: Image size
+wait_for_job_and_quit() {
+h=$1
+size=$2
+
+# List of expected events
+capture_events='BLOCK_JOB_READY JOB_STATUS_CHANGE'
+_wait_event $h 'BLOCK_JOB_READY'
+QEMU_EVENTS= # Ignore all JOB_STATUS_CHANGE events that came before READY
+
+# Write something to the device for post-READY mirroring.  Write it in
+# blocks matching the cluster size, each spaced one block apart, so
+# that the mirror job will have to spawn one request per cluster.
+# Because the number of concurrent requests is limited (to 16), this
+# limits the number of bytes concurrently in flight, which speeds up
+# cancelling the job (in-flight requests still are waited for).
+# To limit the number of bytes in flight, we could alternatively pass
+# something for blockdev-mirror's @buf-size parameter, but
+# block-commit does not have such a parameter, so we need to figure
+# something out that works for both.
+
+cluster_size=65536
+step=$((cluster_size * 2))
+
+echo '--- Writing data to the virtio-blk device ---'
+
+for ofs in $(seq 0 $step $((size - step))); do
+qemu_io_cmd="qemu-io -d vblk/virtio-backend "
+qemu_io_cmd+="\\\"aio_write $ofs $cluster_size\\\""
+
+# Do not include these requests in the reference output
+# (it's just too much)
+silent=yes _send_qemu_cmd $h \
+"{\"execute\": \"human-monitor-command\",
+  \"arguments\": {
+  \"command-line\": \"$qemu_io_cmd\"
+  }}" \
+'return'
+done
+
+# Wait until the job's length is updated to reflect 

[PATCH v2 3/4] qsd: Add --daemonize

2022-03-03 Thread Hanna Reitz
To implement this, we reuse the existing daemonizing functions from the
system emulator, which mainly do the following:
- Fork off a child process, and set up a pipe between parent and child
- The parent process waits until the child sends a status byte over the
  pipe (0 means that the child was set up successfully; anything else
  (including errors or EOF) means that the child was not set up
  successfully), and then exits with an appropriate exit status
- The child process enters a new session (forking off again), changes
  the umask, and will ignore terminal signals from then on
- Once set-up is complete, the child will chdir to /, redirect all
  standard I/O streams to /dev/null, and tell the parent that set-up has
  been completed successfully

In contrast to qemu-nbd's --fork implementation, during the set up
phase, error messages are not piped through the parent process.
qemu-nbd mainly does this to detect errors, though (while os_daemonize()
has the child explicitly signal success after set up); because we do not
redirect stderr after forking, error messages continue to appear on
whatever the parent's stderr was (until set up is complete).

Signed-off-by: Hanna Reitz 
---
 docs/tools/qemu-storage-daemon.rst   |  7 +++
 storage-daemon/qemu-storage-daemon.c | 15 +++
 2 files changed, 22 insertions(+)

diff --git a/docs/tools/qemu-storage-daemon.rst 
b/docs/tools/qemu-storage-daemon.rst
index 878e6a5c5c..8b97592663 100644
--- a/docs/tools/qemu-storage-daemon.rst
+++ b/docs/tools/qemu-storage-daemon.rst
@@ -154,6 +154,13 @@ Standard options:
   created but before accepting connections. The daemon has started successfully
   when the pid file is written and clients may begin connecting.
 
+.. option:: --daemonize
+
+  Daemonize the process. The parent process will exit once startup is complete
+  (i.e., after the pid file has been or would have been written) or failure
+  occurs. Its exit code reflects whether the child has started up successfully
+  or failed to do so.
+
 Examples
 
 Launch the daemon with QMP monitor socket ``qmp.sock`` so clients can execute
diff --git a/storage-daemon/qemu-storage-daemon.c 
b/storage-daemon/qemu-storage-daemon.c
index b798954edb..9f2c3332bf 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -137,6 +137,9 @@ static void help(void)
 "\n"
 "  --pidfilewrite process ID to a file after startup\n"
 "\n"
+"  --daemonizedaemonize the process, and have the parent exit\n"
+" once startup is complete\n"
+"\n"
 QEMU_HELP_BOTTOM "\n",
 error_get_progname());
 }
@@ -144,6 +147,7 @@ QEMU_HELP_BOTTOM "\n",
 enum {
 OPTION_BLOCKDEV = 256,
 OPTION_CHARDEV,
+OPTION_DAEMONIZE,
 OPTION_EXPORT,
 OPTION_MONITOR,
 OPTION_NBD_SERVER,
@@ -200,6 +204,7 @@ static void process_options(int argc, char *argv[], bool 
pre_init_pass)
 static const struct option long_options[] = {
 {"blockdev", required_argument, NULL, OPTION_BLOCKDEV},
 {"chardev", required_argument, NULL, OPTION_CHARDEV},
+{"daemonize", no_argument, NULL, OPTION_DAEMONIZE},
 {"export", required_argument, NULL, OPTION_EXPORT},
 {"help", no_argument, NULL, 'h'},
 {"monitor", required_argument, NULL, OPTION_MONITOR},
@@ -225,6 +230,7 @@ static void process_options(int argc, char *argv[], bool 
pre_init_pass)
 c == '?' ||
 c == 'h' ||
 c == 'V' ||
+c == OPTION_DAEMONIZE ||
 c == OPTION_PIDFILE;
 
 /* Process every option only in its respective pass */
@@ -277,6 +283,12 @@ static void process_options(int argc, char *argv[], bool 
pre_init_pass)
 qemu_opts_del(opts);
 break;
 }
+case OPTION_DAEMONIZE:
+if (os_set_daemonize(true) < 0) {
+error_report("--daemonize not supported in this build");
+exit(EXIT_FAILURE);
+}
+break;
 case OPTION_EXPORT:
 {
 Visitor *v;
@@ -367,6 +379,8 @@ int main(int argc, char *argv[])
 
 process_options(argc, argv, true);
 
+os_daemonize();
+
 module_call_init(MODULE_INIT_QOM);
 module_call_init(MODULE_INIT_TRACE);
 qemu_add_opts(_trace_opts);
@@ -389,6 +403,7 @@ int main(int argc, char *argv[])
  * it.
  */
 pid_file_init();
+os_setup_post();
 
 while (!exit_requested) {
 main_loop_wait(false);
-- 
2.34.1




[PATCH v2 2/4] qsd: Add pre-init argument parsing pass

2022-03-03 Thread Hanna Reitz
In contrast to qemu-nbd (where it is called --fork) and the system
emulator, QSD does not have a --daemonize switch yet.  Just like them,
QSD allows setting up block devices and exports on the command line.
When doing so, it is often necessary for whoever invoked the QSD to wait
until these exports are fully set up.  A --daemonize switch allows
precisely this, by virtue of the parent process exiting once everything
is set up.

Note that there are alternative ways of waiting for all exports to be
set up, for example:
- Passing the --pidfile option and waiting until the respective file
  exists (but I do not know if there is a way of implementing this
  without a busy wait loop)
- Set up some network server (e.g. on a Unix socket) and have the QSD
  connect to it after all arguments have been processed by appending
  corresponding --chardev and --monitor options to the command line,
  and then wait until the QSD connects

Having a --daemonize option would make this simpler, though, without
having to rely on additional tools (to set up a network server) or busy
waiting.

Implementing a --daemonize switch means having to fork the QSD process.
Ideally, we should do this as early as possible: All the parent process
has to do is to wait for the child process to signal completion of its
set-up phase, and therefore there is basically no initialization that
needs to be done before the fork.  On the other hand, forking after
initialization steps means having to consider how those steps (like
setting up the block layer or QMP) interact with a later fork, which is
often not trivial.

In order to fork this early, we must scan the command line for
--daemonize long before our current process_options() call.  Instead of
adding custom new code to do so, just reuse process_options() and give
it a @pre_init_pass argument to distinguish the two passes.  I believe
there are some other switches besides --daemonize that deserve parsing in
the first pass:

- --help and --version are supposed to only print some text and then
  immediately exit (so any initialization we do would be for naught).
  This changes behavior, because now "--blockdev inv-drv --help" will
  print a help text instead of complaining about the --blockdev
  argument.
  Note that this is similar in behavior to other tools, though: "--help"
  is generally immediately acted upon when finding it in the argument
  list, potentially before other arguments (even ones before it) are
  acted on.  For example, "ls /does-not-exist --help" prints a help text
  and does not complain about ENOENT.

- --pidfile does not need initialization, and is already exempted from
  the sequential order that process_options() claims to strictly follow
  (the PID file is only created after all arguments are processed, not
  at the time the --pidfile argument appears), so it makes sense to
  include it in the same category as --daemonize.

- Invalid arguments should always be reported as soon as possible.  (The
  same caveat with --help applies: That means that "--blockdev inv-drv
  --inv-arg" will now complain about --inv-arg, not inv-drv.)

This patch does make some references to --daemonize without having
implemented it yet, but that will happen in the next patch.

Signed-off-by: Hanna Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 storage-daemon/qemu-storage-daemon.c | 43 
 1 file changed, 38 insertions(+), 5 deletions(-)

diff --git a/storage-daemon/qemu-storage-daemon.c 
b/storage-daemon/qemu-storage-daemon.c
index 504d33aa91..b798954edb 100644
--- a/storage-daemon/qemu-storage-daemon.c
+++ b/storage-daemon/qemu-storage-daemon.c
@@ -177,7 +177,23 @@ static int getopt_set_loc(int argc, char **argv, const 
char *optstring,
 return c;
 }
 
-static void process_options(int argc, char *argv[])
+/**
+ * Process QSD command-line arguments.
+ *
+ * This is done in two passes:
+ *
+ * First (@pre_init_pass is true), we do a pass where all global
+ * arguments pertaining to the QSD process (like --help or --daemonize)
+ * are processed.  This pass is done before most of the QEMU-specific
+ * initialization steps (e.g. initializing the block layer or QMP), and
+ * so must only process arguments that are not really QEMU-specific.
+ *
+ * Second (@pre_init_pass is false), we (sequentially) process all
+ * QEMU/QSD-specific arguments.  Many of these arguments are effectively
+ * translated to QMP commands (like --blockdev for blockdev-add, or
+ * --export for block-export-add).
+ */
+static void process_options(int argc, char *argv[], bool pre_init_pass)
 {
 int c;
 
@@ -196,11 +212,26 @@ static void process_options(int argc, char *argv[])
 };
 
 /*
- * In contrast to the system emulator, options are processed in the order
- * they are given on the command lines. This means that things must be
- * defined first before they can be referenced in another option.
+ * In contrast to the system emulator, QEMU-specific options are processed

[PATCH v2 1/4] os-posix: Add os_set_daemonize()

2022-03-03 Thread Hanna Reitz
The daemonizing functions in os-posix (os_daemonize() and
os_setup_post()) only daemonize the process if the static `daemonize`
variable is set.  Right now, it can only be set by os_parse_cmd_args().

In order to use os_daemonize() and os_setup_post() from the storage
daemon to have it be daemonized, we need some other way to set this
`daemonize` variable, because I would rather not tap into the system
emulator's arg-parsing code.  Therefore, this patch adds an
os_set_daemonize() function, which will return an error on os-win32
(because daemonizing is not supported there).

Signed-off-by: Hanna Reitz 
---
 include/sysemu/os-posix.h | 1 +
 include/sysemu/os-win32.h | 5 +
 os-posix.c| 6 ++
 3 files changed, 12 insertions(+)

diff --git a/include/sysemu/os-posix.h b/include/sysemu/os-posix.h
index 2edf33658a..dd64fb401d 100644
--- a/include/sysemu/os-posix.h
+++ b/include/sysemu/os-posix.h
@@ -55,6 +55,7 @@ int os_mlock(void);
 typedef struct timeval qemu_timeval;
 #define qemu_gettimeofday(tp) gettimeofday(tp, NULL)
 
+int os_set_daemonize(bool d);
 bool is_daemonized(void);
 
 /**
diff --git a/include/sysemu/os-win32.h b/include/sysemu/os-win32.h
index 43f569b5c2..68af96907e 100644
--- a/include/sysemu/os-win32.h
+++ b/include/sysemu/os-win32.h
@@ -77,6 +77,11 @@ typedef struct {
 } qemu_timeval;
 int qemu_gettimeofday(qemu_timeval *tp);
 
+static inline int os_set_daemonize(bool d)
+{
+return -ENOTSUP;
+}
+
 static inline bool is_daemonized(void)
 {
 return false;
diff --git a/os-posix.c b/os-posix.c
index ae6c9f2a5e..24692c8593 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -317,6 +317,12 @@ bool is_daemonized(void)
 return daemonize;
 }
 
+int os_set_daemonize(bool d)
+{
+daemonize = d;
+return 0;
+}
+
 int os_mlock(void)
 {
 #ifdef HAVE_MLOCKALL
-- 
2.34.1




[PATCH v2 0/4] qsd: Add --daemonize; and add job quit tests

2022-03-03 Thread Hanna Reitz
Hi,

v1 cover letter:

https://lists.nongnu.org/archive/html/qemu-block/2021-12/msg00499.html


In v2, I followed Vladimir’s suggestion to look into whether we could
reuse os_daemonize().  Indeed we can, and it makes patch 3 (formerly 2)
much simpler!

I decided to leave patch 2 (formerly 1) largely unchanged, because it
seems to me like the point of contention is whether it’s at all
reasonable to introduce a second argument pass for this feature, and not
e.g. which arguments we parse during it.
I believe such an additional pass is a necessity for --daemonize, so
either we really don’t want this pass and so cannot add this feature
(and just drop this series); or we do want this feature, and then we
have to add this pass.


v2:
- Patch 1: Added, so we can use os_daemonize() in patch 3
  (os_daemonize() internally will only do something if the static
  `daemonize` variable is set, which this new os_set_daemonize()
  function does; otherwise, you can only set it by invoking
  os_parse_cmd_args(), which I would rather not (feels like abuse))

- Patch 2:
  - Tried to be more verbose in the commit description
  - Made it clear in process_options() that only QEMU-specific options
are processed in order

- Patch 3: Vastly simplified by using the existing os_daemonize() and
  os_setup_post() functions


git backport-diff against v1:

Key:
[----] : patches are identical
[####] : number of functional differences between upstream/downstream patch
[down] : patch is downstream-only
The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

001/4:[down] 'os-posix: Add os_set_daemonize()'
002/4:[0006] [FC] 'qsd: Add pre-init argument parsing pass'
003/4:[0148] [FC] 'qsd: Add --daemonize'
004/4:[----] [--] 'iotests/185: Add post-READY quit tests'


Hanna Reitz (4):
  os-posix: Add os_set_daemonize()
  qsd: Add pre-init argument parsing pass
  qsd: Add --daemonize
  iotests/185: Add post-READY quit tests

 docs/tools/qemu-storage-daemon.rst   |   7 +
 include/sysemu/os-posix.h|   1 +
 include/sysemu/os-win32.h|   5 +
 os-posix.c   |   6 +
 storage-daemon/qemu-storage-daemon.c |  58 +++-
 tests/qemu-iotests/185   | 190 ++-
 tests/qemu-iotests/185.out   |  48 +++
 7 files changed, 309 insertions(+), 6 deletions(-)

-- 
2.34.1




[PATCH 09/12] tests/qemu-iotests: convert NBD TLS test to use standard filters

2022-03-03 Thread Daniel P . Berrangé
Using standard filters is more future proof than rolling our own.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/233 | 29 -
 tests/qemu-iotests/233.out |  9 -
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/tests/qemu-iotests/233 b/tests/qemu-iotests/233
index 9ca7b68f42..050267298d 100755
--- a/tests/qemu-iotests/233
+++ b/tests/qemu-iotests/233
@@ -65,7 +65,7 @@ tls_x509_create_client "ca1" "client3"
 echo
 echo "== preparing image =="
 _make_test_img 64M
-$QEMU_IO -c 'w -P 0x11 1m 1m' "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c 'w -P 0x11 1m 1m' "$TEST_IMG" 2>&1 | _filter_qemu_io
 
 echo
 echo "== check TLS client to plain server fails =="
@@ -74,9 +74,9 @@ nbd_server_start_tcp_socket -f $IMGFMT "$TEST_IMG" 2> 
"$TEST_DIR/server.log"
 obj=tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0
 $QEMU_IMG info --image-opts --object $obj \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
-2>&1 | sed "s/$nbd_tcp_port/PORT/g"
+2>&1 | _filter_nbd
 $QEMU_NBD_PROG -L -b $nbd_tcp_addr -p $nbd_tcp_port --object $obj \
---tls-creds=tls0
+--tls-creds=tls0 2>&1 | _filter_qemu_nbd_exports
 
 nbd_server_stop
 
@@ -88,8 +88,10 @@ nbd_server_start_tcp_socket \
 --tls-creds tls0 \
 -f $IMGFMT "$TEST_IMG" 2>> "$TEST_DIR/server.log"
 
-$QEMU_IMG info nbd://localhost:$nbd_tcp_port 2>&1 | sed 
"s/$nbd_tcp_port/PORT/g"
-$QEMU_NBD_PROG -L -b $nbd_tcp_addr -p $nbd_tcp_port
+$QEMU_IMG info nbd://localhost:$nbd_tcp_port \
+2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -b $nbd_tcp_addr -p $nbd_tcp_port \
+2>&1 | _filter_qemu_nbd_exports
 
 echo
 echo "== check TLS works =="
@@ -97,21 +99,21 @@ 
obj1=tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0
 obj2=tls-creds-x509,dir=${tls_dir}/client3,endpoint=client,id=tls0
 $QEMU_IMG info --image-opts --object $obj1 \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
-2>&1 | sed "s/$nbd_tcp_port/PORT/g"
+2>&1 | _filter_nbd
 $QEMU_IMG info --image-opts --object $obj2 \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
-2>&1 | sed "s/$nbd_tcp_port/PORT/g"
+2>&1 | _filter_nbd
 $QEMU_NBD_PROG -L -b $nbd_tcp_addr -p $nbd_tcp_port --object $obj1 \
---tls-creds=tls0
+--tls-creds=tls0 2>&1 | _filter_qemu_nbd_exports
 
 echo
 echo "== check TLS with different CA fails =="
 obj=tls-creds-x509,dir=${tls_dir}/client2,endpoint=client,id=tls0
 $QEMU_IMG info --image-opts --object $obj \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
-2>&1 | sed "s/$nbd_tcp_port/PORT/g"
+2>&1 | _filter_nbd
 $QEMU_NBD_PROG -L -b $nbd_tcp_addr -p $nbd_tcp_port --object $obj \
---tls-creds=tls0
+--tls-creds=tls0 2>&1 | _filter_qemu_nbd_exports
 
 echo
 echo "== perform I/O over TLS =="
@@ -121,7 +123,8 @@ $QEMU_IO -c 'r -P 0x11 1m 1m' -c 'w -P 0x22 1m 1m' 
--image-opts \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
 2>&1 | _filter_qemu_io
 
-$QEMU_IO -f $IMGFMT -r -U -c 'r -P 0x22 1m 1m' "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -f $IMGFMT -r -U -c 'r -P 0x22 1m 1m' "$TEST_IMG" \
+2>&1 | _filter_qemu_io
 
 echo
 echo "== check TLS with authorization =="
@@ -139,12 +142,12 @@ nbd_server_start_tcp_socket \
 $QEMU_IMG info --image-opts \
 --object tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0 \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
-2>&1 | sed "s/$nbd_tcp_port/PORT/g"
+2>&1 | _filter_nbd
 
 $QEMU_IMG info --image-opts \
 --object tls-creds-x509,dir=${tls_dir}/client3,endpoint=client,id=tls0 \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
-2>&1 | sed "s/$nbd_tcp_port/PORT/g"
+2>&1 | _filter_nbd
 
 echo
 echo "== final server log =="
diff --git a/tests/qemu-iotests/233.out b/tests/qemu-iotests/233.out
index 4b1f6a0e15..a1e45765b8 100644
--- a/tests/qemu-iotests/233.out
+++ b/tests/qemu-iotests/233.out
@@ -17,15 +17,12 @@ wrote 1048576/1048576 bytes at offset 1048576
 qemu-img: Could not open 'driver=nbd,host=127.0.0.1,port=PORT,tls-creds=tls0': 
Denied by server for option 5 (starttls)
 server reported: TLS not configured
 qemu-nbd: Denied by server for option 5 (starttls)
-server reported: TLS not configured
 
 == check plain client to TLS server fails ==
 qemu-img: Could not open 'nbd://localhost:PORT': TLS negotiation required 
before option 7 (go)
 Did you forget a valid tls-creds?
 server reported: Option 0x7 not permitted before TLS
 qemu-nbd: TLS negotiation required before option 3 (list)
-Did you forget a valid tls-creds?
-server reported: Option 0x3 not permitted before TLS
 
 == check TLS works ==
 image: nbd://127.0.0.1:PORT
@@ -37,14 +34,8 @@ file format: nbd
 virtual size: 64 MiB (67108864 bytes)
 disk size: unavailable
 exports available: 1
- export: ''
   size:  67108864
-  flags: 0xced ( flush fua trim zeroes df cache fast-zero )
   min block: 1

[PATCH 12/12] tests/qemu-iotests: validate NBD TLS with UNIX sockets and PSK

2022-03-03 Thread Daniel P . Berrangé
This validates that connections to an NBD server running on a UNIX
socket can use TLS with pre-shared keys (PSK).

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/233| 28 
 tests/qemu-iotests/233.out| 17 +
 tests/qemu-iotests/common.tls | 24 
 3 files changed, 69 insertions(+)

diff --git a/tests/qemu-iotests/233 b/tests/qemu-iotests/233
index 27b0a123d3..0488f3bbef 100755
--- a/tests/qemu-iotests/233
+++ b/tests/qemu-iotests/233
@@ -61,6 +61,8 @@ tls_x509_create_server "ca1" "server1"
 tls_x509_create_client "ca1" "client1"
 tls_x509_create_client "ca2" "client2"
 tls_x509_create_client "ca1" "client3"
+tls_psk_create_creds "psk1"
+tls_psk_create_creds "psk2"
 
 echo
 echo "== preparing image =="
@@ -191,6 +193,32 @@ $QEMU_IMG info --image-opts --object $obj1 \
 $QEMU_NBD_PROG -L -k $nbd_unix_socket --object $obj1 \
 --tls-creds=tls0 --tls-hostname=127.0.0.1  2>&1 | _filter_qemu_nbd_exports
 
+
+echo
+echo "== check TLS works over UNIX with PSK =="
+nbd_server_stop
+
+nbd_server_start_unix_socket \
+--object 
tls-creds-psk,dir=${tls_dir}/psk1,endpoint=server,id=tls0,verify-peer=on \
+--tls-creds tls0 \
+-f $IMGFMT "$TEST_IMG" 2>> "$TEST_DIR/server.log"
+
+obj1=tls-creds-psk,dir=${tls_dir}/psk1,username=psk1,endpoint=client,id=tls0
+$QEMU_IMG info --image-opts --object $obj1 \
+driver=nbd,path=$nbd_unix_socket,tls-creds=tls0 \
+2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -k $nbd_unix_socket --object $obj1 \
+--tls-creds=tls0 2>&1 | _filter_qemu_nbd_exports
+
+echo
+echo "== check TLS fails over UNIX with mismatch PSK =="
+obj1=tls-creds-psk,dir=${tls_dir}/psk2,username=psk2,endpoint=client,id=tls0
+$QEMU_IMG info --image-opts --object $obj1 \
+driver=nbd,path=$nbd_unix_socket,tls-creds=tls0 \
+2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -k $nbd_unix_socket --object $obj1 \
+--tls-creds=tls0 2>&1 | _filter_qemu_nbd_exports
+
 echo
 echo "== final server log =="
 cat "$TEST_DIR/server.log" | _filter_authz_check_tls
diff --git a/tests/qemu-iotests/233.out b/tests/qemu-iotests/233.out
index a00e4c5b08..ecb36a2f97 100644
--- a/tests/qemu-iotests/233.out
+++ b/tests/qemu-iotests/233.out
@@ -7,6 +7,8 @@ Generating a signed certificate...
 Generating a signed certificate...
 Generating a signed certificate...
 Generating a signed certificate...
+Generating a random key for user 'psk1'
+Generating a random key for user 'psk2'
 
 == preparing image ==
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
@@ -79,6 +81,19 @@ exports available: 1
   size:  67108864
   min block: 1
 
+== check TLS works over UNIX with PSK ==
+image: nbd+unix://?socket=SOCK_DIR/qemu-nbd.sock
+file format: nbd
+virtual size: 64 MiB (67108864 bytes)
+disk size: unavailable
+exports available: 1
+  size:  67108864
+  min block: 1
+
+== check TLS fails over UNIX with mismatch PSK ==
+qemu-img: Could not open 
'driver=nbd,path=SOCK_DIR/qemu-nbd.sock,tls-creds=tls0': TLS handshake failed: 
The TLS connection was non-properly terminated.
+qemu-nbd: TLS handshake failed: The TLS connection was non-properly terminated.
+
 == final server log ==
 qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
 qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
@@ -88,4 +103,6 @@ qemu-nbd: option negotiation failed: TLS x509 authz check 
for DISTINGUISHED-NAME
 qemu-nbd: option negotiation failed: TLS x509 authz check for 
DISTINGUISHED-NAME is denied
 qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
 qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
+qemu-nbd: option negotiation failed: TLS handshake failed: An illegal 
parameter has been received.
+qemu-nbd: option negotiation failed: TLS handshake failed: An illegal 
parameter has been received.
 *** done
diff --git a/tests/qemu-iotests/common.tls b/tests/qemu-iotests/common.tls
index 4a5760949d..b9c5462986 100644
--- a/tests/qemu-iotests/common.tls
+++ b/tests/qemu-iotests/common.tls
@@ -24,6 +24,7 @@ tls_x509_cleanup()
 {
 rm -f "${tls_dir}"/*.pem
 rm -f "${tls_dir}"/*/*.pem
+rm -f "${tls_dir}"/*/*.psk
 rmdir "${tls_dir}"/*
 rmdir "${tls_dir}"
 }
@@ -40,6 +41,18 @@ tls_certtool()
 rm -f "${tls_dir}"/certtool.log
 }
 
+tls_psktool()
+{
+psktool "$@" 1>"${tls_dir}"/psktool.log 2>&1
+if test "$?" = 0; then
+  head -1 "${tls_dir}"/psktool.log
+else
+  cat "${tls_dir}"/psktool.log
+fi
+rm -f "${tls_dir}"/psktool.log
+}
+
+
 tls_x509_init()
 {
 (certtool --help) >/dev/null 2>&1 || \
@@ -176,3 +189,14 @@ EOF
 
 rm -f "${tls_dir}/cert.info"
 }
+
+tls_psk_create_creds()
+{
+name=$1
+
+mkdir -p "${tls_dir}/$name"
+
+

[PATCH 05/12] block/nbd: don't restrict TLS usage to IP sockets

2022-03-03 Thread Daniel P . Berrangé
The TLS usage for NBD was restricted to IP sockets because validating
x509 certificates requires knowledge of the hostname that the client
is connecting to.

TLS does not have to use x509 certificates though, as PSK (pre-shared
keys) provide an alternative credential option. These have no
requirement for a hostname and can thus be trivially used for UNIX
sockets.

Furthermore, with the ability to override the default hostname for
TLS validation in the previous patch, it is now also valid to want
to use x509 certificates with FD passing and UNIX sockets.

Signed-off-by: Daniel P. Berrangé 
---
 block/nbd.c| 8 ++--
 blockdev-nbd.c | 6 --
 qemu-nbd.c | 8 +++-
 3 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 113aa5d3af..3ede47dec9 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -1838,13 +1838,9 @@ static int nbd_process_options(BlockDriverState *bs, 
QDict *options,
 goto error;
 }
 
-/* TODO SOCKET_ADDRESS_KIND_FD where fd has AF_INET or AF_INET6 */
-if (s->saddr->type != SOCKET_ADDRESS_TYPE_INET) {
-error_setg(errp, "TLS only supported over IP sockets");
-goto error;
-}
 s->tlshostname = g_strdup(qemu_opt_get(opts, "tls-hostname"));
-if (!s->tlshostname) {
+if (!s->tlshostname &&
+s->saddr->type == SOCKET_ADDRESS_TYPE_INET) {
 s->tlshostname = g_strdup(s->saddr->u.inet.host);
 }
 }
diff --git a/blockdev-nbd.c b/blockdev-nbd.c
index bdfa7ed3a5..9840d25a82 100644
--- a/blockdev-nbd.c
+++ b/blockdev-nbd.c
@@ -148,12 +148,6 @@ void nbd_server_start(SocketAddress *addr, const char 
*tls_creds,
 if (!nbd_server->tlscreds) {
 goto error;
 }
-
-/* TODO SOCKET_ADDRESS_TYPE_FD where fd has AF_INET or AF_INET6 */
-if (addr->type != SOCKET_ADDRESS_TYPE_INET) {
-error_setg(errp, "TLS is only supported with IPv4/IPv6");
-goto error;
-}
 }
 
 nbd_server->tlsauthz = g_strdup(tls_authz);
diff --git a/qemu-nbd.c b/qemu-nbd.c
index be8043fb00..f4c5b247de 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -808,7 +808,9 @@ int main(int argc, char **argv)
 
 socket_activation = check_socket_activation();
 if (socket_activation == 0) {
-setup_address_and_port(, );
+if (!sockpath) {
+setup_address_and_port(, );
+}
 } else {
 /* Using socket activation - check user didn't use -p etc. */
 const char *err_msg = socket_activation_validate_opts(device, sockpath,
@@ -829,10 +831,6 @@ int main(int argc, char **argv)
 }
 
 if (tlscredsid) {
-if (sockpath) {
-error_report("TLS is only supported with IPv4/IPv6");
-exit(EXIT_FAILURE);
-}
 if (device) {
 error_report("TLS is not supported with a host device");
 exit(EXIT_FAILURE);
-- 
2.34.1




[PATCH 04/12] qemu-nbd: add --tls-hostname option for TLS certificate validation

2022-03-03 Thread Daniel P . Berrangé
When using the --list option, qemu-nbd acts as an NBD client rather
than a server. As such when using TLS, it has a need to validate
the server certificate. This adds a --tls-hostname option which can
be used to override the default hostname used for certificate
validation.

Signed-off-by: Daniel P. Berrangé 
---
 docs/tools/qemu-nbd.rst | 14 ++
 qemu-nbd.c  | 17 -
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst
index 6031f96893..acce54a39d 100644
--- a/docs/tools/qemu-nbd.rst
+++ b/docs/tools/qemu-nbd.rst
@@ -169,6 +169,20 @@ driver options if ``--image-opts`` is specified.
   option; or provide the credentials needed for connecting as a client
   in list mode.
 
+.. option:: --tls-hostname=hostname
+
+  When validating an x509 certificate received over a TLS connection,
+  the hostname that the NBD client used to connect will be checked
+  against information in the server provided certificate. Sometimes
+  it might be required to override the hostname used to perform this
+  check. For example if the NBD client is using a tunnel from localhost
+  to connect to the remote server. In this case the `--tls-hostname`
+  option should be used to set the officially expected hostname of
+  the remote NBD server. This can also be used if accessing NBD over
+  a UNIX socket where there is no inherent hostname available. This
+  is only permitted when acting as an NBD client with the `--list`
+  option.
+
 .. option:: --fork
 
   Fork off the server process and exit the parent once the server is running.
diff --git a/qemu-nbd.c b/qemu-nbd.c
index c6c20df68a..be8043fb00 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -69,6 +69,7 @@
 #define QEMU_NBD_OPT_TLSAUTHZ  264
 #define QEMU_NBD_OPT_PID_FILE  265
 #define QEMU_NBD_OPT_SELINUX_LABEL 266
+#define QEMU_NBD_OPT_TLSHOSTNAME   267
 
 #define MBR_SIZE 512
 
@@ -542,6 +543,7 @@ int main(int argc, char **argv)
 { "export-name", required_argument, NULL, 'x' },
 { "description", required_argument, NULL, 'D' },
 { "tls-creds", required_argument, NULL, QEMU_NBD_OPT_TLSCREDS },
+{ "tls-hostname", required_argument, NULL, QEMU_NBD_OPT_TLSHOSTNAME },
 { "tls-authz", required_argument, NULL, QEMU_NBD_OPT_TLSAUTHZ },
 { "image-opts", no_argument, NULL, QEMU_NBD_OPT_IMAGE_OPTS },
 { "trace", required_argument, NULL, 'T' },
@@ -568,6 +570,7 @@ int main(int argc, char **argv)
 strList *bitmaps = NULL;
 bool alloc_depth = false;
 const char *tlscredsid = NULL;
+const char *tlshostname = NULL;
 bool imageOpts = false;
 bool writethrough = false; /* Client will flush as needed. */
 bool fork_process = false;
@@ -747,6 +750,9 @@ int main(int argc, char **argv)
 case QEMU_NBD_OPT_TLSCREDS:
 tlscredsid = optarg;
 break;
+case QEMU_NBD_OPT_TLSHOSTNAME:
+tlshostname = optarg;
+break;
 case QEMU_NBD_OPT_IMAGE_OPTS:
 imageOpts = true;
 break;
@@ -835,6 +841,10 @@ int main(int argc, char **argv)
 error_report("TLS authorization is incompatible with export list");
 exit(EXIT_FAILURE);
 }
+if (tlshostname && !list) {
+error_report("TLS hostname is only required with export list");
+exit(EXIT_FAILURE);
+}
 tlscreds = nbd_get_tls_creds(tlscredsid, list, _err);
 if (local_err) {
 error_reportf_err(local_err, "Failed to get TLS creds: ");
@@ -845,6 +855,10 @@ int main(int argc, char **argv)
 error_report("--tls-authz is not permitted without --tls-creds");
 exit(EXIT_FAILURE);
 }
+if (tlshostname) {
+error_report("--tls-hostname is not permitted without 
--tls-creds");
+exit(EXIT_FAILURE);
+}
 }
 
 if (selinux_label) {
@@ -861,7 +875,8 @@ int main(int argc, char **argv)
 
 if (list) {
 saddr = nbd_build_socket_address(sockpath, bindto, port);
-return qemu_nbd_client_list(saddr, tlscreds, bindto);
+return qemu_nbd_client_list(saddr, tlscreds,
+tlshostname ? tlshostname : bindto);
 }
 
 #if !HAVE_NBD_DEVICE
-- 
2.34.1




[PATCH 03/12] block/nbd: support override of hostname for TLS certificate validation

2022-03-03 Thread Daniel P . Berrangé
When connecting to an NBD server with TLS and x509 credentials,
the client must validate the hostname it uses for the connection,
against that published in the server's certificate. If the client
is tunnelling its connection over some other channel, however, the
hostname it uses may not match the info reported in the server's
certificate. In such a case, the user needs to explicitly set an
override for the hostname to use for certificate validation.

This is achieved by adding a 'tls-hostname' property to the NBD
block driver.

Signed-off-by: Daniel P. Berrangé 
---
 block/nbd.c  | 18 +++---
 qapi/block-core.json |  3 +++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index dd43929207..113aa5d3af 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -90,9 +90,10 @@ typedef struct BDRVNBDState {
 uint32_t reconnect_delay;
 uint32_t open_timeout;
 SocketAddress *saddr;
-char *export, *tlscredsid;
+char *export;
+char *tlscredsid;
 QCryptoTLSCreds *tlscreds;
-const char *tlshostname;
+char *tlshostname;
 char *x_dirty_bitmap;
 bool alloc_depth;
 
@@ -121,6 +122,8 @@ static void nbd_clear_bdrvstate(BlockDriverState *bs)
 s->export = NULL;
 g_free(s->tlscredsid);
 s->tlscredsid = NULL;
+g_free(s->tlshostname);
+s->tlshostname = NULL;
 g_free(s->x_dirty_bitmap);
 s->x_dirty_bitmap = NULL;
 }
@@ -1764,6 +1767,11 @@ static QemuOptsList nbd_runtime_opts = {
 .type = QEMU_OPT_STRING,
 .help = "ID of the TLS credentials to use",
 },
+{
+.name = "tls-hostname",
+.type = QEMU_OPT_STRING,
+.help = "Override hostname for validating TLS x509 certificate",
+},
 {
 .name = "x-dirty-bitmap",
 .type = QEMU_OPT_STRING,
@@ -1835,7 +1843,10 @@ static int nbd_process_options(BlockDriverState *bs, 
QDict *options,
 error_setg(errp, "TLS only supported over IP sockets");
 goto error;
 }
-s->tlshostname = s->saddr->u.inet.host;
+s->tlshostname = g_strdup(qemu_opt_get(opts, "tls-hostname"));
+if (!s->tlshostname) {
+s->tlshostname = g_strdup(s->saddr->u.inet.host);
+}
 }
 
 s->x_dirty_bitmap = g_strdup(qemu_opt_get(opts, "x-dirty-bitmap"));
@@ -2037,6 +2048,7 @@ static const char *const nbd_strong_runtime_opts[] = {
 "port",
 "export",
 "tls-creds",
+"tls-hostname",
 "server.",
 
 NULL
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 9a5a3641d0..c1b0435f57 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -4078,6 +4078,8 @@
 #
 # @tls-creds: TLS credentials ID
 #
+# @tls-hostname: TLS hostname override for certificate validation
+#
 # @x-dirty-bitmap: A metadata context name such as "qemu:dirty-bitmap:NAME"
 #  or "qemu:allocation-depth" to query in place of the
 #  traditional "base:allocation" block status (see
@@ -4108,6 +4110,7 @@
   'data': { 'server': 'SocketAddress',
 '*export': 'str',
 '*tls-creds': 'str',
+'*tls-hostname': 'str',
 '*x-dirty-bitmap': { 'type': 'str', 'features': [ 'unstable' ] },
 '*reconnect-delay': 'uint32',
 '*open-timeout': 'uint32' } }
-- 
2.34.1




[PATCH 02/12] block: pass desired TLS hostname through from block driver client

2022-03-03 Thread Daniel P . Berrangé
In

  commit a71d597b989fd701b923f09b3c20ac4fcaa55e81
  Author: Vladimir Sementsov-Ogievskiy 
  Date:   Thu Jun 10 13:08:00 2021 +0300

block/nbd: reuse nbd_co_do_establish_connection() in nbd_open()

the use of the 'hostname' field from the BDRVNBDState struct was
lost, and 'nbd_connect' just hardcoded it to match the IP socket
address. This was a harmless bug at the time since we block use
with anything other than IP sockets.

Shortly, though, we want to allow the caller to override the hostname
used in the TLS certificate checks. This is to allow for TLS
when doing port forwarding or tunneling. Thus we need to reinstate
the passing along of the 'hostname'.

Signed-off-by: Daniel P. Berrangé 
---
 block/nbd.c |  7 ---
 include/block/nbd.h |  3 ++-
 nbd/client-connection.c | 12 +---
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 5853d85d60..dd43929207 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -92,7 +92,7 @@ typedef struct BDRVNBDState {
 SocketAddress *saddr;
 char *export, *tlscredsid;
 QCryptoTLSCreds *tlscreds;
-const char *hostname;
+const char *tlshostname;
 char *x_dirty_bitmap;
 bool alloc_depth;
 
@@ -1835,7 +1835,7 @@ static int nbd_process_options(BlockDriverState *bs, 
QDict *options,
 error_setg(errp, "TLS only supported over IP sockets");
 goto error;
 }
-s->hostname = s->saddr->u.inet.host;
+s->tlshostname = s->saddr->u.inet.host;
 }
 
 s->x_dirty_bitmap = g_strdup(qemu_opt_get(opts, "x-dirty-bitmap"));
@@ -1875,7 +1875,8 @@ static int nbd_open(BlockDriverState *bs, QDict *options, 
int flags,
 }
 
 s->conn = nbd_client_connection_new(s->saddr, true, s->export,
-s->x_dirty_bitmap, s->tlscreds);
+s->x_dirty_bitmap, s->tlscreds,
+s->tlshostname);
 
 if (s->open_timeout) {
 nbd_client_connection_enable_retry(s->conn);
diff --git a/include/block/nbd.h b/include/block/nbd.h
index 78d101b774..a98eb665da 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -415,7 +415,8 @@ NBDClientConnection *nbd_client_connection_new(const 
SocketAddress *saddr,
bool do_negotiation,
const char *export_name,
const char *x_dirty_bitmap,
-   QCryptoTLSCreds *tlscreds);
+   QCryptoTLSCreds *tlscreds,
+   const char *tlshostname);
 void nbd_client_connection_release(NBDClientConnection *conn);
 
 QIOChannel *coroutine_fn
diff --git a/nbd/client-connection.c b/nbd/client-connection.c
index 2bda42641d..2a632931c3 100644
--- a/nbd/client-connection.c
+++ b/nbd/client-connection.c
@@ -33,6 +33,7 @@ struct NBDClientConnection {
 /* Initialization constants, never change */
 SocketAddress *saddr; /* address to connect to */
 QCryptoTLSCreds *tlscreds;
+char *tlshostname;
 NBDExportInfo initial_info;
 bool do_negotiation;
 bool do_retry;
@@ -77,7 +78,8 @@ NBDClientConnection *nbd_client_connection_new(const 
SocketAddress *saddr,
bool do_negotiation,
const char *export_name,
const char *x_dirty_bitmap,
-   QCryptoTLSCreds *tlscreds)
+   QCryptoTLSCreds *tlscreds,
+   const char *tlshostname)
 {
 NBDClientConnection *conn = g_new(NBDClientConnection, 1);
 
@@ -85,6 +87,7 @@ NBDClientConnection *nbd_client_connection_new(const 
SocketAddress *saddr,
 *conn = (NBDClientConnection) {
 .saddr = QAPI_CLONE(SocketAddress, saddr),
 .tlscreds = tlscreds,
+.tlshostname = g_strdup(tlshostname),
 .do_negotiation = do_negotiation,
 
 .initial_info.request_sizes = true,
@@ -107,6 +110,7 @@ static void 
nbd_client_connection_do_free(NBDClientConnection *conn)
 }
 error_free(conn->err);
 qapi_free_SocketAddress(conn->saddr);
+g_free(conn->tlshostname);
 object_unref(OBJECT(conn->tlscreds));
 g_free(conn->initial_info.x_dirty_bitmap);
 g_free(conn->initial_info.name);
@@ -120,6 +124,7 @@ static void 
nbd_client_connection_do_free(NBDClientConnection *conn)
  */
 static int nbd_connect(QIOChannelSocket *sioc, SocketAddress *addr,
NBDExportInfo *info, QCryptoTLSCreds *tlscreds,
+   const char *tlshostname,
QIOChannel **outioc, Error **errp)
 {
 int ret;
@@ -140,7 +145,7 @@ static int nbd_connect(QIOChannelSocket 

[PATCH 08/12] tests/qemu-iotests: introduce filter for qemu-nbd export list

2022-03-03 Thread Daniel P . Berrangé
Introduce a filter for the output of qemu-nbd export list so it can be
reused in multiple tests.

The filter is a bit more permissive than what test 241 currently uses,
as it allows printing of the export count, along with any possible
error messages that might be emitted.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/241   | 6 +++---
 tests/qemu-iotests/241.out   | 3 +++
 tests/qemu-iotests/common.filter | 5 +
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/241 b/tests/qemu-iotests/241
index c962c8b607..f196650afa 100755
--- a/tests/qemu-iotests/241
+++ b/tests/qemu-iotests/241
@@ -58,7 +58,7 @@ echo
 
 nbd_server_start_unix_socket -f $IMGFMT "$TEST_IMG_FILE"
 
-$QEMU_NBD_PROG --list -k $nbd_unix_socket | grep '\(size\|min\)'
+$QEMU_NBD_PROG --list -k $nbd_unix_socket | _filter_qemu_nbd_exports
 $QEMU_IMG map -f raw --output=json "$TEST_IMG" | _filter_qemu_img_map
 $QEMU_IO -f raw -c map "$TEST_IMG"
 nbd_server_stop
@@ -71,7 +71,7 @@ echo
 # sector alignment, here at the server.
 nbd_server_start_unix_socket "$TEST_IMG_FILE" 2> "$TEST_DIR/server.log"
 
-$QEMU_NBD_PROG --list -k $nbd_unix_socket | grep '\(size\|min\)'
+$QEMU_NBD_PROG --list -k $nbd_unix_socket | _filter_qemu_nbd_exports
 $QEMU_IMG map -f raw --output=json "$TEST_IMG" | _filter_qemu_img_map
 $QEMU_IO -f raw -c map "$TEST_IMG"
 nbd_server_stop
@@ -84,7 +84,7 @@ echo
 # Now force sector alignment at the client.
 nbd_server_start_unix_socket -f $IMGFMT "$TEST_IMG_FILE"
 
-$QEMU_NBD_PROG --list -k $nbd_unix_socket | grep '\(size\|min\)'
+$QEMU_NBD_PROG --list -k $nbd_unix_socket | _filter_qemu_nbd_exports
 $QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
 $QEMU_IO -c map "$TEST_IMG"
 nbd_server_stop
diff --git a/tests/qemu-iotests/241.out b/tests/qemu-iotests/241.out
index 56e95b599a..db2d71ab9d 100644
--- a/tests/qemu-iotests/241.out
+++ b/tests/qemu-iotests/241.out
@@ -2,6 +2,7 @@ QA output created by 241
 
 === Exporting unaligned raw image, natural alignment ===
 
+exports available: 1
   size:  1024
   min block: 1
 [{ "start": 0, "length": 1000, "depth": 0, "present": true, "zero": false, 
"data": true, "offset": OFFSET},
@@ -10,6 +11,7 @@ QA output created by 241
 
 === Exporting unaligned raw image, forced server sector alignment ===
 
+exports available: 1
   size:  1024
   min block: 512
 [{ "start": 0, "length": 1024, "depth": 0, "present": true, "zero": false, 
"data": true, "offset": OFFSET}]
@@ -20,6 +22,7 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' 
and probing guessed
 
 === Exporting unaligned raw image, forced client sector alignment ===
 
+exports available: 1
   size:  1024
   min block: 1
 [{ "start": 0, "length": 1000, "depth": 0, "present": true, "zero": false, 
"data": true, "offset": OFFSET},
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 25d1d22929..940c9884bd 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -308,6 +308,11 @@ _filter_nbd()
 -e 's#\(foo\|PORT/\?\|.sock\): Failed to .*$#\1#'
 }
 
+_filter_qemu_nbd_exports()
+{
+grep '\(exports available\|size\|min block\|qemu-nbd\):'
+}
+
 _filter_qmp_empty_return()
 {
 grep -v '{"return": {}}'
-- 
2.34.1




[PATCH 11/12] tests/qemu-iotests: validate NBD TLS with UNIX sockets

2022-03-03 Thread Daniel P . Berrangé
This validates that connections to an NBD server running on a UNIX
socket can use TLS, and require a TLS hostname override to pass
certificate validation.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/233 | 24 
 tests/qemu-iotests/233.out | 15 +++
 2 files changed, 39 insertions(+)

diff --git a/tests/qemu-iotests/233 b/tests/qemu-iotests/233
index 09cfb7039b..27b0a123d3 100755
--- a/tests/qemu-iotests/233
+++ b/tests/qemu-iotests/233
@@ -167,6 +167,30 @@ $QEMU_IMG info --image-opts \
 driver=nbd,host=$nbd_tcp_addr,port=$nbd_tcp_port,tls-creds=tls0 \
 2>&1 | _filter_nbd
 
+nbd_server_stop
+
+nbd_server_start_unix_socket \
+--object 
tls-creds-x509,dir=${tls_dir}/server1,endpoint=server,id=tls0,verify-peer=on \
+--tls-creds tls0 \
+-f $IMGFMT "$TEST_IMG" 2>> "$TEST_DIR/server.log"
+
+echo
+echo "== check TLS fail over UNIX with no hostname =="
+obj1=tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0
+$QEMU_IMG info --image-opts --object $obj1 \
+driver=nbd,path=$nbd_unix_socket,tls-creds=tls0 2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -k $nbd_unix_socket --object $obj1 --tls-creds=tls0 \
+2>&1 | _filter_qemu_nbd_exports
+
+echo
+echo "== check TLS works over UNIX with hostname override =="
+obj1=tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0
+$QEMU_IMG info --image-opts --object $obj1 \
+driver=nbd,path=$nbd_unix_socket,tls-creds=tls0,tls-hostname=127.0.0.1 \
+2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -k $nbd_unix_socket --object $obj1 \
+--tls-creds=tls0 --tls-hostname=127.0.0.1  2>&1 | _filter_qemu_nbd_exports
+
 echo
 echo "== final server log =="
 cat "$TEST_DIR/server.log" | _filter_authz_check_tls
diff --git a/tests/qemu-iotests/233.out b/tests/qemu-iotests/233.out
index 05abf470ac..a00e4c5b08 100644
--- a/tests/qemu-iotests/233.out
+++ b/tests/qemu-iotests/233.out
@@ -66,6 +66,19 @@ read 1048576/1048576 bytes at offset 1048576
 qemu-img: Could not open 'driver=nbd,host=127.0.0.1,port=PORT,tls-creds=tls0': 
Failed to read option reply: Cannot read from TLS channel: Software caused 
connection abort
 qemu-img: Could not open 'driver=nbd,host=127.0.0.1,port=PORT,tls-creds=tls0': 
Failed to read option reply: Cannot read from TLS channel: Software caused 
connection abort
 
+== check TLS fail over UNIX with no hostname ==
+qemu-img: Could not open 
'driver=nbd,path=SOCK_DIR/qemu-nbd.sock,tls-creds=tls0': No hostname for 
certificate validation
+qemu-nbd: No hostname for certificate validation
+
+== check TLS works over UNIX with hostname override ==
+image: nbd+unix://?socket=SOCK_DIR/qemu-nbd.sock
+file format: nbd
+virtual size: 64 MiB (67108864 bytes)
+disk size: unavailable
+exports available: 1
+  size:  67108864
+  min block: 1
+
 == final server log ==
 qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
 qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
@@ -73,4 +86,6 @@ qemu-nbd: option negotiation failed: Verify failed: No 
certificate was found.
 qemu-nbd: option negotiation failed: Verify failed: No certificate was found.
 qemu-nbd: option negotiation failed: TLS x509 authz check for 
DISTINGUISHED-NAME is denied
 qemu-nbd: option negotiation failed: TLS x509 authz check for 
DISTINGUISHED-NAME is denied
+qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
+qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
 *** done
-- 
2.34.1




[PATCH 10/12] tests/qemu-iotests: validate NBD TLS with hostname mismatch

2022-03-03 Thread Daniel P . Berrangé
This validates that connections to an NBD server where the certificate
hostname does not match will fail. It further validates that using the
new 'tls-hostname' override option can solve the failure.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/233| 18 ++
 tests/qemu-iotests/233.out| 15 +++
 tests/qemu-iotests/common.tls |  7 ---
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/233 b/tests/qemu-iotests/233
index 050267298d..09cfb7039b 100755
--- a/tests/qemu-iotests/233
+++ b/tests/qemu-iotests/233
@@ -106,6 +106,24 @@ $QEMU_IMG info --image-opts --object $obj2 \
 $QEMU_NBD_PROG -L -b $nbd_tcp_addr -p $nbd_tcp_port --object $obj1 \
 --tls-creds=tls0 2>&1 | _filter_qemu_nbd_exports
 
+echo
+echo "== check TLS fail over TCP with mismatch hostname =="
+obj1=tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0
+$QEMU_IMG info --image-opts --object $obj1 \
+driver=nbd,host=localhost,port=$nbd_tcp_port,tls-creds=tls0 \
+2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -b localhost -p $nbd_tcp_port --object $obj1 \
+--tls-creds=tls0 | _filter_qemu_nbd_exports
+
+echo
+echo "== check TLS works over TCP with mismatch hostname and override =="
+obj1=tls-creds-x509,dir=${tls_dir}/client1,endpoint=client,id=tls0
+$QEMU_IMG info --image-opts --object $obj1 \
+
driver=nbd,host=localhost,port=$nbd_tcp_port,tls-creds=tls0,tls-hostname=127.0.0.1
 \
+2>&1 | _filter_nbd
+$QEMU_NBD_PROG -L -b localhost -p $nbd_tcp_port --object $obj1 \
+--tls-creds=tls0 --tls-hostname=127.0.0.1 | _filter_qemu_nbd_exports
+
 echo
 echo "== check TLS with different CA fails =="
 obj=tls-creds-x509,dir=${tls_dir}/client2,endpoint=client,id=tls0
diff --git a/tests/qemu-iotests/233.out b/tests/qemu-iotests/233.out
index a1e45765b8..05abf470ac 100644
--- a/tests/qemu-iotests/233.out
+++ b/tests/qemu-iotests/233.out
@@ -37,6 +37,19 @@ exports available: 1
   size:  67108864
   min block: 1
 
+== check TLS fail over TCP with mismatch hostname ==
+qemu-img: Could not open 'driver=nbd,host=localhost,port=PORT,tls-creds=tls0': 
Certificate does not match the hostname localhost
+qemu-nbd: Certificate does not match the hostname localhost
+
+== check TLS works over TCP with mismatch hostname and override ==
+image: nbd://localhost:PORT
+file format: nbd
+virtual size: 64 MiB (67108864 bytes)
+disk size: unavailable
+exports available: 1
+  size:  67108864
+  min block: 1
+
 == check TLS with different CA fails ==
 qemu-img: Could not open 'driver=nbd,host=127.0.0.1,port=PORT,tls-creds=tls0': 
The certificate hasn't got a known issuer
 qemu-nbd: The certificate hasn't got a known issuer
@@ -54,6 +67,8 @@ qemu-img: Could not open 
'driver=nbd,host=127.0.0.1,port=PORT,tls-creds=tls0': F
 qemu-img: Could not open 'driver=nbd,host=127.0.0.1,port=PORT,tls-creds=tls0': 
Failed to read option reply: Cannot read from TLS channel: Software caused 
connection abort
 
 == final server log ==
+qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
+qemu-nbd: option negotiation failed: Failed to read opts magic: Cannot read 
from TLS channel: Software caused connection abort
 qemu-nbd: option negotiation failed: Verify failed: No certificate was found.
 qemu-nbd: option negotiation failed: Verify failed: No certificate was found.
 qemu-nbd: option negotiation failed: TLS x509 authz check for 
DISTINGUISHED-NAME is denied
diff --git a/tests/qemu-iotests/common.tls b/tests/qemu-iotests/common.tls
index 6ba28a78d3..4a5760949d 100644
--- a/tests/qemu-iotests/common.tls
+++ b/tests/qemu-iotests/common.tls
@@ -118,12 +118,13 @@ tls_x509_create_server()
 caname=$1
 name=$2
 
+# We don't include 'localhost' in the cert, as
+# we want to keep it unlisted to let tests
+# validate hostname override
 mkdir -p "${tls_dir}/$name"
 cat > "${tls_dir}/cert.info" <

[PATCH 07/12] tests/qemu-iotests: expand _filter_nbd rules

2022-03-03 Thread Daniel P . Berrangé
Some tests will want to use 'localhost' instead of '127.0.0.1', and
some will use the image options syntax rather than the classic URI
syntax.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/common.filter | 4 
 1 file changed, 4 insertions(+)

diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 75cc241580..25d1d22929 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -300,6 +300,10 @@ _filter_nbd()
 # Filter out the TCP port number since this changes between runs.
 $SED -e '/nbd\/.*\.c:/d' \
 -e 's#127\.0\.0\.1:[0-9]*#127.0.0.1:PORT#g' \
+-e 's#localhost:[0-9]*#localhost:PORT#g' \
+-e 's#host=127\.0\.0\.1,port=[0-9]*#host=127.0.0.1,port=PORT#g' \
+-e 's#host=localhost,port=[0-9]*#host=localhost,port=PORT#g' \
+-e "s#path=$SOCK_DIR#path=SOCK_DIR#g" \
 -e "s#?socket=$SOCK_DIR#?socket=SOCK_DIR#g" \
 -e 's#\(foo\|PORT/\?\|.sock\): Failed to .*$#\1#'
 }
-- 
2.34.1




[PATCH 06/12] tests/qemu-iotests: add QEMU_IOTESTS_REGEN=1 to update reference file

2022-03-03 Thread Daniel P . Berrangé
When developing an I/O test it is typical to add some logic to the
test script, run it to view the output diff, and then apply the
output diff to the reference file. This can be drastically simplified
by letting the test runner update the reference file in place.

By setting 'QEMU_IOTESTS_REGEN=1', the test runner will report the
failure and show the diff, but at the same time update the reference
file. So next time the I/O test is run it will succeed.

Continuing to display the diff when updating the reference gives the
developer a chance to review what was changed.

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/testrunner.py | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/tests/qemu-iotests/testrunner.py b/tests/qemu-iotests/testrunner.py
index 9a94273975..8a82696a6b 100644
--- a/tests/qemu-iotests/testrunner.py
+++ b/tests/qemu-iotests/testrunner.py
@@ -25,6 +25,7 @@
 import contextlib
 import json
 import termios
+import shutil
 import sys
 from multiprocessing import Pool
 from contextlib import contextmanager
@@ -320,6 +321,11 @@ def do_run_test(self, test: str, mp: bool) -> TestResult:
 
 diff = file_diff(str(f_reference), str(f_bad))
 if diff:
+if os.environ.get("QEMU_IOTESTS_REGEN", None) is not None:
+shutil.copyfile(str(f_bad), str(f_reference))
+print("")
+print("#REFERENCE FILE UPDATED#")
+print("")
 return TestResult(status='fail', elapsed=elapsed,
   description=f'output mismatch (see {f_bad})',
   diff=diff, casenotrun=casenotrun)
-- 
2.34.1




[PATCH 01/12] crypto: mandate a hostname when checking x509 creds on a client

2022-03-03 Thread Daniel P . Berrangé
Currently the TLS session object assumes that the caller will always
provide a hostname when using x509 creds on a client endpoint. This
relies on the caller to detect and report an error if the user has
configured QEMU with x509 credentials on a UNIX socket. The migration
code has such a check, but it is too broad, reporting an error when
the user has configured QEMU with PSK credentials on a UNIX socket,
where hostnames are irrelevant.

Putting the check into the TLS session object credentials validation
code ensures we report errors in only the scenario that matters.

Signed-off-by: Daniel P. Berrangé 
---
 crypto/tlssession.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/crypto/tlssession.c b/crypto/tlssession.c
index a8db8c76d1..b302d835d2 100644
--- a/crypto/tlssession.c
+++ b/crypto/tlssession.c
@@ -373,6 +373,12 @@ qcrypto_tls_session_check_certificate(QCryptoTLSSession 
*session,
session->hostname);
 goto error;
 }
+} else {
+if (session->creds->endpoint ==
+QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) {
+error_setg(errp, "No hostname for certificate validation");
+goto error;
+}
 }
 }
 
-- 
2.34.1




[PATCH 00/12] nbd: enable use of TLS on non-TCP transports and other TLS improvements

2022-03-03 Thread Daniel P . Berrangé
This series was principally motivated by a desire to enable use of TLS
on non-TCP transports. For x509 certificates this means we need a way
to set the hostname to use for validation. This also lets us override
the hostname when connecting on a TCP transport that is tunnelled or
port-forwarded. It also unlocks the ability to use PSK (pre-shared
keys) with UNIX sockets which would always have worked, had it not
been blocked by explicit checks in NBD code.

NB, the first patch in this series is common with my corresponding
migration series for TLS

  https://lists.gnu.org/archive/html/qemu-devel/2022-03/msg00556.html

Daniel P. Berrangé (12):
  crypto: mandate a hostname when checking x509 creds on a client
  block: pass desired TLS hostname through from block driver client
  block/nbd: support override of hostname for TLS certificate validation
  qemu-nbd: add --tls-hostname option for TLS certificate validation
  block/nbd: don't restrict TLS usage to IP sockets
  tests/qemu-iotests: add QEMU_IOTESTS_REGEN=1 to update reference file
  tests/qemu-iotests: expand _filter_nbd rules
  tests/qemu-iotests: introduce filter for qemu-nbd export list
  tests/qemu-iotests: convert NBD TLS test to use standard filters
  tests/qemu-iotests: validate NBD TLS with hostname mismatch
  tests/qemu-iotests: validate NBD TLS with UNIX sockets
  tests/qemu-iotests: validate NBD TLS with UNIX sockets and PSK

 block/nbd.c  | 25 +---
 blockdev-nbd.c   |  6 --
 crypto/tlssession.c  |  6 ++
 docs/tools/qemu-nbd.rst  | 14 +
 include/block/nbd.h  |  3 +-
 nbd/client-connection.c  | 12 +++-
 qapi/block-core.json |  3 +
 qemu-nbd.c   | 25 ++--
 tests/qemu-iotests/233   | 99 +++-
 tests/qemu-iotests/233.out   | 56 +++---
 tests/qemu-iotests/241   |  6 +-
 tests/qemu-iotests/241.out   |  3 +
 tests/qemu-iotests/common.filter |  9 +++
 tests/qemu-iotests/common.tls| 31 +-
 tests/qemu-iotests/testrunner.py |  6 ++
 15 files changed, 252 insertions(+), 52 deletions(-)

-- 
2.34.1





[PATCH v8 28/31] block_int-common.h: assertions in the callers of BdrvChildClass function pointers

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 5afdbe3060..01811d6519 100644
--- a/block.c
+++ b/block.c
@@ -1497,7 +1497,7 @@ const BdrvChildClass child_of_bds = {
 
 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
 {
-IO_CODE();
+GLOBAL_STATE_CODE();
 return c->klass->get_parent_aio_context(c);
 }
 
@@ -2128,6 +2128,7 @@ bool bdrv_is_writable(BlockDriverState *bs)
 
 static char *bdrv_child_user_desc(BdrvChild *c)
 {
+GLOBAL_STATE_CODE();
 return c->klass->get_parent_desc(c);
 }
 
@@ -2844,6 +2845,7 @@ static void bdrv_replace_child_noperm(BdrvChild **childp,
 
 assert(!child->frozen);
 assert(old_bs != new_bs);
+GLOBAL_STATE_CODE();
 
 if (old_bs && new_bs) {
 assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
@@ -2940,6 +2942,7 @@ static void bdrv_attach_child_common_abort(void *opaque)
 BdrvChild *child = *s->child;
 BlockDriverState *bs = child->bs;
 
+GLOBAL_STATE_CODE();
 /*
  * Pass free_empty_child=false, because we still need the child
  * for the AioContext operations on the parent below; those
@@ -3308,6 +3311,7 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild 
*child)
 static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
 {
 BdrvChild *c;
+GLOBAL_STATE_CODE();
 QLIST_FOREACH(c, >parents, next_parent) {
 if (c->klass->change_media) {
 c->klass->change_media(c, load);
@@ -3807,6 +3811,7 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 
 assert(!child_class || !flags);
 assert(!child_class == !parent);
+GLOBAL_STATE_CODE();
 
 if (reference) {
 bool options_non_empty = options ? qdict_size(options) : false;
@@ -4193,6 +4198,7 @@ static BlockReopenQueue 
*bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
  * important to avoid graph changes between the recursive queuing here and
  * bdrv_reopen_multiple(). */
 assert(bs->quiesce_counter > 0);
+GLOBAL_STATE_CODE();
 
 if (bs_queue == NULL) {
 bs_queue = g_new0(BlockReopenQueue, 1);
@@ -7327,6 +7333,7 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
 BdrvChild *child, *parent;
 
 g_assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+GLOBAL_STATE_CODE();
 
 if (old_context == new_context) {
 return;
@@ -7399,6 +7406,7 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
 static bool bdrv_parent_can_set_aio_context(BdrvChild *c, AioContext *ctx,
 GSList **ignore, Error **errp)
 {
+GLOBAL_STATE_CODE();
 if (g_slist_find(*ignore, c)) {
 return true;
 }
-- 
2.31.1




[PATCH v8 27/31] block_int-common.h: split function pointers in BdrvChildClass

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/block_int-common.h | 81 ++--
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index f05ebb0da3..5a04c778e4 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -830,19 +830,16 @@ struct BdrvChildClass {
  */
 bool parent_is_bds;
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
 void (*inherit_options)(BdrvChildRole role, bool parent_is_format,
 int *child_flags, QDict *child_options,
 int parent_flags, QDict *parent_options);
-
 void (*change_media)(BdrvChild *child, bool load);
-void (*resize)(BdrvChild *child);
-
-/*
- * Returns a name that is supposedly more useful for human users than the
- * node name for identifying the node in question (in particular, a BB
- * name), or NULL if the parent can't provide a better name.
- */
-const char *(*get_name)(BdrvChild *child);
 
 /*
  * Returns a malloced string that describes the parent of the child for a
@@ -852,6 +849,47 @@ struct BdrvChildClass {
  */
 char *(*get_parent_desc)(BdrvChild *child);
 
+/*
+ * Notifies the parent that the child has been activated/inactivated (e.g.
+ * when migration is completing) and it can start/stop requesting
+ * permissions and doing I/O on it.
+ */
+void (*activate)(BdrvChild *child, Error **errp);
+int (*inactivate)(BdrvChild *child);
+
+void (*attach)(BdrvChild *child);
+void (*detach)(BdrvChild *child);
+
+/*
+ * Notifies the parent that the filename of its child has changed (e.g.
+ * because the direct child was removed from the backing chain), so that it
+ * can update its reference.
+ */
+int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
+   const char *filename, Error **errp);
+
+bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx,
+GSList **ignore, Error **errp);
+void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore);
+
+AioContext *(*get_parent_aio_context)(BdrvChild *child);
+
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
+void (*resize)(BdrvChild *child);
+
+/*
+ * Returns a name that is supposedly more useful for human users than the
+ * node name for identifying the node in question (in particular, a BB
+ * name), or NULL if the parent can't provide a better name.
+ */
+const char *(*get_name)(BdrvChild *child);
+
 /*
  * If this pair of functions is implemented, the parent doesn't issue new
  * requests after returning from .drained_begin() until .drained_end() is
@@ -876,31 +914,6 @@ struct BdrvChildClass {
  * activity on the child has stopped.
  */
 bool (*drained_poll)(BdrvChild *child);
-
-/*
- * Notifies the parent that the child has been activated/inactivated (e.g.
- * when migration is completing) and it can start/stop requesting
- * permissions and doing I/O on it.
- */
-void (*activate)(BdrvChild *child, Error **errp);
-int (*inactivate)(BdrvChild *child);
-
-void (*attach)(BdrvChild *child);
-void (*detach)(BdrvChild *child);
-
-/*
- * Notifies the parent that the filename of its child has changed (e.g.
- * because the direct child was removed from the backing chain), so that it
- * can update its reference.
- */
-int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
-   const char *filename, Error **errp);
-
-bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx,
-GSList **ignore, Error **errp);
-void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore);
-
-AioContext *(*get_parent_aio_context)(BdrvChild *child);
 };
 
 extern const BdrvChildClass child_of_bds;
-- 
2.31.1




[PATCH v8 31/31] job.h: assertions in the callers of JobDriver function pointers

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 job.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/job.c b/job.c
index 54db80df66..075c6f3a20 100644
--- a/job.c
+++ b/job.c
@@ -381,6 +381,8 @@ void job_ref(Job *job)
 
 void job_unref(Job *job)
 {
+GLOBAL_STATE_CODE();
+
 if (--job->refcnt == 0) {
 assert(job->status == JOB_STATUS_NULL);
 assert(!timer_pending(&job->sleep_timer));
@@ -602,6 +604,7 @@ bool job_user_paused(Job *job)
 void job_user_resume(Job *job, Error **errp)
 {
 assert(job);
+GLOBAL_STATE_CODE();
 if (!job->user_paused || job->pause_count <= 0) {
 error_setg(errp, "Can't resume a job that was not paused");
 return;
@@ -672,6 +675,7 @@ static void job_update_rc(Job *job)
 static void job_commit(Job *job)
 {
 assert(!job->ret);
+GLOBAL_STATE_CODE();
 if (job->driver->commit) {
 job->driver->commit(job);
 }
@@ -680,6 +684,7 @@ static void job_commit(Job *job)
 static void job_abort(Job *job)
 {
 assert(job->ret);
+GLOBAL_STATE_CODE();
 if (job->driver->abort) {
 job->driver->abort(job);
 }
@@ -687,6 +692,7 @@ static void job_abort(Job *job)
 
 static void job_clean(Job *job)
 {
+GLOBAL_STATE_CODE();
 if (job->driver->clean) {
 job->driver->clean(job);
 }
@@ -726,6 +732,7 @@ static int job_finalize_single(Job *job)
 
 static void job_cancel_async(Job *job, bool force)
 {
+GLOBAL_STATE_CODE();
 if (job->driver->cancel) {
 force = job->driver->cancel(job, force);
 } else {
@@ -825,6 +832,7 @@ static void job_completed_txn_abort(Job *job)
 
 static int job_prepare(Job *job)
 {
+GLOBAL_STATE_CODE();
 if (job->ret == 0 && job->driver->prepare) {
 job->ret = job->driver->prepare(job);
 job_update_rc(job);
@@ -952,6 +960,7 @@ static void coroutine_fn job_co_entry(void *opaque)
 Job *job = opaque;
 
 assert(job && job->driver && job->driver->run);
+assert(job->aio_context == qemu_get_current_aio_context());
 job_pause_point(job);
 job->ret = job->driver->run(job, &job->err);
 job->deferred_to_main_loop = true;
@@ -1054,6 +1063,7 @@ void job_complete(Job *job, Error **errp)
 {
 /* Should not be reachable via external interface for internal jobs */
 assert(job->id);
+GLOBAL_STATE_CODE();
 if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
 return;
 }
-- 
2.31.1




[PATCH v8 26/31] block_int-common.h: assertions in the callers of BlockDriver function pointers

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c| 17 +
 block/create.c |  2 ++
 2 files changed, 19 insertions(+)

diff --git a/block.c b/block.c
index 4a3447b2a0..5afdbe3060 100644
--- a/block.c
+++ b/block.c
@@ -529,6 +529,7 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque)
 
 CreateCo *cco = opaque;
 assert(cco->drv);
+GLOBAL_STATE_CODE();
 
 ret = cco->drv->bdrv_co_create_opts(cco->drv,
 cco->filename, cco->opts, &local_err);
@@ -1096,6 +1097,7 @@ int refresh_total_sectors(BlockDriverState *bs, int64_t 
hint)
 static void bdrv_join_options(BlockDriverState *bs, QDict *options,
   QDict *old_options)
 {
+GLOBAL_STATE_CODE();
 if (bs->drv && bs->drv->bdrv_join_options) {
 bs->drv->bdrv_join_options(options, old_options);
 } else {
@@ -1605,6 +1607,7 @@ static int bdrv_open_driver(BlockDriverState *bs, 
BlockDriver *drv,
 {
 Error *local_err = NULL;
 int i, ret;
+GLOBAL_STATE_CODE();
 
 bdrv_assign_node_name(bs, node_name, &local_err);
 if (local_err) {
@@ -1996,6 +1999,8 @@ static int bdrv_fill_options(QDict **options, const char 
*filename,
 BlockDriver *drv = NULL;
 Error *local_err = NULL;
 
+GLOBAL_STATE_CODE();
+
 /*
  * Caution: while qdict_get_try_str() is fine, getting non-string
  * types would require more care.  When @options come from
@@ -2192,6 +2197,7 @@ static void bdrv_child_perm(BlockDriverState *bs, 
BlockDriverState *child_bs,
 uint64_t *nperm, uint64_t *nshared)
 {
 assert(bs->drv && bs->drv->bdrv_child_perm);
+GLOBAL_STATE_CODE();
 bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
  parent_perm, parent_shared,
  nperm, nshared);
@@ -2280,6 +2286,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
 {
 BlockDriverState *bs = opaque;
 uint64_t cumulative_perms, cumulative_shared_perms;
+GLOBAL_STATE_CODE();
 
 if (bs->drv->bdrv_set_perm) {
 bdrv_get_cumulative_perm(bs, &cumulative_perms,
@@ -2291,6 +2298,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
 static void bdrv_drv_set_perm_abort(void *opaque)
 {
 BlockDriverState *bs = opaque;
+GLOBAL_STATE_CODE();
 
 if (bs->drv->bdrv_abort_perm_update) {
 bs->drv->bdrv_abort_perm_update(bs);
@@ -2306,6 +2314,7 @@ static int bdrv_drv_set_perm(BlockDriverState *bs, 
uint64_t perm,
  uint64_t shared_perm, Transaction *tran,
  Error **errp)
 {
+GLOBAL_STATE_CODE();
 if (!bs->drv) {
 return 0;
 }
@@ -4372,6 +4381,7 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, 
Error **errp)
 
 assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 assert(bs_queue != NULL);
+GLOBAL_STATE_CODE();
 
 QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
 ctx = bdrv_get_aio_context(bs_entry->state.bs);
@@ -4637,6 +4647,7 @@ static int bdrv_reopen_prepare(BDRVReopenState 
*reopen_state,
 
 assert(reopen_state != NULL);
 assert(reopen_state->bs->drv != NULL);
+GLOBAL_STATE_CODE();
 drv = reopen_state->bs->drv;
 
 /* This function and each driver's bdrv_reopen_prepare() remove
@@ -4847,6 +4858,7 @@ static void bdrv_reopen_commit(BDRVReopenState 
*reopen_state)
 bs = reopen_state->bs;
 drv = bs->drv;
 assert(drv != NULL);
+GLOBAL_STATE_CODE();
 
 /* If there are any driver level actions to take */
 if (drv->bdrv_reopen_commit) {
@@ -4888,6 +4900,7 @@ static void bdrv_reopen_abort(BDRVReopenState 
*reopen_state)
 assert(reopen_state != NULL);
 drv = reopen_state->bs->drv;
 assert(drv != NULL);
+GLOBAL_STATE_CODE();
 
 if (drv->bdrv_reopen_abort) {
 drv->bdrv_reopen_abort(reopen_state);
@@ -6723,6 +6736,8 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
 int ret;
 uint64_t cumulative_perms, cumulative_shared_perms;
 
+GLOBAL_STATE_CODE();
+
 if (!bs->drv) {
 return -ENOMEDIUM;
 }
@@ -7237,6 +7252,7 @@ static void bdrv_detach_aio_context(BlockDriverState *bs)
 BdrvAioNotifier *baf, *baf_tmp;
 
 assert(!bs->walking_aio_notifiers);
+GLOBAL_STATE_CODE();
 bs->walking_aio_notifiers = true;
 QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
 if (baf->deleted) {
@@ -7264,6 +7280,7 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
 AioContext *new_context)
 {
 BdrvAioNotifier *ban, *ban_tmp;
+GLOBAL_STATE_CODE();
 
 if (bs->quiesce_counter) {
 aio_disable_external(new_context);
diff --git a/block/create.c b/block/create.c
index 89812669df..4df43f11f4 100644
--- a/block/create.c
+++ b/block/create.c
@@ -42,6 +42,8 @@ static int coroutine_fn blockdev_create_run(Job *job, Error 
**errp)
 BlockdevCreateJob *s = container_of(job, 

[PATCH v8 29/31] block-backend-common.h: split function pointers in BlockDevOps

2022-03-03 Thread Emanuele Giuseppe Esposito
Assertions in the callers of the function pointers are already
added by previous patches.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/sysemu/block-backend-common.h | 28 ++-
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/sysemu/block-backend-common.h 
b/include/sysemu/block-backend-common.h
index 6963bbf45a..2391679c56 100644
--- a/include/sysemu/block-backend-common.h
+++ b/include/sysemu/block-backend-common.h
@@ -27,6 +27,14 @@
 
 /* Callbacks for block device models */
 typedef struct BlockDevOps {
+
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /*
  * Runs when virtual media changed (monitor commands eject, change)
  * Argument load is true on load and false on eject.
@@ -44,16 +52,26 @@ typedef struct BlockDevOps {
  * true, even if they do not support eject requests.
  */
 void (*eject_request_cb)(void *opaque, bool force);
-/*
- * Is the virtual tray open?
- * Device models implement this only when the device has a tray.
- */
-bool (*is_tray_open)(void *opaque);
+
 /*
  * Is the virtual medium locked into the device?
  * Device models implement this only when device has such a lock.
  */
 bool (*is_medium_locked)(void *opaque);
+
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
+/*
+ * Is the virtual tray open?
+ * Device models implement this only when the device has a tray.
+ */
+bool (*is_tray_open)(void *opaque);
+
 /*
  * Runs when the size changed (e.g. monitor command block_resize)
  */
-- 
2.31.1




[PATCH v8 25/31] block_int-common.h: split function pointers in BlockDriver

2022-03-03 Thread Emanuele Giuseppe Esposito
Similar to the header split, the function pointers in BlockDriver
can also be split into I/O and global state.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/block_int-common.h | 445 ---
 1 file changed, 237 insertions(+), 208 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index b92e3630fd..f05ebb0da3 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -96,6 +96,11 @@ typedef struct BdrvTrackedRequest {
 
 
 struct BlockDriver {
+/*
+ * These fields are initialized when this object is created,
+ * and are never changed afterwards.
+ */
+
 const char *format_name;
 int instance_size;
 
@@ -122,6 +127,69 @@ struct BlockDriver {
  */
 bool is_format;
 
+/*
+ * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
+ * this field set to true, except ones that are defined only by their
+ * child's bs.
+ * An example of the last type will be the quorum block driver.
+ */
+bool bdrv_needs_filename;
+
+/*
+ * Set if a driver can support backing files. This also implies the
+ * following semantics:
+ *
+ *  - Return status 0 of .bdrv_co_block_status means that corresponding
+ *blocks are not allocated in this layer of backing-chain
+ *  - For such (unallocated) blocks, read will:
+ *- fill buffer with zeros if there is no backing file
+ *- read from the backing file otherwise, where the block layer
+ *  takes care of reading zeros beyond EOF if backing file is short
+ */
+bool supports_backing;
+
+bool has_variable_length;
+
+/*
+ * Drivers setting this field must be able to work with just a plain
+ * filename with ':' as a prefix, and no other options.
+ * Options may be extracted from the filename by implementing
+ * bdrv_parse_filename.
+ */
+const char *protocol_name;
+
+/* List of options for creating images, terminated by name == NULL */
+QemuOptsList *create_opts;
+
+/* List of options for image amend */
+QemuOptsList *amend_opts;
+
+/*
+ * If this driver supports reopening images this contains a
+ * NULL-terminated list of the runtime options that can be
+ * modified. If an option in this list is unspecified during
+ * reopen then it _must_ be reset to its default value or return
+ * an error.
+ */
+const char *const *mutable_opts;
+
+/*
+ * Pointer to a NULL-terminated array of names of strong options
+ * that can be specified for bdrv_open(). A strong option is one
+ * that changes the data of a BDS.
+ * If this pointer is NULL, the array is considered empty.
+ * "filename" and "driver" are always considered strong.
+ */
+const char *const *strong_runtime_opts;
+
+
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /*
  * This function is invoked under BQL before .bdrv_co_amend()
  * (which in contrast does not necessarily run under the BQL)
@@ -143,7 +211,6 @@ struct BlockDriver {
 bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
  BlockDriverState *to_replace);
 
-int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
 int (*bdrv_probe_device)(const char *filename);
 
 /*
@@ -152,28 +219,8 @@ struct BlockDriver {
  */
 void (*bdrv_parse_filename)(const char *filename, QDict *options,
 Error **errp);
-/*
- * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
- * this field set to true, except ones that are defined only by their
- * child's bs.
- * An example of the last type will be the quorum block driver.
- */
-bool bdrv_needs_filename;
-
-/*
- * Set if a driver can support backing files. This also implies the
- * following semantics:
- *
- *  - Return status 0 of .bdrv_co_block_status means that corresponding
- *blocks are not allocated in this layer of backing-chain
- *  - For such (unallocated) blocks, read will:
- *- fill buffer with zeros if there is no backing file
- *- read from the backing file otherwise, where the block layer
- *  takes care of reading zeros beyond EOF if backing file is short
- */
-bool supports_backing;
 
-/* For handling image reopen for split or non-split files */
+/* For handling image reopen for split or non-split files. */
 int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
BlockReopenQueue *queue, Error **errp);
 void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
@@ -189,7 +236,6 @@ struct BlockDriver {
   Error **errp);
 void 

[PATCH v8 23/31] block/copy-before-write.h: global state API + assertions

2022-03-03 Thread Emanuele Giuseppe Esposito
copy-before-write functions always run under BQL.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/copy-before-write.c | 2 ++
 block/copy-before-write.h | 7 +++
 2 files changed, 9 insertions(+)

diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index c30a5ff8de..80b7684dba 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -223,6 +223,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
 QDict *opts;
 
 assert(source->total_sectors == target->total_sectors);
+GLOBAL_STATE_CODE();
 
 opts = qdict_new();
 qdict_put_str(opts, "driver", "copy-before-write");
@@ -245,6 +246,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
 
 void bdrv_cbw_drop(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 bdrv_drop_filter(bs, &error_abort);
 bdrv_unref(bs);
 }
diff --git a/block/copy-before-write.h b/block/copy-before-write.h
index 51847e711a..6e72bb25e9 100644
--- a/block/copy-before-write.h
+++ b/block/copy-before-write.h
@@ -29,6 +29,13 @@
 #include "block/block_int.h"
 #include "block/block-copy.h"
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
   BlockDriverState *target,
   const char *filter_node_name,
-- 
2.31.1




[PATCH v8 19/31] assertions for blockjob.h global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/blockjob.c b/blockjob.c
index d79a52d204..4868453d74 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -62,6 +62,7 @@ static bool is_block_job(Job *job)
 BlockJob *block_job_next(BlockJob *bjob)
 {
 Job *job = bjob ? &bjob->job : NULL;
+GLOBAL_STATE_CODE();
 
 do {
 job = job_next(job);
@@ -73,6 +74,7 @@ BlockJob *block_job_next(BlockJob *bjob)
 BlockJob *block_job_get(const char *id)
 {
 Job *job = job_get(id);
+GLOBAL_STATE_CODE();
 
 if (job && is_block_job(job)) {
 return container_of(job, BlockJob, job);
@@ -184,6 +186,7 @@ static const BdrvChildClass child_job = {
 
 void block_job_remove_all_bdrv(BlockJob *job)
 {
+GLOBAL_STATE_CODE();
 /*
  * bdrv_root_unref_child() may reach child_job_[can_]set_aio_ctx(),
  * which will also traverse job->nodes, so consume the list one by
@@ -206,6 +209,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
 bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
 {
 GSList *el;
+GLOBAL_STATE_CODE();
 
 for (el = job->nodes; el; el = el->next) {
 BdrvChild *c = el->data;
@@ -222,6 +226,7 @@ int block_job_add_bdrv(BlockJob *job, const char *name, 
BlockDriverState *bs,
 {
 BdrvChild *c;
 bool need_context_ops;
+GLOBAL_STATE_CODE();
 
 bdrv_ref(bs);
 
@@ -271,6 +276,8 @@ bool block_job_set_speed(BlockJob *job, int64_t speed, 
Error **errp)
 const BlockJobDriver *drv = block_job_driver(job);
 int64_t old_speed = job->speed;
 
+GLOBAL_STATE_CODE();
+
 if (job_apply_verb(&job->job, JOB_VERB_SET_SPEED, errp) < 0) {
 return false;
 }
@@ -309,6 +316,8 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 BlockJobInfo *info;
 uint64_t progress_current, progress_total;
 
+GLOBAL_STATE_CODE();
+
 if (block_job_is_internal(job)) {
 error_setg(errp, "Cannot query QEMU internal jobs");
 return NULL;
@@ -491,6 +500,7 @@ fail:
 
 void block_job_iostatus_reset(BlockJob *job)
 {
+GLOBAL_STATE_CODE();
 if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
 return;
 }
@@ -548,5 +558,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, 
BlockdevOnError on_err,
 
 AioContext *block_job_get_aio_context(BlockJob *job)
 {
+GLOBAL_STATE_CODE();
 return job->job.aio_context;
 }
-- 
2.31.1




[PATCH v8 20/31] include/sysemu/blockdev.h: global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
blockdev functions always run under the BQL.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/sysemu/blockdev.h | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h
index f9fb54d437..3211b16513 100644
--- a/include/sysemu/blockdev.h
+++ b/include/sysemu/blockdev.h
@@ -13,9 +13,6 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 
-void blockdev_mark_auto_del(BlockBackend *blk);
-void blockdev_auto_del(BlockBackend *blk);
-
 typedef enum {
 IF_DEFAULT = -1,/* for use with drive_add() only */
 /*
@@ -38,6 +35,16 @@ struct DriveInfo {
 QTAILQ_ENTRY(DriveInfo) next;
 };
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
+void blockdev_mark_auto_del(BlockBackend *blk);
+void blockdev_auto_del(BlockBackend *blk);
+
 DriveInfo *blk_legacy_dinfo(BlockBackend *blk);
 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo);
 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo);
-- 
2.31.1




[PATCH v8 18/31] include/block/blockjob.h: global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
blockjob functions always run under the BQL.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/blockjob.h | 29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 87fbb3985f..6525e16fd5 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -74,6 +74,13 @@ typedef struct BlockJob {
 GSList *nodes;
 } BlockJob;
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /**
  * block_job_next:
  * @job: A block job, or %NULL.
@@ -155,6 +162,21 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
  */
 void block_job_iostatus_reset(BlockJob *job);
 
+/*
+ * block_job_get_aio_context:
+ *
+ * Returns aio context associated with a block job.
+ */
+AioContext *block_job_get_aio_context(BlockJob *job);
+
+
+/*
+ * Common functions that are neither I/O nor Global State.
+ *
+ * See include/block/block-common.h for more information about
+ * the Common API.
+ */
+
 /**
  * block_job_is_internal:
  * @job: The job to determine if it is user-visible or not.
@@ -170,11 +192,4 @@ bool block_job_is_internal(BlockJob *job);
  */
 const BlockJobDriver *block_job_driver(BlockJob *job);
 
-/*
- * block_job_get_aio_context:
- *
- * Returns aio context associated with a block job.
- */
-AioContext *block_job_get_aio_context(BlockJob *job);
-
 #endif
-- 
2.31.1




[PATCH v8 10/31] block.c: assertions to the block layer permissions API

2022-03-03 Thread Emanuele Giuseppe Esposito
Now that we have "covered" the three main cases where the
permission API was being used under the BQL (fuse,
amend and invalidate_cache), we can safely add assertions to
the permission functions implemented in block.c.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/block.c b/block.c
index 2a7df2a013..7d4a5440de 100644
--- a/block.c
+++ b/block.c
@@ -2109,6 +2109,7 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
 
 assert(a->bs);
 assert(a->bs == b->bs);
+GLOBAL_STATE_CODE();
 
 if ((b->perm & a->shared_perm) == b->perm) {
 return true;
@@ -2132,6 +2133,7 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
 static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
 {
 BdrvChild *a, *b;
+GLOBAL_STATE_CODE();
 
 /*
  * During the loop we'll look at each pair twice. That's correct because
@@ -2213,6 +2215,8 @@ static void bdrv_child_set_perm_abort(void *opaque)
 {
 BdrvChildSetPermState *s = opaque;
 
+GLOBAL_STATE_CODE();
+
 s->child->perm = s->old_perm;
 s->child->shared_perm = s->old_shared_perm;
 }
@@ -2226,6 +2230,7 @@ static void bdrv_child_set_perm(BdrvChild *c, uint64_t 
perm,
 uint64_t shared, Transaction *tran)
 {
 BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
+GLOBAL_STATE_CODE();
 
 *s = (BdrvChildSetPermState) {
 .child = c,
@@ -2405,6 +2410,7 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 BdrvChild *c;
 int ret;
 uint64_t cumulative_perms, cumulative_shared_perms;
+GLOBAL_STATE_CODE();
 
 bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
 
@@ -2473,6 +2479,7 @@ static int bdrv_list_refresh_perms(GSList *list, 
BlockReopenQueue *q,
 {
 int ret;
 BlockDriverState *bs;
+GLOBAL_STATE_CODE();
 
 for ( ; list; list = list->next) {
 bs = list->data;
@@ -2540,6 +2547,7 @@ static int bdrv_refresh_perms(BlockDriverState *bs, Error 
**errp)
 int ret;
 Transaction *tran = tran_new();
 g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
+GLOBAL_STATE_CODE();
 
 ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
 tran_finalize(tran, ret);
@@ -2602,6 +2610,7 @@ static void bdrv_filter_default_perms(BlockDriverState 
*bs, BdrvChild *c,
   uint64_t perm, uint64_t shared,
   uint64_t *nperm, uint64_t *nshared)
 {
+GLOBAL_STATE_CODE();
 *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
 *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
 }
@@ -2613,6 +2622,7 @@ static void bdrv_default_perms_for_cow(BlockDriverState 
*bs, BdrvChild *c,
uint64_t *nperm, uint64_t *nshared)
 {
 assert(role & BDRV_CHILD_COW);
+GLOBAL_STATE_CODE();
 
 /*
  * We want consistent read from backing files if the parent needs it.
@@ -2649,6 +2659,7 @@ static void 
bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
 {
 int flags;
 
+GLOBAL_STATE_CODE();
 assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
 
 flags = bdrv_reopen_get_flags(reopen_queue, bs);
@@ -6028,6 +6039,7 @@ static void xdbg_graph_add_edge(XDbgBlockGraphConstructor 
*gr, void *parent,
 {
 BlockPermission qapi_perm;
 XDbgBlockGraphEdge *edge;
+GLOBAL_STATE_CODE();
 
 edge = g_new0(XDbgBlockGraphEdge, 1);
 
-- 
2.31.1




[PATCH v8 22/31] include/block/snapshot: global state API + assertions

2022-03-03 Thread Emanuele Giuseppe Esposito
Snapshots also run under the BQL, so they are all part of
the global state API. The AioContext lock that they hold
is currently overkill and could be removed in the future.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/snapshot.c | 28 
 include/block/snapshot.h | 13 +++--
 migration/savevm.c   |  2 ++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/block/snapshot.c b/block/snapshot.c
index ccacda8bd5..d6f53c3065 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -57,6 +57,8 @@ int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo 
*sn_info,
 QEMUSnapshotInfo *sn_tab, *sn;
 int nb_sns, i, ret;
 
+GLOBAL_STATE_CODE();
+
 ret = -ENOENT;
 nb_sns = bdrv_snapshot_list(bs, &sn_tab);
 if (nb_sns < 0) {
@@ -105,6 +107,7 @@ bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs,
 bool ret = false;
 
 assert(id || name);
+GLOBAL_STATE_CODE();
 
 nb_sns = bdrv_snapshot_list(bs, &sn_tab);
 if (nb_sns < 0) {
@@ -200,6 +203,7 @@ static BlockDriverState 
*bdrv_snapshot_fallback(BlockDriverState *bs)
 int bdrv_can_snapshot(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
+GLOBAL_STATE_CODE();
 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
 return 0;
 }
@@ -220,6 +224,9 @@ int bdrv_snapshot_create(BlockDriverState *bs,
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
+
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 return -ENOMEDIUM;
 }
@@ -240,6 +247,8 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
 BdrvChild **fallback_ptr;
 int ret, open_ret;
 
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 error_setg(errp, "Block driver is closed");
 return -ENOMEDIUM;
@@ -348,6 +357,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
 BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
 int ret;
 
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
 return -ENOMEDIUM;
@@ -380,6 +391,8 @@ int bdrv_snapshot_list(BlockDriverState *bs,
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
+
+GLOBAL_STATE_CODE();
 if (!drv) {
 return -ENOMEDIUM;
 }
@@ -419,6 +432,8 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs,
 {
 BlockDriver *drv = bs->drv;
 
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
 return -ENOMEDIUM;
@@ -447,6 +462,8 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState 
*bs,
 int ret;
 Error *local_err = NULL;
 
+GLOBAL_STATE_CODE();
+
 ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err);
 if (ret == -ENOENT || ret == -EINVAL) {
 error_free(local_err);
@@ -515,6 +532,8 @@ bool bdrv_all_can_snapshot(bool has_devices, strList 
*devices,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return false;
 }
@@ -549,6 +568,8 @@ int bdrv_all_delete_snapshot(const char *name,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
 }
@@ -588,6 +609,8 @@ int bdrv_all_goto_snapshot(const char *name,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
 }
@@ -622,6 +645,8 @@ int bdrv_all_has_snapshot(const char *name,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
 }
@@ -663,6 +688,7 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
 {
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
+GLOBAL_STATE_CODE();
 
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
@@ -703,6 +729,8 @@ BlockDriverState *bdrv_all_find_vmstate_bs(const char 
*vmstate_bs,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return NULL;
 }
diff --git a/include/block/snapshot.h b/include/block/snapshot.h
index 940345692f..50ff924710 100644
--- a/include/block/snapshot.h
+++ b/include/block/snapshot.h
@@ -45,6 +45,13 @@ typedef struct QEMUSnapshotInfo {
 uint64_t icount; /* record/replay step */
 } QEMUSnapshotInfo;
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 int 

[PATCH v8 30/31] job.h: split function pointers in JobDriver

2022-03-03 Thread Emanuele Giuseppe Esposito
The job API will be handled separately in another series.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/job.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 6e67b6977f..c105b31076 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -169,6 +169,12 @@ typedef struct Job {
  * Callbacks and other information about a Job driver.
  */
 struct JobDriver {
+
+/*
+ * These fields are initialized when this object is created,
+ * and are never changed afterwards
+ */
+
 /** Derived Job struct size */
 size_t instance_size;
 
@@ -184,9 +190,18 @@ struct JobDriver {
  * aborted. If it returns zero, the job moves into the WAITING state. If it
  * is the last job to complete in its transaction, all jobs in the
  * transaction move from WAITING to PENDING.
+ *
+ * This callback must be run in the job's context.
  */
 int coroutine_fn (*run)(Job *job, Error **errp);
 
+/*
+ * Functions run without regard to the BQL that may run in any
+ * arbitrary thread. These functions do not need to be thread-safe
+ * because the caller ensures that they are invoked from one
+ * thread at a time.
+ */
+
 /**
  * If the callback is not NULL, it will be invoked when the job transitions
  * into the paused state.  Paused jobs must not perform any asynchronous
@@ -201,6 +216,13 @@ struct JobDriver {
  */
 void coroutine_fn (*resume)(Job *job);
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /**
  * Called when the job is resumed by the user (i.e. user_paused becomes
  * false). .user_resume is called before .resume.
-- 
2.31.1




[PATCH v8 16/31] GS and IO CODE macros for blockjob_int.h

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/blockjob.c b/blockjob.c
index 10815a89fe..d79a52d204 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -84,6 +84,7 @@ BlockJob *block_job_get(const char *id)
 void block_job_free(Job *job)
 {
 BlockJob *bjob = container_of(job, BlockJob, job);
+GLOBAL_STATE_CODE();
 
 block_job_remove_all_bdrv(bjob);
 ratelimit_destroy(&bjob->limit);
@@ -299,6 +300,7 @@ bool block_job_set_speed(BlockJob *job, int64_t speed, 
Error **errp)
 
 int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
 {
+IO_CODE();
 return ratelimit_calculate_delay(&job->limit, n);
 }
 
@@ -434,6 +436,7 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
 {
 BlockJob *job;
 int ret;
+GLOBAL_STATE_CODE();
 
 if (job_id == NULL && !(flags & JOB_INTERNAL)) {
 job_id = bdrv_get_device_name(bs);
@@ -498,6 +501,7 @@ void block_job_iostatus_reset(BlockJob *job)
 void block_job_user_resume(Job *job)
 {
 BlockJob *bjob = container_of(job, BlockJob, job);
+GLOBAL_STATE_CODE();
 block_job_iostatus_reset(bjob);
 }
 
@@ -505,6 +509,7 @@ BlockErrorAction block_job_error_action(BlockJob *job, 
BlockdevOnError on_err,
 int is_read, int error)
 {
 BlockErrorAction action;
+IO_CODE();
 
 switch (on_err) {
 case BLOCKDEV_ON_ERROR_ENOSPC:
-- 
2.31.1




[PATCH v8 17/31] block.c: add assertions to static functions

2022-03-03 Thread Emanuele Giuseppe Esposito
Following the assertions derived from the API split,
propagate them to the static functions as well.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c   | 46 ++-
 block/block-backend.c |  3 +++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 48daca835a..7224053f8b 100644
--- a/block.c
+++ b/block.c
@@ -438,6 +438,7 @@ BlockDriverState *bdrv_new(void)
 static BlockDriver *bdrv_do_find_format(const char *format_name)
 {
 BlockDriver *drv1;
+GLOBAL_STATE_CODE();
 
 QLIST_FOREACH(drv1, _drivers, list) {
 if (!strcmp(drv1->format_name, format_name)) {
@@ -596,6 +597,8 @@ static int64_t create_file_fallback_truncate(BlockBackend 
*blk,
 int64_t size;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
_err);
 if (ret < 0 && ret != -ENOTSUP) {
@@ -634,6 +637,8 @@ static int 
create_file_fallback_zero_first_sector(BlockBackend *blk,
 int64_t bytes_to_clear;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
 if (bytes_to_clear) {
 ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
@@ -896,6 +901,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 {
 int score_max = 0, score;
 BlockDriver *drv = NULL, *d;
+GLOBAL_STATE_CODE();
 
 QLIST_FOREACH(d, _drivers, list) {
 if (d->bdrv_probe_device) {
@@ -913,6 +919,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 static BlockDriver *bdrv_do_find_protocol(const char *protocol)
 {
 BlockDriver *drv1;
+GLOBAL_STATE_CODE();
 
 QLIST_FOREACH(drv1, _drivers, list) {
 if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
@@ -1021,6 +1028,8 @@ static int find_image_format(BlockBackend *file, const 
char *filename,
 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
 int ret = 0;
 
+GLOBAL_STATE_CODE();
+
 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
 if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) 
{
 *pdrv = _raw;
@@ -1103,6 +1112,7 @@ static BlockdevDetectZeroesOptions 
bdrv_parse_detect_zeroes(QemuOpts *opts,
 BlockdevDetectZeroesOptions detect_zeroes =
 qapi_enum_parse(_lookup, value,
 BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, _err);
+GLOBAL_STATE_CODE();
 g_free(value);
 if (local_err) {
 error_propagate(errp, local_err);
@@ -1218,6 +1228,7 @@ static void bdrv_child_cb_drained_end(BdrvChild *child,
 static int bdrv_child_cb_inactivate(BdrvChild *child)
 {
 BlockDriverState *bs = child->opaque;
+GLOBAL_STATE_CODE();
 assert(bs->open_flags & BDRV_O_INACTIVE);
 return 0;
 }
@@ -1244,6 +1255,7 @@ static void bdrv_child_cb_set_aio_ctx(BdrvChild *child, 
AioContext *ctx,
 static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
int parent_flags, QDict *parent_options)
 {
+GLOBAL_STATE_CODE();
 *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
 
 /* For temporary files, unconditional cache=unsafe is fine */
@@ -1264,6 +1276,7 @@ static void bdrv_backing_attach(BdrvChild *c)
 BlockDriverState *parent = c->opaque;
 BlockDriverState *backing_hd = c->bs;
 
+GLOBAL_STATE_CODE();
 assert(!parent->backing_blocker);
 error_setg(>backing_blocker,
"node is used as backing hd of '%s'",
@@ -1302,6 +1315,7 @@ static void bdrv_backing_detach(BdrvChild *c)
 {
 BlockDriverState *parent = c->opaque;
 
+GLOBAL_STATE_CODE();
 assert(parent->backing_blocker);
 bdrv_op_unblock_all(c->bs, parent->backing_blocker);
 error_free(parent->backing_blocker);
@@ -1314,6 +1328,7 @@ static int bdrv_backing_update_filename(BdrvChild *c, 
BlockDriverState *base,
 BlockDriverState *parent = c->opaque;
 bool read_only = bdrv_is_read_only(parent);
 int ret;
+GLOBAL_STATE_CODE();
 
 if (read_only) {
 ret = bdrv_reopen_set_read_only(parent, false, errp);
@@ -1345,6 +1360,7 @@ static void bdrv_inherited_options(BdrvChildRole role, 
bool parent_is_format,
int parent_flags, QDict *parent_options)
 {
 int flags = parent_flags;
+GLOBAL_STATE_CODE();
 
 /*
  * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
@@ -1486,6 +1502,7 @@ AioContext *bdrv_child_get_parent_aio_context(BdrvChild 
*c)
 static int bdrv_open_flags(BlockDriverState *bs, int flags)
 {
 int open_flags = flags;
+GLOBAL_STATE_CODE();
 
 /*
  * Clear flags that are internal to the block layer before opening the
@@ -1498,6 +1515,8 @@ static int bdrv_open_flags(BlockDriverState *bs, int 
flags)
 
 static void update_flags_from_options(int *flags, QemuOpts *opts)
 {
+GLOBAL_STATE_CODE();

[PATCH v8 14/31] block: introduce assert_bdrv_graph_writable

2022-03-03 Thread Emanuele Giuseppe Esposito
We want to be sure that the functions that write the child and
parent list of a bs are under BQL and drain.

The BQL prevents concurrent writes from the GS API, while
drains protect against I/O.

TODO: drains are missing in some functions using this assert.
Therefore a complete assertion would currently fail. Because adding
drains requires additional discussion, they will be added in a future
series.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c|  4 
 include/block/block_int-global-state.h | 17 +
 2 files changed, 21 insertions(+)

diff --git a/block.c b/block.c
index cf656e5b26..48daca835a 100644
--- a/block.c
+++ b/block.c
@@ -1420,6 +1420,7 @@ static void bdrv_child_cb_attach(BdrvChild *child)
 {
 BlockDriverState *bs = child->opaque;
 
+assert_bdrv_graph_writable(bs);
 QLIST_INSERT_HEAD(>children, child, next);
 
 if (child->role & BDRV_CHILD_COW) {
@@ -1439,6 +1440,7 @@ static void bdrv_child_cb_detach(BdrvChild *child)
 
 bdrv_unapply_subtree_drain(child, bs);
 
+assert_bdrv_graph_writable(bs);
 QLIST_REMOVE(child, next);
 }
 
@@ -2829,6 +2831,7 @@ static void bdrv_replace_child_noperm(BdrvChild **childp,
 if (child->klass->detach) {
 child->klass->detach(child);
 }
+assert_bdrv_graph_writable(old_bs);
 QLIST_REMOVE(child, next_parent);
 }
 
@@ -2838,6 +2841,7 @@ static void bdrv_replace_child_noperm(BdrvChild **childp,
 }
 
 if (new_bs) {
+assert_bdrv_graph_writable(new_bs);
 QLIST_INSERT_HEAD(_bs->parents, child, next_parent);
 
 /*
diff --git a/include/block/block_int-global-state.h 
b/include/block/block_int-global-state.h
index 5078d6a6ea..0f21b0570b 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -309,4 +309,21 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
  */
 void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
 
+/**
+ * Make sure that the function is running under both drain and BQL.
+ * The latter protects against concurrent writes
+ * from the GS API, while the former prevents concurrent reads
+ * from I/O.
+ */
+static inline void assert_bdrv_graph_writable(BlockDriverState *bs)
+{
+/*
+ * TODO: this function is incomplete. Because the users of this
+ * assert lack the necessary drains, check only for BQL.
+ * Once the necessary drains are added,
+ * assert also for qatomic_read(&bs->quiesce_counter) > 0
+ */
+assert(qemu_in_main_thread());
+}
+
 #endif /* BLOCK_INT_GLOBAL_STATE */
-- 
2.31.1




[PATCH v8 24/31] block/coroutines: I/O and "I/O or GS" API

2022-03-03 Thread Emanuele Giuseppe Esposito
Block coroutine functions run in different AioContexts and are
not protected by the BQL. Therefore they are I/O functions.

On the other hand, generated_co_wrapper functions use BDRV_POLL_WHILE,
meaning the caller can be either the main loop or a specific iothread.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c   |  2 ++
 block/block-backend.c |  6 
 block/coroutines.h| 81 +++
 block/io.c|  3 ++
 block/nbd.c   |  1 +
 5 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/block.c b/block.c
index 7224053f8b..4a3447b2a0 100644
--- a/block.c
+++ b/block.c
@@ -5454,6 +5454,7 @@ fail:
 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
BdrvCheckResult *res, BdrvCheckMode fix)
 {
+IO_CODE();
 if (bs->drv == NULL) {
 return -ENOMEDIUM;
 }
@@ -6663,6 +6664,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
 int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
 {
 Error *local_err = NULL;
+IO_CODE();
 
 assert(!(bs->open_flags & BDRV_O_INACTIVE));
 
diff --git a/block/block-backend.c b/block/block-backend.c
index bf77c4a8fa..e0e1aff4b1 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1290,6 +1290,7 @@ blk_co_do_preadv(BlockBackend *blk, int64_t offset, 
int64_t bytes,
 {
 int ret;
 BlockDriverState *bs;
+IO_CODE();
 
 blk_wait_while_drained(blk);
 
@@ -1337,6 +1338,7 @@ blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, 
int64_t bytes,
 {
 int ret;
 BlockDriverState *bs;
+IO_CODE();
 
 blk_wait_while_drained(blk);
 
@@ -1656,6 +1658,8 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
 int coroutine_fn
 blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
+IO_CODE();
+
 blk_wait_while_drained(blk);
 
 if (!blk_is_available(blk)) {
@@ -1699,6 +1703,7 @@ int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
 {
 int ret;
+IO_CODE();
 
 blk_wait_while_drained(blk);
 
@@ -1757,6 +1762,7 @@ int blk_pdiscard(BlockBackend *blk, int64_t offset, 
int64_t bytes)
 int coroutine_fn blk_co_do_flush(BlockBackend *blk)
 {
 blk_wait_while_drained(blk);
+IO_CODE();
 
 if (!blk_is_available(blk)) {
 return -ENOMEDIUM;
diff --git a/block/coroutines.h b/block/coroutines.h
index c8c14a29c8..b293e943c8 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -30,17 +30,17 @@
 /* For blk_bs() in generated block/block-gen.c */
 #include "sysemu/block-backend.h"
 
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
BdrvCheckResult *res, BdrvCheckMode fix);
 int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
 
-int generated_co_wrapper
-bdrv_preadv(BdrvChild *child, int64_t offset, unsigned int bytes,
-QEMUIOVector *qiov, BdrvRequestFlags flags);
-int generated_co_wrapper
-bdrv_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes,
- QEMUIOVector *qiov, BdrvRequestFlags flags);
-
 int coroutine_fn
 bdrv_co_common_block_status_above(BlockDriverState *bs,
   BlockDriverState *base,
@@ -52,6 +52,51 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
   int64_t *map,
   BlockDriverState **file,
   int *depth);
+
+int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
+   QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
+QEMUIOVector *qiov, int64_t pos);
+
+int coroutine_fn
+nbd_co_do_establish_connection(BlockDriverState *bs, Error **errp);
+
+
+int coroutine_fn
+blk_co_do_preadv(BlockBackend *blk, int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags);
+
+
+int coroutine_fn
+blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes,
+   QEMUIOVector *qiov, size_t qiov_offset,
+   BdrvRequestFlags flags);
+
+int coroutine_fn
+blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
+
+int coroutine_fn
+blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
+
+int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+
+
+/*
+ * "I/O or GS" API functions. These functions can run without
+ * the BQL, but only in one specific iothread/main loop.
+ *
+ * See include/block/block-io.h for more information about
+ * the "I/O or GS" API.
+ */
+
+int generated_co_wrapper
+bdrv_preadv(BdrvChild *child, int64_t offset, unsigned int bytes,
+QEMUIOVector *qiov, BdrvRequestFlags flags);
+
+int 

[PATCH v8 21/31] assertions for blockdev.h global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c |  3 +++
 blockdev.c| 16 
 2 files changed, 19 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 2ab1274dfe..bf77c4a8fa 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -810,6 +810,7 @@ bool bdrv_is_root_node(BlockDriverState *bs)
  */
 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk->legacy_dinfo;
 }
 
@@ -821,6 +822,7 @@ DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
 {
 assert(!blk->legacy_dinfo);
+GLOBAL_STATE_CODE();
 return blk->legacy_dinfo = dinfo;
 }
 
@@ -831,6 +833,7 @@ DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, 
DriveInfo *dinfo)
 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 {
 BlockBackend *blk = NULL;
+GLOBAL_STATE_CODE();
 
 while ((blk = blk_next(blk)) != NULL) {
 if (blk->legacy_dinfo == dinfo) {
diff --git a/blockdev.c b/blockdev.c
index 12a317f149..e46e831212 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -113,6 +113,8 @@ void override_max_devs(BlockInterfaceType type, int 
max_devs)
 BlockBackend *blk;
 DriveInfo *dinfo;
 
+GLOBAL_STATE_CODE();
+
 if (max_devs <= 0) {
 return;
 }
@@ -142,6 +144,8 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 DriveInfo *dinfo = blk_legacy_dinfo(blk);
 BlockJob *job;
 
+GLOBAL_STATE_CODE();
+
 if (!dinfo) {
 return;
 }
@@ -163,6 +167,7 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 void blockdev_auto_del(BlockBackend *blk)
 {
 DriveInfo *dinfo = blk_legacy_dinfo(blk);
+GLOBAL_STATE_CODE();
 
 if (dinfo && dinfo->auto_del) {
 monitor_remove_blk(blk);
@@ -187,6 +192,8 @@ QemuOpts *drive_add(BlockInterfaceType type, int index, 
const char *file,
 {
 QemuOpts *opts;
 
+GLOBAL_STATE_CODE();
+
 opts = qemu_opts_parse_noisily(qemu_find_opts("drive"), optstr, false);
 if (!opts) {
 return NULL;
@@ -207,6 +214,8 @@ DriveInfo *drive_get(BlockInterfaceType type, int bus, int 
unit)
 BlockBackend *blk;
 DriveInfo *dinfo;
 
+GLOBAL_STATE_CODE();
+
 for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 dinfo = blk_legacy_dinfo(blk);
 if (dinfo && dinfo->type == type
@@ -229,6 +238,8 @@ void drive_check_orphaned(void)
 Location loc;
 bool orphans = false;
 
+GLOBAL_STATE_CODE();
+
 for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 dinfo = blk_legacy_dinfo(blk);
 /*
@@ -262,6 +273,7 @@ void drive_check_orphaned(void)
 
 DriveInfo *drive_get_by_index(BlockInterfaceType type, int index)
 {
+GLOBAL_STATE_CODE();
 return drive_get(type,
  drive_index_to_bus_id(type, index),
  drive_index_to_unit_id(type, index));
@@ -273,6 +285,8 @@ int drive_get_max_bus(BlockInterfaceType type)
 BlockBackend *blk;
 DriveInfo *dinfo;
 
+GLOBAL_STATE_CODE();
+
 max_bus = -1;
 for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 dinfo = blk_legacy_dinfo(blk);
@@ -759,6 +773,8 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType 
block_default_type,
 const char *filename;
 int i;
 
+GLOBAL_STATE_CODE();
+
 /* Change legacy command line options into QMP ones */
 static const struct {
 const char *from;
-- 
2.31.1




[PATCH v8 15/31] include/block/blockjob_int.h: split header into I/O and GS API

2022-03-03 Thread Emanuele Giuseppe Esposito
Since there are not many I/O functions, keep a single file.
Also split the function pointers in BlockJobDriver.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/blockjob_int.h | 28 
 1 file changed, 28 insertions(+)

diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index 6633d83da2..6bd9ae2b20 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -38,6 +38,13 @@ struct BlockJobDriver {
 /** Generic JobDriver callbacks and settings */
 JobDriver job_driver;
 
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
 /*
  * Returns whether the job has pending requests for the child or will
  * submit new requests before the next pause point. This callback is polled
@@ -46,6 +53,13 @@ struct BlockJobDriver {
  */
 bool (*drained_poll)(BlockJob *job);
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /*
  * If the callback is not NULL, it will be invoked before the job is
  * resumed in a new AioContext.  This is the place to move any resources
@@ -56,6 +70,13 @@ struct BlockJobDriver {
 void (*set_speed)(BlockJob *job, int64_t speed);
 };
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /**
  * block_job_create:
  * @job_id: The id of the newly-created job, or %NULL to have one
@@ -98,6 +119,13 @@ void block_job_free(Job *job);
  */
 void block_job_user_resume(Job *job);
 
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
 /**
  * block_job_ratelimit_get_delay:
  *
-- 
2.31.1




[PATCH v8 08/31] block/block-backend.c: assertions for block-backend

2022-03-03 Thread Emanuele Giuseppe Esposito
All the global state (GS) API functions will check that
qemu_in_main_thread() returns true. If not, it means
that the safety of BQL cannot be guaranteed, and
they need to be moved to I/O.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c  | 78 ++
 softmmu/qdev-monitor.c |  2 ++
 2 files changed, 80 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 462e18facf..4476b61b8b 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -239,6 +239,7 @@ static void blk_root_activate(BdrvChild *child, Error 
**errp)
 
 void blk_set_force_allow_inactivate(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 blk->force_allow_inactivate = true;
 }
 
@@ -357,6 +358,8 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, 
uint64_t shared_perm)
 {
 BlockBackend *blk;
 
+GLOBAL_STATE_CODE();
+
 blk = g_new0(BlockBackend, 1);
 blk->refcnt = 1;
 blk->ctx = ctx;
@@ -394,6 +397,8 @@ BlockBackend *blk_new_with_bs(BlockDriverState *bs, 
uint64_t perm,
 {
 BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
 
+GLOBAL_STATE_CODE();
+
 if (blk_insert_bs(blk, bs, errp) < 0) {
 blk_unref(blk);
 return NULL;
@@ -422,6 +427,8 @@ BlockBackend *blk_new_open(const char *filename, const char 
*reference,
 uint64_t perm = 0;
 uint64_t shared = BLK_PERM_ALL;
 
+GLOBAL_STATE_CODE();
+
 /*
  * blk_new_open() is mainly used in .bdrv_create implementations and the
  * tools where sharing isn't a major concern because the BDS stays private
@@ -499,6 +506,7 @@ static void drive_info_del(DriveInfo *dinfo)
 
 int blk_get_refcnt(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk ? blk->refcnt : 0;
 }
 
@@ -509,6 +517,7 @@ int blk_get_refcnt(BlockBackend *blk)
 void blk_ref(BlockBackend *blk)
 {
 assert(blk->refcnt > 0);
+GLOBAL_STATE_CODE();
 blk->refcnt++;
 }
 
@@ -519,6 +528,7 @@ void blk_ref(BlockBackend *blk)
  */
 void blk_unref(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 if (blk) {
 assert(blk->refcnt > 0);
 if (blk->refcnt > 1) {
@@ -539,6 +549,7 @@ void blk_unref(BlockBackend *blk)
  */
 BlockBackend *blk_all_next(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk ? QTAILQ_NEXT(blk, link)
: QTAILQ_FIRST(_backends);
 }
@@ -547,6 +558,8 @@ void blk_remove_all_bs(void)
 {
 BlockBackend *blk = NULL;
 
+GLOBAL_STATE_CODE();
+
 while ((blk = blk_all_next(blk)) != NULL) {
 AioContext *ctx = blk_get_aio_context(blk);
 
@@ -570,6 +583,7 @@ void blk_remove_all_bs(void)
  */
 BlockBackend *blk_next(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk ? QTAILQ_NEXT(blk, monitor_link)
: QTAILQ_FIRST(_block_backends);
 }
@@ -636,6 +650,7 @@ static void bdrv_next_reset(BdrvNextIterator *it)
 
 BlockDriverState *bdrv_first(BdrvNextIterator *it)
 {
+GLOBAL_STATE_CODE();
 bdrv_next_reset(it);
 return bdrv_next(it);
 }
@@ -673,6 +688,7 @@ bool monitor_add_blk(BlockBackend *blk, const char *name, 
Error **errp)
 {
 assert(!blk->name);
 assert(name && name[0]);
+GLOBAL_STATE_CODE();
 
 if (!id_wellformed(name)) {
 error_setg(errp, "Invalid device name");
@@ -700,6 +716,8 @@ bool monitor_add_blk(BlockBackend *blk, const char *name, 
Error **errp)
  */
 void monitor_remove_blk(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
+
 if (!blk->name) {
 return;
 }
@@ -726,6 +744,7 @@ BlockBackend *blk_by_name(const char *name)
 {
 BlockBackend *blk = NULL;
 
+GLOBAL_STATE_CODE();
 assert(name);
 while ((blk = blk_next(blk)) != NULL) {
 if (!strcmp(name, blk->name)) {
@@ -760,6 +779,7 @@ static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
  */
 bool bdrv_has_blk(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 return bdrv_first_blk(bs) != NULL;
 }
 
@@ -770,6 +790,7 @@ bool bdrv_is_root_node(BlockDriverState *bs)
 {
 BdrvChild *c;
 
+GLOBAL_STATE_CODE();
 QLIST_FOREACH(c, >parents, next_parent) {
 if (c->klass != _root) {
 return false;
@@ -819,6 +840,7 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
  */
 BlockBackendPublic *blk_get_public(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return >public;
 }
 
@@ -827,6 +849,7 @@ BlockBackendPublic *blk_get_public(BlockBackend *blk)
  */
 BlockBackend *blk_by_public(BlockBackendPublic *public)
 {
+GLOBAL_STATE_CODE();
 return container_of(public, BlockBackend, public);
 }
 
@@ -838,6 +861,8 @@ void blk_remove_bs(BlockBackend *blk)
 ThrottleGroupMember *tgm = >public.throttle_group_member;
 BdrvChild *root;
 
+GLOBAL_STATE_CODE();
+
 notifier_list_notify(>remove_bs_notifiers, blk);
 if (tgm->throttle_state) {
 BlockDriverState *bs = blk_bs(blk);
@@ -872,6 +897,7 @@ void blk_remove_bs(BlockBackend *blk)
 int blk_insert_bs(BlockBackend *blk, 

[PATCH v8 01/31] main-loop.h: introduce qemu_in_main_thread()

2022-03-03 Thread Emanuele Giuseppe Esposito
When invoked from the main loop, this function is the same
as qemu_mutex_iothread_locked, and returns true if the BQL is held.
When invoked from iothreads or tests, it returns true only
if the current AioContext is the Main Loop.

This essentially just extends qemu_mutex_iothread_locked to work
also in unit tests or other users like storage-daemon, that run
in the Main Loop but end up using the implementation in
stubs/iothread-lock.c.

Using qemu_mutex_iothread_locked in unit tests defaults to false
because they use the implementation in stubs/iothread-lock,
making all assertions added in the next patches fail even though
the AioContext is still the main loop.

See the comment in the function header for more information.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/main-loop.h | 24 
 softmmu/cpus.c   |  5 +
 stubs/iothread-lock.c|  5 +
 3 files changed, 34 insertions(+)

diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 8dbc6fcb89..bc42b5939d 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -242,9 +242,33 @@ AioContext *iohandler_get_aio_context(void);
  * must always be taken outside other locks.  This function helps
  * functions take different paths depending on whether the current
  * thread is running within the main loop mutex.
+ *
+ * This function should never be used in the block layer, because
+ * unit tests, block layer tools and qemu-storage-daemon do not
+ * have a BQL.
+ * Please instead refer to qemu_in_main_thread().
  */
 bool qemu_mutex_iothread_locked(void);
 
+/**
+ * qemu_in_main_thread: return whether it's possible to safely access
+ * the global state of the block layer.
+ *
+ * Global state of the block layer is not accessible from I/O threads
+ * or worker threads; only from threads that "own" the default
+ * AioContext that qemu_get_aio_context() returns.  For tests, block
+ * layer tools and qemu-storage-daemon there is a designated thread that
+ * runs the event loop for qemu_get_aio_context(), and that is the
+ * main thread.
+ *
+ * For emulators, however, any thread that holds the BQL can act
+ * as the block layer main thread; this will be any of the actual
+ * main thread, the vCPU threads or the RCU thread.
+ *
+ * For clarity, do not use this function outside the block layer.
+ */
+bool qemu_in_main_thread(void);
+
 /**
  * qemu_mutex_lock_iothread: Lock the main loop mutex.
  *
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 035395ae13..422aa52746 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -480,6 +480,11 @@ bool qemu_mutex_iothread_locked(void)
 return iothread_locked;
 }
 
+bool qemu_in_main_thread(void)
+{
+return qemu_mutex_iothread_locked();
+}
+
 /*
  * The BQL is taken from so many places that it is worth profiling the
  * callers directly, instead of funneling them all through a single function.
diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c
index 5b45b7fc8b..ff7386e42c 100644
--- a/stubs/iothread-lock.c
+++ b/stubs/iothread-lock.c
@@ -6,6 +6,11 @@ bool qemu_mutex_iothread_locked(void)
 return false;
 }
 
+bool qemu_in_main_thread(void)
+{
+return qemu_get_current_aio_context() == qemu_get_aio_context();
+}
+
 void qemu_mutex_lock_iothread_impl(const char *file, int line)
 {
 }
-- 
2.31.1




[PATCH v8 11/31] include/block/block_int: split header into I/O and global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
Similarly to the previous patch, split block_int.h
into block_int-io.h and block_int-global-state.h.

block_int-common.h contains the structures shared between
the two headers, and the functions that can't be categorized as
I/O or global state.

Assertions are added in the next patch.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockdev.c |5 +
 include/block/block_int-common.h   | 1180 +++
 include/block/block_int-global-state.h |  312 +
 include/block/block_int-io.h   |  179 +++
 include/block/block_int.h  | 1489 +---
 5 files changed, 1679 insertions(+), 1486 deletions(-)
 create mode 100644 include/block/block_int-common.h
 create mode 100644 include/block/block_int-global-state.h
 create mode 100644 include/block/block_int-io.h

diff --git a/blockdev.c b/blockdev.c
index d601ae522e..52078e772f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -63,6 +63,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/throttle-options.h"
 
+/* Protected by BQL */
 QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);
 
@@ -1175,6 +1176,8 @@ typedef struct BlkActionState BlkActionState;
  *
  * Only prepare() may fail. In a single transaction, only one of commit() or
  * abort() will be called. clean() will always be called if it is present.
+ *
+ * Always run under BQL.
  */
 typedef struct BlkActionOps {
 size_t instance_size;
@@ -2284,6 +2287,8 @@ static TransactionProperties *get_transaction_properties(
 /*
  * 'Atomic' group operations.  The operations are performed as a set, and if
  * any fail then we roll back all operations in the group.
+ *
+ * Always run under BQL.
  */
 void qmp_transaction(TransactionActionList *dev_list,
  bool has_props,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
new file mode 100644
index 00..b92e3630fd
--- /dev/null
+++ b/include/block/block_int-common.h
@@ -0,0 +1,1180 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_INT_COMMON_H
+#define BLOCK_INT_COMMON_H
+
+#include "block/accounting.h"
+#include "block/block.h"
+#include "block/aio-wait.h"
+#include "qemu/queue.h"
+#include "qemu/coroutine.h"
+#include "qemu/stats64.h"
+#include "qemu/timer.h"
+#include "qemu/hbitmap.h"
+#include "block/snapshot.h"
+#include "qemu/throttle.h"
+#include "qemu/rcu.h"
+
+#define BLOCK_FLAG_LAZY_REFCOUNTS   8
+
+#define BLOCK_OPT_SIZE  "size"
+#define BLOCK_OPT_ENCRYPT   "encryption"
+#define BLOCK_OPT_ENCRYPT_FORMAT"encrypt.format"
+#define BLOCK_OPT_COMPAT6   "compat6"
+#define BLOCK_OPT_HWVERSION "hwversion"
+#define BLOCK_OPT_BACKING_FILE  "backing_file"
+#define BLOCK_OPT_BACKING_FMT   "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE  "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE"table_size"
+#define BLOCK_OPT_PREALLOC  "preallocation"
+#define BLOCK_OPT_SUBFMT"subformat"
+#define BLOCK_OPT_COMPAT_LEVEL  "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS"lazy_refcounts"
+#define BLOCK_OPT_ADAPTER_TYPE  "adapter_type"
+#define BLOCK_OPT_REDUNDANCY"redundancy"
+#define BLOCK_OPT_NOCOW "nocow"
+#define BLOCK_OPT_EXTENT_SIZE_HINT  "extent_size_hint"
+#define BLOCK_OPT_OBJECT_SIZE   "object_size"
+#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"
+#define BLOCK_OPT_DATA_FILE "data_file"
+#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw"
+#define BLOCK_OPT_COMPRESSION_TYPE  "compression_type"
+#define BLOCK_OPT_EXTL2 "extended_l2"
+
+#define BLOCK_PROBE_BUF_SIZE512
+
+enum BdrvTrackedRequestType {
+BDRV_TRACKED_READ,
+BDRV_TRACKED_WRITE,
+BDRV_TRACKED_DISCARD,
+BDRV_TRACKED_TRUNCATE,
+};
+
+/*
+ * That is not 

[PATCH v8 12/31] assertions for block_int global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c | 15 +++
 block/backup.c  |  1 +
 block/block-backend.c   |  3 +++
 block/commit.c  |  2 ++
 block/dirty-bitmap.c|  1 +
 block/io.c  |  1 +
 block/mirror.c  |  4 
 block/monitor/bitmap-qmp-cmds.c |  6 ++
 block/stream.c  |  2 ++
 blockdev.c  |  7 +++
 10 files changed, 42 insertions(+)

diff --git a/block.c b/block.c
index 7d4a5440de..3bf2689a99 100644
--- a/block.c
+++ b/block.c
@@ -665,6 +665,8 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver 
*drv,
 Error *local_err = NULL;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
 prealloc = qapi_enum_parse(_lookup, buf,
@@ -2504,6 +2506,8 @@ void bdrv_get_cumulative_perm(BlockDriverState *bs, 
uint64_t *perm,
 uint64_t cumulative_perms = 0;
 uint64_t cumulative_shared_perms = BLK_PERM_ALL;
 
+GLOBAL_STATE_CODE();
+
 QLIST_FOREACH(c, >parents, next_parent) {
 cumulative_perms |= c->perm;
 cumulative_shared_perms &= c->shared_perm;
@@ -2562,6 +2566,8 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
 Transaction *tran = tran_new();
 int ret;
 
+GLOBAL_STATE_CODE();
+
 bdrv_child_set_perm(c, perm, shared, tran);
 
 ret = bdrv_refresh_perms(c->bs, _err);
@@ -2592,6 +2598,8 @@ int bdrv_child_refresh_perms(BlockDriverState *bs, 
BdrvChild *c, Error **errp)
 uint64_t parent_perms, parent_shared;
 uint64_t perms, shared;
 
+GLOBAL_STATE_CODE();
+
 bdrv_get_cumulative_perm(bs, _perms, _shared);
 bdrv_child_perm(bs, c->bs, c, c->role, NULL,
 parent_perms, parent_shared, , );
@@ -2736,6 +2744,7 @@ void bdrv_default_perms(BlockDriverState *bs, BdrvChild 
*c,
 uint64_t perm, uint64_t shared,
 uint64_t *nperm, uint64_t *nshared)
 {
+GLOBAL_STATE_CODE();
 if (role & BDRV_CHILD_FILTERED) {
 assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
  BDRV_CHILD_COW)));
@@ -3093,6 +3102,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
 BdrvChild *child = NULL;
 Transaction *tran = tran_new();
 
+GLOBAL_STATE_CODE();
+
 ret = bdrv_attach_child_common(child_bs, child_name, child_class,
child_role, perm, shared_perm, opaque,
, tran, errp);
@@ -7486,6 +7497,8 @@ bool bdrv_recurse_can_replace(BlockDriverState *bs,
 {
 BlockDriverState *filtered;
 
+GLOBAL_STATE_CODE();
+
 if (!bs || !bs->drv) {
 return false;
 }
@@ -7657,6 +7670,7 @@ static bool append_strong_runtime_options(QDict *d, 
BlockDriverState *bs)
  * would result in exactly bs->backing. */
 static bool bdrv_backing_overridden(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 if (bs->backing) {
 return strcmp(bs->auto_backing_file,
   bs->backing->bs->filename);
@@ -8045,6 +8059,7 @@ static BlockDriverState 
*bdrv_do_skip_filters(BlockDriverState *bs,
  */
 BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 return bdrv_do_skip_filters(bs, true);
 }
 
diff --git a/block/backup.c b/block/backup.c
index 21d5983779..5cfd0b999c 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -372,6 +372,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 
 assert(bs);
 assert(target);
+GLOBAL_STATE_CODE();
 
 /* QMP interface protects us from these cases */
 assert(sync_mode != MIRROR_SYNC_MODE_INCREMENTAL);
diff --git a/block/block-backend.c b/block/block-backend.c
index be7adce246..93dc5cd8d6 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1106,6 +1106,7 @@ static void blk_root_change_media(BdrvChild *child, bool 
load)
  */
 bool blk_dev_has_removable_media(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
 }
 
@@ -1123,6 +1124,7 @@ bool blk_dev_has_tray(BlockBackend *blk)
  */
 void blk_dev_eject_request(BlockBackend *blk, bool force)
 {
+GLOBAL_STATE_CODE();
 if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
 blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
 }
@@ -1145,6 +1147,7 @@ bool blk_dev_is_tray_open(BlockBackend *blk)
  */
 bool blk_dev_is_medium_locked(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
 return blk->dev_ops->is_medium_locked(blk->dev_opaque);
 }
diff --git a/block/commit.c b/block/commit.c
index 2ce6637ca6..c76899f640 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -253,6 +253,8 @@ void commit_start(const char 

[PATCH v8 13/31] IO_CODE and IO_OR_GS_CODE for block_int I/O API

2022-03-03 Thread Emanuele Giuseppe Esposito
Mark all I/O functions with IO_CODE, and all "I/O OR GS" with
IO_OR_GS_CODE.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c  | 14 +-
 block/block-backend.c|  2 ++
 block/dirty-bitmap.c |  3 +++
 block/io.c   | 13 +
 include/block/block_int-io.h |  6 ++
 5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 3bf2689a99..cf656e5b26 100644
--- a/block.c
+++ b/block.c
@@ -999,6 +999,7 @@ BlockDriver *bdrv_probe_all(const uint8_t *buf, int 
buf_size,
 {
 int score_max = 0, score;
 BlockDriver *drv = NULL, *d;
+IO_CODE();
 
 QLIST_FOREACH(d, _drivers, list) {
 if (d->bdrv_probe) {
@@ -1051,6 +1052,7 @@ static int find_image_format(BlockBackend *file, const 
char *filename,
 int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
 {
 BlockDriver *drv = bs->drv;
+IO_CODE();
 
 if (!drv) {
 return -ENOMEDIUM;
@@ -6197,6 +6199,7 @@ const char *bdrv_get_parent_name(const BlockDriverState 
*bs)
 {
 BdrvChild *c;
 const char *name;
+IO_CODE();
 
 /* If multiple parents have a name, just pick the first one. */
 QLIST_FOREACH(c, >parents, next_parent) {
@@ -7933,6 +7936,8 @@ int bdrv_make_empty(BdrvChild *c, Error **errp)
  */
 BdrvChild *bdrv_cow_child(BlockDriverState *bs)
 {
+IO_CODE();
+
 if (!bs || !bs->drv) {
 return NULL;
 }
@@ -7956,6 +7961,7 @@ BdrvChild *bdrv_cow_child(BlockDriverState *bs)
 BdrvChild *bdrv_filter_child(BlockDriverState *bs)
 {
 BdrvChild *c;
+IO_CODE();
 
 if (!bs || !bs->drv) {
 return NULL;
@@ -7987,6 +7993,7 @@ BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
 {
 BdrvChild *cow_child = bdrv_cow_child(bs);
 BdrvChild *filter_child = bdrv_filter_child(bs);
+IO_CODE();
 
 /* Filter nodes cannot have COW backing files */
 assert(!(cow_child && filter_child));
@@ -8007,6 +8014,7 @@ BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
 BdrvChild *bdrv_primary_child(BlockDriverState *bs)
 {
 BdrvChild *c, *found = NULL;
+IO_CODE();
 
 QLIST_FOREACH(c, >children, next) {
 if (c->role & BDRV_CHILD_PRIMARY) {
@@ -8069,6 +8077,7 @@ BlockDriverState 
*bdrv_skip_implicit_filters(BlockDriverState *bs)
  */
 BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_do_skip_filters(bs, false);
 }
 
@@ -8078,6 +8087,7 @@ BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
  */
 BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
 }
 
@@ -8113,8 +8123,8 @@ static bool 
bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
  */
 bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
 {
+IO_CODE();
 RCU_READ_LOCK_GUARD();
-
 return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
 }
 
@@ -8124,6 +8134,7 @@ bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t 
offset, int64_t *pnum)
 void bdrv_bsc_invalidate_range(BlockDriverState *bs,
int64_t offset, int64_t bytes)
 {
+IO_CODE();
 RCU_READ_LOCK_GUARD();
 
 if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
@@ -8138,6 +8149,7 @@ void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, 
int64_t bytes)
 {
 BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
 BdrvBlockStatusCache *old_bsc;
+IO_CODE();
 
 *new_bsc = (BdrvBlockStatusCache) {
 .valid = true,
diff --git a/block/block-backend.c b/block/block-backend.c
index 93dc5cd8d6..7d2181ec01 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1115,6 +1115,7 @@ bool blk_dev_has_removable_media(BlockBackend *blk)
  */
 bool blk_dev_has_tray(BlockBackend *blk)
 {
+IO_CODE();
 return blk->dev_ops && blk->dev_ops->is_tray_open;
 }
 
@@ -1135,6 +1136,7 @@ void blk_dev_eject_request(BlockBackend *blk, bool force)
  */
 bool blk_dev_is_tray_open(BlockBackend *blk)
 {
+IO_CODE();
 if (blk_dev_has_tray(blk)) {
 return blk->dev_ops->is_tray_open(blk->dev_opaque);
 }
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index e2a1648deb..0334b85805 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -657,6 +657,7 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
 
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
 {
+IO_CODE();
 assert(!bdrv_dirty_bitmap_readonly(bitmap));
 bdrv_dirty_bitmaps_lock(bitmap->bs);
 if (!out) {
@@ -739,6 +740,7 @@ void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap 
*bitmap)
 void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
 BdrvDirtyBitmap *bitmap;
+IO_CODE();
 
 if (QLIST_EMPTY(>dirty_bitmaps)) {
 return;
@@ -930,6 +932,7 @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
  

[PATCH v8 03/31] include/block/block: split header into I/O and global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
block.h currently contains a mix of functions:
some of them run under the BQL and modify the block layer graph,
others are instead thread-safe and perform I/O in iothreads.
Some others can only be called by either the main loop or the
iothread running the AioContext (and not other iothreads),
and using them in another thread would cause deadlocks, and therefore
it is not ideal to define them as I/O.

It is not easy to understand which function is part of which
group (I/O vs GS vs "I/O or GS"), and this patch aims to clarify it.

The "GS" functions need the BQL, and often use
aio_context_acquire/release and/or drain to be sure they
can modify the graph safely.
The I/O functions are instead thread-safe, and can run in
any AioContext.
"I/O or GS" functions run instead in the main loop or in
a single iothread, and use BDRV_POLL_WHILE().

By splitting the header in two files, block-io.h
and block-global-state.h we have a clearer view on what
needs what kind of protection. block-common.h
contains common structures shared by both headers.

block.h is left there for legacy and to avoid changing
all includes in all c files that use the block APIs.

Assertions are added in the next patch.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c|   3 +
 block/meson.build  |   7 +-
 include/block/block-common.h   | 418 ++
 include/block/block-global-state.h | 252 +
 include/block/block-io.h   | 369 
 include/block/block.h  | 879 +
 6 files changed, 1070 insertions(+), 858 deletions(-)
 create mode 100644 include/block/block-common.h
 create mode 100644 include/block/block-global-state.h
 create mode 100644 include/block/block-io.h

diff --git a/block.c b/block.c
index df353d55e8..7483dfaddc 100644
--- a/block.c
+++ b/block.c
@@ -67,12 +67,15 @@
 
 #define NOT_DONE 0x7fff /* used while emulated sync operation in progress 
*/
 
+/* Protected by BQL */
 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
 
+/* Protected by BQL */
 static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
 
+/* Protected by BQL */
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
 QLIST_HEAD_INITIALIZER(bdrv_drivers);
 
diff --git a/block/meson.build b/block/meson.build
index 8a1ce58c9c..e42bcb58d5 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -131,8 +131,11 @@ block_ss.add(module_block_h)
 wrapper_py = find_program('../scripts/block-coroutine-wrapper.py')
 block_gen_c = custom_target('block-gen.c',
 output: 'block-gen.c',
-input: files('../include/block/block.h',
- 'coroutines.h'),
+input: files(
+  '../include/block/block-io.h',
+  '../include/block/block-global-state.h',
+  'coroutines.h'
+  ),
 command: [wrapper_py, '@OUTPUT@', '@INPUT@'])
 block_ss.add(block_gen_c)
 
diff --git a/include/block/block-common.h b/include/block/block-common.h
new file mode 100644
index 00..0c5dc4a86a
--- /dev/null
+++ b/include/block/block-common.h
@@ -0,0 +1,418 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_COMMON_H
+#define BLOCK_COMMON_H
+
+#include "block/aio.h"
+#include "block/aio-wait.h"
+#include "qemu/iov.h"
+#include "qemu/coroutine.h"
+#include "block/accounting.h"
+#include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
+#include "qemu/hbitmap.h"
+#include "qemu/transactions.h"
+
+/*
+ * generated_co_wrapper
+ *
+ * Function specifier, which does nothing but 

[PATCH v8 05/31] IO_CODE and IO_OR_GS_CODE for block I/O API

2022-03-03 Thread Emanuele Giuseppe Esposito
Mark all I/O functions with IO_CODE, and all "I/O OR GS" with
IO_OR_GS_CODE.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c  | 38 ++-
 block/dirty-bitmap.c |  1 +
 block/io.c   | 43 ++--
 include/block/block-io.h |  1 +
 4 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index f59f290653..2a7df2a013 100644
--- a/block.c
+++ b/block.c
@@ -137,6 +137,7 @@ size_t bdrv_opt_mem_align(BlockDriverState *bs)
 /* page size or 4k (hdd sector size) should be on the safe side */
 return MAX(4096, qemu_real_host_page_size);
 }
+IO_CODE();
 
 return bs->bl.opt_mem_alignment;
 }
@@ -147,6 +148,7 @@ size_t bdrv_min_mem_align(BlockDriverState *bs)
 /* page size or 4k (hdd sector size) should be on the safe side */
 return MAX(4096, qemu_real_host_page_size);
 }
+IO_CODE();
 
 return bs->bl.min_mem_alignment;
 }
@@ -272,12 +274,15 @@ void bdrv_parse_filename_strip_prefix(const char 
*filename, const char *prefix,
  * image is inactivated. */
 bool bdrv_is_read_only(BlockDriverState *bs)
 {
+IO_CODE();
 return !(bs->open_flags & BDRV_O_RDWR);
 }
 
 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
bool ignore_allow_rdw, Error **errp)
 {
+IO_CODE();
+
 /* Do not set read_only if copy_on_read is enabled */
 if (bs->copy_on_read && read_only) {
 error_setg(errp, "Can't set node '%s' to r/o with copy-on-read 
enabled",
@@ -311,6 +316,7 @@ int bdrv_apply_auto_read_only(BlockDriverState *bs, const 
char *errmsg,
   Error **errp)
 {
 int ret = 0;
+IO_CODE();
 
 if (!(bs->open_flags & BDRV_O_RDWR)) {
 return 0;
@@ -757,6 +763,7 @@ int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, 
Error **errp)
 Error *local_err = NULL;
 int ret;
 
+IO_CODE();
 assert(bs != NULL);
 
 if (!bs->drv) {
@@ -782,6 +789,7 @@ void coroutine_fn 
bdrv_co_delete_file_noerr(BlockDriverState *bs)
 {
 Error *local_err = NULL;
 int ret;
+IO_CODE();
 
 if (!bs) {
 return;
@@ -1442,6 +1450,7 @@ static int bdrv_child_cb_update_filename(BdrvChild *c, 
BlockDriverState *base,
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
 {
 BlockDriverState *bs = c->opaque;
+IO_CODE();
 
 return bdrv_get_aio_context(bs);
 }
@@ -1464,6 +1473,7 @@ const BdrvChildClass child_of_bds = {
 
 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
 {
+IO_CODE();
 return c->klass->get_parent_aio_context(c);
 }
 
@@ -2077,6 +2087,7 @@ static bool 
bdrv_is_writable_after_reopen(BlockDriverState *bs,
  */
 bool bdrv_is_writable(BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_is_writable_after_reopen(bs, NULL);
 }
 
@@ -5706,6 +5717,8 @@ static int64_t 
bdrv_sum_allocated_file_size(BlockDriverState *bs)
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
+IO_CODE();
+
 if (!drv) {
 return -ENOMEDIUM;
 }
@@ -5755,6 +5768,7 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
BlockDriverState *in_bs, Error **errp)
 {
+IO_CODE();
 if (!drv->bdrv_measure) {
 error_setg(errp, "Block driver '%s' does not support size measurement",
drv->format_name);
@@ -5770,6 +5784,7 @@ BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts 
*opts,
 int64_t bdrv_nb_sectors(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
+IO_CODE();
 
 if (!drv)
 return -ENOMEDIUM;
@@ -5790,6 +5805,7 @@ int64_t bdrv_nb_sectors(BlockDriverState *bs)
 int64_t bdrv_getlength(BlockDriverState *bs)
 {
 int64_t ret = bdrv_nb_sectors(bs);
+IO_CODE();
 
 if (ret < 0) {
 return ret;
@@ -5804,12 +5820,14 @@ int64_t bdrv_getlength(BlockDriverState *bs)
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
 {
 int64_t nb_sectors = bdrv_nb_sectors(bs);
+IO_CODE();
 
 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
 }
 
 bool bdrv_is_sg(BlockDriverState *bs)
 {
+IO_CODE();
 return bs->sg;
 }
 
@@ -5819,6 +5837,7 @@ bool bdrv_is_sg(BlockDriverState *bs)
 bool bdrv_supports_compressed_writes(BlockDriverState *bs)
 {
 BlockDriverState *filtered;
+IO_CODE();
 
 if (!bs->drv || !block_driver_can_compress(bs->drv)) {
 return false;
@@ -5838,6 +5857,7 @@ bool bdrv_supports_compressed_writes(BlockDriverState *bs)
 
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
+IO_CODE();
 return bs->drv ? bs->drv->format_name : NULL;
 }
 
@@ -6146,6 +6166,7 @@ BlockDriverState *bdrv_next_all_states(BlockDriverState 
*bs)
 
 const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
+IO_CODE();
 return bs->node_name;
 

[PATCH v8 09/31] IO_CODE and IO_OR_GS_CODE for block-backend I/O API

2022-03-03 Thread Emanuele Giuseppe Esposito
Mark all I/O functions with IO_CODE, and all "I/O OR GS" with
IO_OR_GS_CODE.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c | 58 +++
 include/sysemu/block-backend-io.h |  2 ++
 2 files changed, 60 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 4476b61b8b..be7adce246 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -733,6 +733,7 @@ void monitor_remove_blk(BlockBackend *blk)
  */
 const char *blk_name(const BlockBackend *blk)
 {
+IO_CODE();
 return blk->name ?: "";
 }
 
@@ -759,6 +760,7 @@ BlockBackend *blk_by_name(const char *name)
  */
 BlockDriverState *blk_bs(BlockBackend *blk)
 {
+IO_CODE();
 return blk->root ? blk->root->bs : NULL;
 }
 
@@ -1009,6 +1011,7 @@ DeviceState *blk_get_attached_dev(BlockBackend *blk)
 char *blk_get_attached_dev_id(BlockBackend *blk)
 {
 DeviceState *dev = blk->dev;
+IO_CODE();
 
 if (!dev) {
 return g_strdup("");
@@ -1171,6 +1174,7 @@ void blk_iostatus_enable(BlockBackend *blk)
  * enables it _and_ the VM is configured to stop on errors */
 bool blk_iostatus_is_enabled(const BlockBackend *blk)
 {
+IO_CODE();
 return (blk->iostatus_enabled &&
(blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
 blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
@@ -1199,6 +1203,7 @@ void blk_iostatus_reset(BlockBackend *blk)
 
 void blk_iostatus_set_err(BlockBackend *blk, int error)
 {
+IO_CODE();
 assert(blk_iostatus_is_enabled(blk));
 if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
 blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
@@ -1208,16 +1213,19 @@ void blk_iostatus_set_err(BlockBackend *blk, int error)
 
 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
 {
+IO_CODE();
 blk->allow_write_beyond_eof = allow;
 }
 
 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
 {
+IO_CODE();
 blk->allow_aio_context_change = allow;
 }
 
 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
 {
+IO_CODE();
 blk->disable_request_queuing = disable;
 }
 
@@ -1301,6 +1309,7 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t 
offset,
BdrvRequestFlags flags)
 {
 int ret;
+IO_OR_GS_CODE();
 
 blk_inc_in_flight(blk);
 ret = blk_co_do_preadv(blk, offset, bytes, qiov, flags);
@@ -1352,6 +1361,7 @@ int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, 
int64_t offset,
  BdrvRequestFlags flags)
 {
 int ret;
+IO_OR_GS_CODE();
 
 blk_inc_in_flight(blk);
 ret = blk_co_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
@@ -1364,6 +1374,7 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, 
int64_t offset,
 int64_t bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags)
 {
+IO_OR_GS_CODE();
 return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
 }
 
@@ -1392,6 +1403,7 @@ typedef struct BlkRwCo {
 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
   int64_t bytes, BdrvRequestFlags flags)
 {
+IO_OR_GS_CODE();
 return blk_pwritev_part(blk, offset, bytes, NULL, 0,
 flags | BDRV_REQ_ZERO_WRITE);
 }
@@ -1404,11 +1416,13 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags 
flags)
 
 void blk_inc_in_flight(BlockBackend *blk)
 {
+IO_CODE();
 qatomic_inc(>in_flight);
 }
 
 void blk_dec_in_flight(BlockBackend *blk)
 {
+IO_CODE();
 qatomic_dec(>in_flight);
 aio_wait_kick();
 }
@@ -1427,6 +1441,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
   void *opaque, int ret)
 {
 struct BlockBackendAIOCB *acb;
+IO_CODE();
 
 blk_inc_in_flight(blk);
 acb = blk_aio_get(_backend_aiocb_info, blk, cb, opaque);
@@ -1534,6 +1549,7 @@ BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, 
int64_t offset,
   int64_t bytes, BdrvRequestFlags flags,
   BlockCompletionFunc *cb, void *opaque)
 {
+IO_CODE();
 return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_write_entry,
 flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
 }
@@ -1542,6 +1558,7 @@ int blk_pread(BlockBackend *blk, int64_t offset, void 
*buf, int bytes)
 {
 int ret;
 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+IO_OR_GS_CODE();
 
 blk_inc_in_flight(blk);
 ret = blk_do_preadv(blk, offset, bytes, , 0);
@@ -1555,6 +1572,7 @@ int blk_pwrite(BlockBackend *blk, int64_t offset, const 
void *buf, int bytes,
 {
 int ret;
 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+IO_OR_GS_CODE();
 
 ret = blk_pwritev_part(blk, offset, bytes, , 0, flags);
 
@@ -1563,6 +1581,7 @@ int blk_pwrite(BlockBackend *blk, int64_t 

[PATCH v8 07/31] include/sysemu/block-backend: split header into I/O and global state (GS) API

2022-03-03 Thread Emanuele Giuseppe Esposito
Similarly to the previous patches, split block-backend.h
in block-backend-io.h and block-backend-global-state.h

In addition, remove "block/block.h" include as it seems
it is not necessary anymore, together with "qemu/iov.h"

block-backend-common.h contains the structures shared between
the two headers, and the functions that can't be categorized as
I/O or global state.

Assertions are added in the next patch.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c   |   9 +-
 include/sysemu/block-backend-common.h   |  84 ++
 include/sysemu/block-backend-global-state.h | 116 +
 include/sysemu/block-backend-io.h   | 159 
 include/sysemu/block-backend.h  | 269 +---
 5 files changed, 368 insertions(+), 269 deletions(-)
 create mode 100644 include/sysemu/block-backend-common.h
 create mode 100644 include/sysemu/block-backend-global-state.h
 create mode 100644 include/sysemu/block-backend-io.h

diff --git a/block/block-backend.c b/block/block-backend.c
index 98bfcd5cf2..462e18facf 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -79,6 +79,7 @@ struct BlockBackend {
 bool allow_aio_context_change;
 bool allow_write_beyond_eof;
 
+/* Protected by BQL */
 NotifierList remove_bs_notifiers, insert_bs_notifiers;
 QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
 
@@ -111,12 +112,14 @@ static const AIOCBInfo block_backend_aiocb_info = {
 static void drive_info_del(DriveInfo *dinfo);
 static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
 
-/* All BlockBackends */
+/* All BlockBackends. Protected by BQL. */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
 QTAILQ_HEAD_INITIALIZER(block_backends);
 
-/* All BlockBackends referenced by the monitor and which are iterated through 
by
- * blk_next() */
+/*
+ * All BlockBackends referenced by the monitor and which are iterated through 
by
+ * blk_next(). Protected by BQL.
+ */
 static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
 QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
 
diff --git a/include/sysemu/block-backend-common.h 
b/include/sysemu/block-backend-common.h
new file mode 100644
index 00..6963bbf45a
--- /dev/null
+++ b/include/sysemu/block-backend-common.h
@@ -0,0 +1,84 @@
+/*
+ * QEMU Block backends
+ *
+ * Copyright (C) 2014-2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Markus Armbruster ,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later.  See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef BLOCK_BACKEND_COMMON_H
+#define BLOCK_BACKEND_COMMON_H
+
+#include "qemu/iov.h"
+#include "block/throttle-groups.h"
+
+/*
+ * TODO Have to include block/block.h for a bunch of block layer
+ * types.  Unfortunately, this pulls in the whole BlockDriverState
+ * API, which we don't want used by many BlockBackend users.  Some of
+ * the types belong here, and the rest should be split into a common
+ * header and one for the BlockDriverState API.
+ */
+#include "block/block.h"
+
+/* Callbacks for block device models */
+typedef struct BlockDevOps {
+/*
+ * Runs when virtual media changed (monitor commands eject, change)
+ * Argument load is true on load and false on eject.
+ * Beware: doesn't run when a host device's physical media
+ * changes.  Sure would be useful if it did.
+ * Device models with removable media must implement this callback.
+ */
+void (*change_media_cb)(void *opaque, bool load, Error **errp);
+/*
+ * Runs when an eject request is issued from the monitor, the tray
+ * is closed, and the medium is locked.
+ * Device models that do not implement is_medium_locked will not need
+ * this callback.  Device models that can lock the medium or tray might
+ * want to implement the callback and unlock the tray when "force" is
+ * true, even if they do not support eject requests.
+ */
+void (*eject_request_cb)(void *opaque, bool force);
+/*
+ * Is the virtual tray open?
+ * Device models implement this only when the device has a tray.
+ */
+bool (*is_tray_open)(void *opaque);
+/*
+ * Is the virtual medium locked into the device?
+ * Device models implement this only when device has such a lock.
+ */
+bool (*is_medium_locked)(void *opaque);
+/*
+ * Runs when the size changed (e.g. monitor command block_resize)
+ */
+void (*resize_cb)(void *opaque);
+/*
+ * Runs when the backend receives a drain request.
+ */
+void (*drained_begin)(void *opaque);
+/*
+ * Runs when the backend's last drain request ends.
+ */
+void (*drained_end)(void *opaque);
+/*
+ * Is the device still busy?
+ */
+bool (*drained_poll)(void *opaque);
+} BlockDevOps;
+
+/*
+ * This struct is embedded in (the private) BlockBackend struct and contains
+ * fields that must be public. This is in particular for 

[PATCH v8 00/31] block layer: split block APIs in global state and I/O

2022-03-03 Thread Emanuele Giuseppe Esposito
Currently, block layer APIs like block.h contain a mix of
functions that are either running in the main loop and under the
BQL, or are thread-safe functions and run in iothreads performing I/O.
The functions running under BQL also take care of modifying the
block graph, by using drain and/or aio_context_acquire/release.
This makes it very confusing to understand where each function
runs, and what assumptions it provides with regard to thread
safety.

We call the functions running under BQL "global state (GS) API", and
distinguish them from the thread-safe "I/O API".

The aim of this series is to split the relevant block headers in
global state and I/O sub-headers. The division will be done in
this way:
header.h will be split in header-global-state.h, header-io.h and
header-common.h. The latter will just contain the data structures
needed by header-global-state and header-io, and common helpers
that are neither in GS nor in I/O. header.h will remain for
legacy and to avoid changing all includes in all QEMU c files,
but will only include the two new headers. No function shall be
added in header.h.
Once we split all relevant headers, it will be much easier to see what
uses the AioContext lock and remove it, which is the overall main
goal of this and other series that I posted/will post.

In addition to splitting the relevant headers shown in this series,
it is also very helpful splitting the function pointers in some
block structures, to understand what runs under AioContext lock and
what doesn't. This is what patches 21-27 do.

Each function in the GS API will have an assertion, checking
that it is always running under BQL.
I/O functions are instead thread safe (or so should be), meaning
that they *can* run under BQL, but also in an iothread in another
AioContext. Therefore they do not provide any assertion, and
need to be audited manually to verify the correctness.

Adding assertions has helped find 2 bugs already, as shown in
my series "Migration: fix missing iothread locking".

Tested this series by running unit tests, qemu-iotests and qtests
(x86_64).
Some functions in the GS API are used everywhere but not
properly tested. Therefore their assertion is never actually run in
the tests, so despite my very careful auditing, it is not possible
to exclude that some will trigger while actually using QEMU.

Patch 1 introduces qemu_in_main_thread(), the function used in
all assertions. This had to be introduced otherwise all unit tests
would fail, since they run in the main loop but use the code in
stubs/iothread.c
Patches 2-27 (with the exception of patches 9-10, which add an additional
assertion) are all structured in the same way: first we split the header
and in the next (or same, if small) patch we add assertions.
Patches 28-31 instead take care of the block layer permission API,
fixing some bugs where it is used in I/O functions.

This series depends on my previous series "block layer: permission API
refactoring in preparation to the API split"

Based-on: <20220209105452.1694545-1-eespo...@redhat.com>

Signed-off-by: Emanuele Giuseppe Esposito 
---
v8:
bdrv_get_full_backing_filename to GLOBAL_STATE_CODE
blk_iostatus_is_enabled in IO_CODE
blk_iostatus_set_err in IO_CODE
bdrv_apply_auto_read_only in IO_CODE
bdrv_can_set_read_only in IO_CODE
blk_drain to GLOBAL_STATE_CODE

v7:
* crypto permissions and bdrv-activate patches sent in another series
* (*bdrv_probe) and (*get_name) are I/O
* add missing license header in block-common.h
* in block-common.h:
  bdrv_parse_cache_mode
  bdrv_parse_discard_flags
  bdrv_perm_names
  bdrv_qapi_perm_to_blk_perm
  bdrv_init_with_whitelist
  bdrv_uses_whitelist
  bdrv_is_whitelisted
* in block-io.h:
  bdrv_get_full_backing_filename
  bdrv_make_zero
  bdrv_aio_cancel
* Introduce new "Global OR I/O" category for the functions using
BDRV_POLL_WHILE
  Functions in this category:
BDRV_POLL_WHILE
bdrv_drain
bdrv_co_drain
bdrv_truncate
bdrv_check
bdrv_invalidate_cache
bdrv_flush
bdrv_pdiscard
bdrv_readv_vmstate
bdrv_writev_vmstate
bdrv_parent_drained_begin_single
bdrv_parent_drained_end_single
bdrv_drain_poll
bdrv_drained_begin
bdrv_do_drained_begin_quiesce
bdrv_subtree_drained_begin
bdrv_drained_end
bdrv_drained_end_no_poll
bdrv_subtree_drained_end
* better comment descriptions of common, GS, I/O and "I/O or GS" categories
* remove job_pre_run, we don't really need it.
* insert assertion GLOBAL_STATE_CODE, IO_CODE and IO_OR_GS_CODE macros
* replace all assert(qemu_in_main_thread()) with GLOBAL_STATE_CODE
* use IO_CODE and IO_OR_GS_CODE assertions, in additional patches

v6:
* Additional assertions in "block.c: add assertions to static functions"
* bdrv_co_invalidate_cache: create a new GS function bdrv_activate, and move
  all GS logic of bdrv_co_invalidate_cache there, so that the
  coroutine only runs I/O code. Move the resulting 3 patches before
  "block/coroutines: I/O API"
* crypto (patch 30): 

[PATCH v8 06/31] block/export/fuse.c: allow writable exports to take RESIZE permission

2022-03-03 Thread Emanuele Giuseppe Esposito
Allow writable exports to get BLK_PERM_RESIZE permission
from creation, in fuse_export_create().
In this way, there is no need to give the permission in
fuse_do_truncate(), which might be run in an iothread.

Permissions should be set only in the main thread, so
in any case if an iothread tries to set RESIZE, it will
be blocked.

Also assert in fuse_do_truncate that if we give the
RESIZE permission we can then restore the original ones.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Hanna Reitz 
---
 block/export/fuse.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/block/export/fuse.c b/block/export/fuse.c
index fdda8e3c81..5029e70f84 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -86,8 +86,8 @@ static int fuse_export_create(BlockExport *blk_exp,
 
 assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
 
-/* For growable exports, take the RESIZE permission */
-if (args->growable) {
+/* For growable and writable exports, take the RESIZE permission */
+if (args->growable || blk_exp_args->writable) {
 uint64_t blk_perm, blk_shared_perm;
 
 blk_get_perm(exp->common.blk, _perm, _shared_perm);
@@ -392,14 +392,23 @@ static int fuse_do_truncate(const FuseExport *exp, 
int64_t size,
 {
 uint64_t blk_perm, blk_shared_perm;
 BdrvRequestFlags truncate_flags = 0;
-int ret;
+bool add_resize_perm;
+int ret, ret_check;
+
+/* Growable and writable exports have a permanent RESIZE permission */
+add_resize_perm = !exp->growable && !exp->writable;
 
 if (req_zero_write) {
 truncate_flags |= BDRV_REQ_ZERO_WRITE;
 }
 
-/* Growable exports have a permanent RESIZE permission */
-if (!exp->growable) {
+if (add_resize_perm) {
+
+if (!qemu_in_main_thread()) {
+/* Changing permissions like below only works in the main thread */
+return -EPERM;
+}
+
 blk_get_perm(exp->common.blk, _perm, _shared_perm);
 
 ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
@@ -412,9 +421,11 @@ static int fuse_do_truncate(const FuseExport *exp, int64_t 
size,
 ret = blk_truncate(exp->common.blk, size, true, prealloc,
truncate_flags, NULL);
 
-if (!exp->growable) {
+if (add_resize_perm) {
 /* Must succeed, because we are only giving up the RESIZE permission */
-blk_set_perm(exp->common.blk, blk_perm, blk_shared_perm, _abort);
+ret_check = blk_set_perm(exp->common.blk, blk_perm,
+ blk_shared_perm, _abort);
+assert(ret_check == 0);
 }
 
 return ret;
-- 
2.31.1




[PATCH v8 04/31] assertions for block global state API

2022-03-03 Thread Emanuele Giuseppe Esposito
All the global state (GS) API functions will check that
qemu_in_main_thread() returns true. If not, it means
that the safety of BQL cannot be guaranteed, and
they need to be moved to I/O.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c| 124 -
 block/commit.c |   2 +
 block/io.c |  11 +
 blockdev.c |   1 +
 4 files changed, 136 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 7483dfaddc..f59f290653 100644
--- a/block.c
+++ b/block.c
@@ -387,12 +387,14 @@ static char *bdrv_make_absolute_filename(BlockDriverState 
*relative_to,
 
 char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
 {
+GLOBAL_STATE_CODE();
 return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
 }
 
 void bdrv_register(BlockDriver *bdrv)
 {
 assert(bdrv->format_name);
+GLOBAL_STATE_CODE();
 QLIST_INSERT_HEAD(_drivers, bdrv, list);
 }
 
@@ -401,6 +403,8 @@ BlockDriverState *bdrv_new(void)
 BlockDriverState *bs;
 int i;
 
+GLOBAL_STATE_CODE();
+
 bs = g_new0(BlockDriverState, 1);
 QLIST_INIT(>dirty_bitmaps);
 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
@@ -443,6 +447,8 @@ BlockDriver *bdrv_find_format(const char *format_name)
 BlockDriver *drv1;
 int i;
 
+GLOBAL_STATE_CODE();
+
 drv1 = bdrv_do_find_format(format_name);
 if (drv1) {
 return drv1;
@@ -492,6 +498,7 @@ static int bdrv_format_is_whitelisted(const char 
*format_name, bool read_only)
 
 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
 {
+GLOBAL_STATE_CODE();
 return bdrv_format_is_whitelisted(drv->format_name, read_only);
 }
 
@@ -527,6 +534,8 @@ int bdrv_create(BlockDriver *drv, const char* filename,
 {
 int ret;
 
+GLOBAL_STATE_CODE();
+
 Coroutine *co;
 CreateCo cco = {
 .drv = drv,
@@ -702,6 +711,8 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, 
Error **errp)
 QDict *qdict;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 drv = bdrv_find_protocol(filename, true, errp);
 if (drv == NULL) {
 return -ENOENT;
@@ -799,6 +810,7 @@ int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes 
*bsz)
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *filtered = bdrv_filter_bs(bs);
+GLOBAL_STATE_CODE();
 
 if (drv && drv->bdrv_probe_blocksizes) {
 return drv->bdrv_probe_blocksizes(bs, bsz);
@@ -819,6 +831,7 @@ int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry 
*geo)
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *filtered = bdrv_filter_bs(bs);
+GLOBAL_STATE_CODE();
 
 if (drv && drv->bdrv_probe_geometry) {
 return drv->bdrv_probe_geometry(bs, geo);
@@ -910,6 +923,7 @@ BlockDriver *bdrv_find_protocol(const char *filename,
 const char *p;
 int i;
 
+GLOBAL_STATE_CODE();
 /* TODO Drivers without bdrv_file_open must be specified explicitly */
 
 /*
@@ -1634,6 +1648,8 @@ BlockDriverState *bdrv_new_open_driver_opts(BlockDriver 
*drv,
 BlockDriverState *bs;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 bs = bdrv_new();
 bs->open_flags = flags;
 bs->options = options ?: qdict_new();
@@ -1659,6 +1675,7 @@ BlockDriverState *bdrv_new_open_driver_opts(BlockDriver 
*drv,
 BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
int flags, Error **errp)
 {
+GLOBAL_STATE_CODE();
 return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
 }
 
@@ -3094,6 +3111,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 BdrvChild *child = NULL;
 Transaction *tran = tran_new();
 
+GLOBAL_STATE_CODE();
+
 ret = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, 
child_class,
child_role, , tran, errp);
 if (ret < 0) {
@@ -3120,6 +3139,8 @@ void bdrv_root_unref_child(BdrvChild *child)
 {
 BlockDriverState *child_bs;
 
+GLOBAL_STATE_CODE();
+
 child_bs = child->bs;
 bdrv_detach_child();
 bdrv_unref(child_bs);
@@ -3194,6 +3215,7 @@ static void bdrv_unset_inherits_from(BlockDriverState 
*root, BdrvChild *child,
 /* Callers must ensure that child->frozen is false. */
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
 {
+GLOBAL_STATE_CODE();
 if (child == NULL) {
 return;
 }
@@ -3344,6 +3366,7 @@ int bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 int ret;
 Transaction *tran = tran_new();
 
+GLOBAL_STATE_CODE();
 bdrv_drained_begin(bs);
 
 ret = bdrv_set_backing_noperm(bs, backing_hd, tran, errp);
@@ -3383,6 +3406,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*parent_options,
 QDict *tmp_parent_options = NULL;
 Error *local_err = NULL;
 
+GLOBAL_STATE_CODE();
+
 if (bs->backing != NULL) {
 goto free_exit;
 }
@@ -3542,6 +3567,8 @@ BdrvChild *bdrv_open_child(const char 

[PATCH v8 02/31] main loop: macros to mark GS and I/O functions

2022-03-03 Thread Emanuele Giuseppe Esposito
Right now, IO_CODE and IO_OR_GS_CODE are nop, as there isn't
really a way to check that a function is only called in I/O.
On the other side, we can use qemu_in_main_thread() to check if
we are in the main loop.

The usage of macros makes it easy to extend them in the future without
making changes in all callers. They will also visually help understanding
in which category each function is, without looking at the header.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/main-loop.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index bc42b5939d..7a4d6a0920 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -269,6 +269,24 @@ bool qemu_mutex_iothread_locked(void);
  */
 bool qemu_in_main_thread(void);
 
+/* Mark and check that the function is part of the global state API. */
+#define GLOBAL_STATE_CODE() \
+do {\
+assert(qemu_in_main_thread());  \
+} while (0)
+
+/* Mark and check that the function is part of the I/O API. */
+#define IO_CODE()   \
+do {\
+/* nop */   \
+} while (0)
+
+/* Mark and check that the function is part of the "I/O OR GS" API. */
+#define IO_OR_GS_CODE() \
+do {\
+/* nop */   \
+} while (0)
+
 /**
  * qemu_mutex_lock_iothread: Lock the main loop mutex.
  *
-- 
2.31.1




[PATCH v2 4/4] util/event-loop-base: Introduce options to set the thread pool size

2022-03-03 Thread Nicolas Saenz Julienne
The thread pool regulates itself: when idle, it kills threads until
empty, when in demand, it creates new threads until full. This behaviour
doesn't play well with latency sensitive workloads where the price of
creating a new thread is too high. For example, when paired with qemu's
'-mlock', or using safety features like SafeStack, creating a new thread
has been measured to take multiple milliseconds.

In order to mitigate this let's introduce a new 'EventLoopBase'
property to set the thread pool size. The threads will be created during
the pool's initialization or upon updating the property's value, remain
available during its lifetime regardless of demand, and destroyed upon
freeing it. A properly characterized workload will then be able to
configure the pool to avoid any latency spikes.

Signed-off-by: Nicolas Saenz Julienne 
---

Changes since v1:
 - Add INT_MAX check
 - Have copy of thread pool sizes in AioContext to properly decouple
   both instances
 - More coherent variable naming
 - Handle case where max_threads decreases
 - Code comments

 event-loop-base.c| 23 +
 include/block/aio.h  | 10 ++
 include/block/thread-pool.h  |  3 ++
 include/sysemu/event-loop-base.h |  4 +++
 iothread.c   |  3 ++
 qapi/qom.json| 22 ++--
 util/aio-posix.c |  1 +
 util/async.c | 20 +++
 util/main-loop.c |  9 +
 util/thread-pool.c   | 59 +---
 10 files changed, 148 insertions(+), 6 deletions(-)

diff --git a/event-loop-base.c b/event-loop-base.c
index e7f99a6ec8..d5be4dc6fc 100644
--- a/event-loop-base.c
+++ b/event-loop-base.c
@@ -14,6 +14,7 @@
 #include "qemu/osdep.h"
 #include "qom/object_interfaces.h"
 #include "qapi/error.h"
+#include "block/thread-pool.h"
 #include "sysemu/event-loop-base.h"
 
 typedef struct {
@@ -21,9 +22,22 @@ typedef struct {
 ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */
 } EventLoopBaseParamInfo;
 
+static void event_loop_base_instance_init(Object *obj)
+{
+EventLoopBase *base = EVENT_LOOP_BASE(obj);
+
+base->thread_pool_max = THREAD_POOL_MAX_THREADS_DEFAULT;
+}
+
 static EventLoopBaseParamInfo aio_max_batch_info = {
 "aio-max-batch", offsetof(EventLoopBase, aio_max_batch),
 };
+static EventLoopBaseParamInfo thread_pool_min_info = {
+"thread-pool-min", offsetof(EventLoopBase, thread_pool_min),
+};
+static EventLoopBaseParamInfo thread_pool_max_info = {
+"thread-pool-max", offsetof(EventLoopBase, thread_pool_max),
+};
 
 static void event_loop_base_get_param(Object *obj, Visitor *v,
 const char *name, void *opaque, Error **errp)
@@ -95,12 +109,21 @@ static void event_loop_base_class_init(ObjectClass *klass, 
void *class_data)
   event_loop_base_get_param,
   event_loop_base_set_param,
   NULL, _max_batch_info);
+object_class_property_add(klass, "thread-pool-min", "int",
+  event_loop_base_get_param,
+  event_loop_base_set_param,
+  NULL, _pool_min_info);
+object_class_property_add(klass, "thread-pool-max", "int",
+  event_loop_base_get_param,
+  event_loop_base_set_param,
+  NULL, _pool_max_info);
 }
 
 static const TypeInfo event_loop_base_info = {
 .name = TYPE_EVENT_LOOP_BASE,
 .parent = TYPE_OBJECT,
 .instance_size = sizeof(EventLoopBase),
+.instance_init = event_loop_base_instance_init,
 .class_size = sizeof(EventLoopBaseClass),
 .class_init = event_loop_base_class_init,
 .abstract = true,
diff --git a/include/block/aio.h b/include/block/aio.h
index 5634173b12..d128558f1d 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -192,6 +192,8 @@ struct AioContext {
 QSLIST_HEAD(, Coroutine) scheduled_coroutines;
 QEMUBH *co_schedule_bh;
 
+int thread_pool_min;
+int thread_pool_max;
 /* Thread pool for performing work and receiving completion callbacks.
  * Has its own locking.
  */
@@ -769,4 +771,12 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t 
max_ns,
 void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
 Error **errp);
 
+/**
+ * aio_context_set_thread_pool_params:
+ * @ctx: the aio context
+ * @min: min number of threads to have readily available in the thread pool
+ * @max: max number of threads the thread pool can contain
+ */
+void aio_context_set_thread_pool_params(AioContext *ctx, int64_t min,
+int64_t max, Error **errp);
 #endif
diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h
index 7dd7d730a0..2020bcc92d 100644
--- a/include/block/thread-pool.h
+++ b/include/block/thread-pool.h
@@ -20,6 

[PATCH v2 2/4] Introduce event-loop-base abstract class

2022-03-03 Thread Nicolas Saenz Julienne
Introduce the 'event-loop-base' abstract class, it'll hold the
properties common to all event loops and provide the necessary hooks for
their creation and maintenance. Then have iothread inherit from it.

EventLoopBaseClass is defined as user creatable and provides a hook for
its children to attach themselves to the user creatable class 'complete'
function. It also provides an update_params() callback to propagate
property changes onto its children.

The new 'event-loop-base' class will live in the root directory, and it
imposes new compilation dependencies:

qom <- event-loop-base <- blockdev (iothread)

It is built on its own using the link_whole option as there are no
direct function dependencies between the class and its children
(everything happens through the 'constructor' attribute). All this forced
some amount of reordering in meson.build, among other things the 'hw'
subdir is processed earlier as it introduces files into the 'qom' source
set.

No functional changes intended.

Signed-off-by: Nicolas Saenz Julienne 
---

Changes since v1:
 - Rename to event-loop-base
 - Move event-loop-base into root directory
 - Build event-loop-base on its own, use link_whole to avoid the problem
   of the object file not being linked due to lacking direct calls from
   dependencies.
 - Move poll parameters into iothread, as main loop can't poll
 - Update Authorship (I took what iothread.c had and added myself, I
   hope that's fine)
 - Introduce update_params() callback

 event-loop-base.c| 104 +++
 include/sysemu/event-loop-base.h |  36 +++
 include/sysemu/iothread.h|   6 +-
 iothread.c   |  65 ++-
 meson.build  |  23 ---
 5 files changed, 175 insertions(+), 59 deletions(-)
 create mode 100644 event-loop-base.c
 create mode 100644 include/sysemu/event-loop-base.h

diff --git a/event-loop-base.c b/event-loop-base.c
new file mode 100644
index 00..a924c73a7c
--- /dev/null
+++ b/event-loop-base.c
@@ -0,0 +1,104 @@
+/*
+ * QEMU event-loop base
+ *
+ * Copyright (C) 2022 Red Hat Inc
+ *
+ * Authors:
+ *  Stefan Hajnoczi 
+ *  Nicolas Saenz Julienne 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qom/object_interfaces.h"
+#include "qapi/error.h"
+#include "sysemu/event-loop-base.h"
+
+typedef struct {
+const char *name;
+ptrdiff_t offset; /* field's byte offset in EventLoopBase struct */
+} EventLoopBaseParamInfo;
+
+static EventLoopBaseParamInfo aio_max_batch_info = {
+"aio-max-batch", offsetof(EventLoopBase, aio_max_batch),
+};
+
+static void event_loop_base_get_param(Object *obj, Visitor *v,
+const char *name, void *opaque, Error **errp)
+{
+EventLoopBase *event_loop_base = EVENT_LOOP_BASE(obj);
+EventLoopBaseParamInfo *info = opaque;
+int64_t *field = (void *)event_loop_base + info->offset;
+
+visit_type_int64(v, name, field, errp);
+}
+
+static void event_loop_base_set_param(Object *obj, Visitor *v,
+const char *name, void *opaque, Error **errp)
+{
+EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(obj);
+EventLoopBase *base = EVENT_LOOP_BASE(obj);
+EventLoopBaseParamInfo *info = opaque;
+int64_t *field = (void *)base + info->offset;
+int64_t value;
+
+if (!visit_type_int64(v, name, , errp)) {
+return;
+}
+
+if (value < 0) {
+error_setg(errp, "%s value must be in range [0, %" PRId64 "]",
+   info->name, INT64_MAX);
+return;
+}
+
+*field = value;
+
+if (bc->update_params) {
+bc->update_params(base, errp);
+}
+
+return;
+}
+
+static void event_loop_base_complete(UserCreatable *uc, Error **errp)
+{
+EventLoopBaseClass *bc = EVENT_LOOP_BASE_GET_CLASS(uc);
+EventLoopBase *base = EVENT_LOOP_BASE(uc);
+
+if (bc->init) {
+bc->init(base, errp);
+}
+}
+
+static void event_loop_base_class_init(ObjectClass *klass, void *class_data)
+{
+UserCreatableClass *ucc = USER_CREATABLE_CLASS(klass);
+ucc->complete = event_loop_base_complete;
+
+object_class_property_add(klass, "aio-max-batch", "int",
+  event_loop_base_get_param,
+  event_loop_base_set_param,
+  NULL, _max_batch_info);
+}
+
+static const TypeInfo event_loop_base_info = {
+.name = TYPE_EVENT_LOOP_BASE,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(EventLoopBase),
+.class_size = sizeof(EventLoopBaseClass),
+.class_init = event_loop_base_class_init,
+.abstract = true,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+};
+
+static void register_types(void)
+{
+type_register_static(_loop_base_info);
+}
+type_init(register_types);
diff --git a/include/sysemu/event-loop-base.h 

[PATCH v2 1/4] util/thread-pool: Fix thread pool freeing locking

2022-03-03 Thread Nicolas Saenz Julienne
Upon freeing a thread pool we need to get rid of any remaining worker.
This is achieved by setting the thread pool's stopping flag, waking the
workers up, and waiting for them to exit one by one. The problem is that
currently all this process happens with the thread pool lock held,
effectively blocking the workers from exiting.

So let's release the thread pool lock after signaling a worker thread
that it's time to exit to give it a chance to do so.

Fixes: f7311ccc63 ("threadpool: add thread_pool_new() and thread_pool_free()")
Signed-off-by: Nicolas Saenz Julienne 
---
 util/thread-pool.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/util/thread-pool.c b/util/thread-pool.c
index d763cea505..fdb43c2d3b 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -339,7 +339,9 @@ void thread_pool_free(ThreadPool *pool)
 pool->stopping = true;
 while (pool->cur_threads > 0) {
 qemu_sem_post(>sem);
+qemu_mutex_unlock(>lock);
 qemu_cond_wait(>worker_stopped, >lock);
+qemu_mutex_lock(>lock);
 }
 
 qemu_mutex_unlock(>lock);
-- 
2.35.1




  1   2   >