[PULL 13/16] qemu-file: No need to check for shutdown in qemu_file_rate_limit

2023-05-04 Thread Juan Quintela
After calling qemu_file_shutdown() we set the error as -EIO if there
is no other previous error, so there is no need to check it here.

Signed-off-by: Juan Quintela 
Reviewed-by: Peter Xu 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504113841.23130-6-quint...@redhat.com>
---
 migration/qemu-file.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 578b6309ba..a4ea808b15 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -732,9 +732,6 @@ int64_t qemu_file_total_transferred(QEMUFile *f)
 
 int qemu_file_rate_limit(QEMUFile *f)
 {
-if (f->shutdown) {
-return 1;
-}
 if (qemu_file_get_error(f)) {
 return 1;
 }
-- 
2.40.0




[PULL 08/16] migration/rdma: We can calculate the rioc from the QEMUFile

2023-05-04 Thread Juan Quintela
Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504114443.23891-4-quint...@redhat.com>
---
 migration/rdma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 5b82085bd7..17c4b9206f 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3811,9 +3811,10 @@ out:
  * the source.
  */
 static int
-rdma_block_notification_handle(QIOChannelRDMA *rioc, const char *name)
+rdma_block_notification_handle(QEMUFile *f, const char *name)
 {
 RDMAContext *rdma;
+QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
 int curr;
 int found = -1;
 
@@ -3846,10 +3847,9 @@ rdma_block_notification_handle(QIOChannelRDMA *rioc, 
const char *name)
 
 static int rdma_load_hook(QEMUFile *f, uint64_t flags, void *data)
 {
-QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
 switch (flags) {
 case RAM_CONTROL_BLOCK_REG:
-return rdma_block_notification_handle(rioc, data);
+return rdma_block_notification_handle(f, data);
 
 case RAM_CONTROL_HOOK:
 return qemu_rdma_registration_handle(f);
-- 
2.40.0




[PULL 03/16] migration: Document all migration_stats

2023-05-04 Thread Juan Quintela
Message-Id: <20230504103357.22130-2-quint...@redhat.com>
Reviewed-by: David Edmondson 
Reviewed-by: Daniel P. Berrangé 
Signed-off-by: Juan Quintela 
---
 migration/migration-stats.h | 43 +
 1 file changed, 43 insertions(+)

diff --git a/migration/migration-stats.h b/migration/migration-stats.h
index 149af932d7..50966328b2 100644
--- a/migration/migration-stats.h
+++ b/migration/migration-stats.h
@@ -22,17 +22,60 @@
  * one thread).
  */
 typedef struct {
+/*
+ * Number of bytes that were dirty last time that we synced with
+ * the guest memory.  We use that to calculate the downtime.  As
+ * the remaining dirty amounts to what we know that is still dirty
+ * since last iteration, not counting what the guest has dirtied
+ * since we synchronized bitmaps.
+ */
 Stat64 dirty_bytes_last_sync;
+/*
+ * Number of pages dirtied per second.
+ */
 Stat64 dirty_pages_rate;
+/*
+ * Number of times we have synchronized guest bitmaps.
+ */
 Stat64 dirty_sync_count;
+/*
+ * Number of times zero copy failed to send any page using zero
+ * copy.
+ */
 Stat64 dirty_sync_missed_zero_copy;
+/*
+ * Number of bytes sent at migration completion stage while the
+ * guest is stopped.
+ */
 Stat64 downtime_bytes;
+/*
+ * Number of pages transferred that were full of zeros.
+ */
 Stat64 zero_pages;
+/*
+ * Number of bytes sent through multifd channels.
+ */
 Stat64 multifd_bytes;
+/*
+ * Number of pages transferred that were not full of zeros.
+ */
 Stat64 normal_pages;
+/*
+ * Number of bytes sent during postcopy.
+ */
 Stat64 postcopy_bytes;
+/*
+ * Number of postcopy page faults that we have handled during
+ * postcopy stage.
+ */
 Stat64 postcopy_requests;
+/*
+ * Number of bytes sent during precopy stage.
+ */
 Stat64 precopy_bytes;
+/*
+ * Total number of bytes transferred.
+ */
 Stat64 transferred;
 } MigrationAtomicStats;
 
-- 
2.40.0




[PULL 15/16] qemu-file: Make total_transferred an uint64_t

2023-05-04 Thread Juan Quintela
Change all the functions that use it.  It was already passed as
uint64_t.

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504113841.23130-8-quint...@redhat.com>
---
 migration/block.c | 5 ++---
 migration/qemu-file.c | 8 
 migration/qemu-file.h | 4 ++--
 migration/savevm.c| 6 ++
 migration/vmstate.c   | 2 +-
 5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index 3499f75e37..a37678ce95 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -747,8 +747,7 @@ static int block_save_setup(QEMUFile *f, void *opaque)
 static int block_save_iterate(QEMUFile *f, void *opaque)
 {
 int ret;
-int64_t last_bytes = qemu_file_total_transferred(f);
-int64_t delta_bytes;
+uint64_t last_bytes = qemu_file_total_transferred(f);
 
 trace_migration_block_save("iterate", block_mig_state.submitted,
block_mig_state.transferred);
@@ -800,7 +799,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
 }
 
 qemu_put_be64(f, BLK_MIG_FLAG_EOS);
-delta_bytes = qemu_file_total_transferred(f) - last_bytes;
+uint64_t delta_bytes = qemu_file_total_transferred(f) - last_bytes;
 return (delta_bytes > 0);
 }
 
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 11b510aa29..ccbefc347e 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -51,7 +51,7 @@ struct QEMUFile {
 int64_t rate_limit_used;
 
 /* The sum of bytes transferred on the wire */
-int64_t total_transferred;
+uint64_t total_transferred;
 
 int buf_index;
 int buf_size; /* 0 when writing */
@@ -708,9 +708,9 @@ int coroutine_mixed_fn qemu_get_byte(QEMUFile *f)
 return result;
 }
 
-int64_t qemu_file_total_transferred_fast(QEMUFile *f)
+uint64_t qemu_file_total_transferred_fast(QEMUFile *f)
 {
-int64_t ret = f->total_transferred;
+uint64_t ret = f->total_transferred;
 int i;
 
 for (i = 0; i < f->iovcnt; i++) {
@@ -720,7 +720,7 @@ int64_t qemu_file_total_transferred_fast(QEMUFile *f)
 return ret;
 }
 
-int64_t qemu_file_total_transferred(QEMUFile *f)
+uint64_t qemu_file_total_transferred(QEMUFile *f)
 {
 qemu_fflush(f);
 return f->total_transferred;
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index d16cd50448..4f26bf6961 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -83,7 +83,7 @@ int qemu_fclose(QEMUFile *f);
  *
  * Returns: the total bytes transferred
  */
-int64_t qemu_file_total_transferred(QEMUFile *f);
+uint64_t qemu_file_total_transferred(QEMUFile *f);
 
 /*
  * qemu_file_total_transferred_fast:
@@ -95,7 +95,7 @@ int64_t qemu_file_total_transferred(QEMUFile *f);
  *
  * Returns: the total bytes transferred and queued
  */
-int64_t qemu_file_total_transferred_fast(QEMUFile *f);
+uint64_t qemu_file_total_transferred_fast(QEMUFile *f);
 
 /*
  * put_buffer without copying the buffer.
diff --git a/migration/savevm.c b/migration/savevm.c
index a9d0a88e62..032044b1d5 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -927,11 +927,9 @@ static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
JSONWriter *vmdesc)
 {
-int64_t old_offset, size;
-
-old_offset = qemu_file_total_transferred_fast(f);
+uint64_t old_offset = qemu_file_total_transferred_fast(f);
 se->ops->save_state(f, se->opaque);
-size = qemu_file_total_transferred_fast(f) - old_offset;
+uint64_t size = qemu_file_total_transferred_fast(f) - old_offset;
 
 if (vmdesc) {
 json_writer_int64(vmdesc, "size", size);
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 83ca4c7d3e..351f56104e 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -349,7 +349,7 @@ int vmstate_save_state_v(QEMUFile *f, const 
VMStateDescription *vmsd,
 void *first_elem = opaque + field->offset;
 int i, n_elems = vmstate_n_elems(opaque, field);
 int size = vmstate_size(opaque, field);
-int64_t old_offset, written_bytes;
+uint64_t old_offset, written_bytes;
 JSONWriter *vmdesc_loop = vmdesc;
 
 trace_vmstate_save_state_loop(vmsd->name, field->name, n_elems);
-- 
2.40.0




[PULL 09/16] migration/rdma: It makes no sense to receive that flag without RDMA

2023-05-04 Thread Juan Quintela
This could only happen if the source sent
RAM_SAVE_FLAG_HOOK (i.e. rdma) and the destination doesn't have CONFIG_RDMA.

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504114443.23891-5-quint...@redhat.com>
---
 migration/qemu-file.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index b7afc8d498..578b6309ba 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -345,14 +345,6 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags, 
void *data)
 if (ret < 0) {
 qemu_file_set_error(f, ret);
 }
-} else {
-/*
- * Hook is a hook specifically requested by the source sending a flag
- * that expects there to be a hook on the destination.
- */
-if (flags == RAM_CONTROL_HOOK) {
-qemu_file_set_error(f, -EINVAL);
-}
 }
 }
 
-- 
2.40.0




[PULL 02/16] migration/rdma: Don't pass the QIOChannelRDMA as an opaque

2023-05-04 Thread Juan Quintela
We can calculate it from the QEMUFile like the caller.

Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230503131847.11603-6-quint...@redhat.com>
Signed-off-by: Juan Quintela 
---
 migration/rdma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 7e747b2595..5b82085bd7 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3527,7 +3527,7 @@ static int dest_ram_sort_func(const void *a, const void 
*b)
  *
  * Keep doing this until the source tells us to stop.
  */
-static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
+static int qemu_rdma_registration_handle(QEMUFile *f)
 {
 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
.type = RDMA_CONTROL_REGISTER_RESULT,
@@ -3539,7 +3539,7 @@ static int qemu_rdma_registration_handle(QEMUFile *f, 
void *opaque)
  };
 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
  .repeat = 1 };
-QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(opaque);
+QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
 RDMAContext *rdma;
 RDMALocalBlocks *local;
 RDMAControlHeader head;
@@ -3852,7 +3852,7 @@ static int rdma_load_hook(QEMUFile *f, uint64_t flags, 
void *data)
 return rdma_block_notification_handle(rioc, data);
 
 case RAM_CONTROL_HOOK:
-return qemu_rdma_registration_handle(f, rioc);
+return qemu_rdma_registration_handle(f);
 
 default:
 /* Shouldn't be called with any other values */
-- 
2.40.0




[PULL 16/16] qemu-file: Make ram_control_save_page() use accessors for rate_limit

2023-05-04 Thread Juan Quintela
Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Peter Xu 
Message-Id: <20230504113841.23130-9-quint...@redhat.com>
---
 migration/qemu-file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index ccbefc347e..f4cfd05c67 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -352,7 +352,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t 
block_offset,
 int ret = f->hooks->save_page(f, block_offset,
   offset, size, bytes_sent);
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
-f->rate_limit_used += size;
+qemu_file_acct_rate_limit(f, size);
 }
 
 if (ret != RAM_SAVE_CONTROL_DELAYED &&
-- 
2.40.0




[PULL 04/16] migration: Put zero_pages in alphabetical order

2023-05-04 Thread Juan Quintela
I forgot to move it when I renamed it from duplicated_pages.

Message-Id: <20230504103357.22130-3-quint...@redhat.com>
Reviewed-by: David Edmondson 
Reviewed-by: Daniel P. Berrangé 
Signed-off-by: Juan Quintela 
---
 migration/migration-stats.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/migration-stats.h b/migration/migration-stats.h
index 50966328b2..cf8a4f0410 100644
--- a/migration/migration-stats.h
+++ b/migration/migration-stats.h
@@ -48,10 +48,6 @@ typedef struct {
  * guest is stopped.
  */
 Stat64 downtime_bytes;
-/*
- * Number of pages transferred that were full of zeros.
- */
-Stat64 zero_pages;
 /*
  * Number of bytes sent through multifd channels.
  */
@@ -77,6 +73,10 @@ typedef struct {
  * Total number of bytes transferred.
  */
 Stat64 transferred;
+/*
+ * Number of pages transferred that were full of zeros.
+ */
+Stat64 zero_pages;
 } MigrationAtomicStats;
 
 extern MigrationAtomicStats mig_stats;
-- 
2.40.0




[PULL 00/16] Migration 20230505 patches

2023-05-04 Thread Juan Quintela
The following changes since commit f6b761bdbd8ba63cee7428d52fb6b46e4224ddab:

  Merge tag 'qga-pull-2023-05-04' of https://github.com/kostyanf14/qemu into 
staging (2023-05-04 12:08:00 +0100)

are available in the Git repository at:

  https://gitlab.com/juan.quintela/qemu.git tags/migration-20230505-pull-request

for you to fetch changes up to fae4009fb51b12927165667a9c9d6af93d31b1df:

  qemu-file: Make ram_control_save_page() use accessors for rate_limit 
(2023-05-05 02:01:59 +0200)


Migration Pull request (20230505 edition)

In this series:
- fix block_bitmap_mapping (juan)
- RDMA cleanup (juan)
- qemu file cleanup (juan)

Please apply.



Juan Quintela (16):
  migration: Fix block_bitmap_mapping migration
  migration/rdma: Don't pass the QIOChannelRDMA as an opaque
  migration: Document all migration_stats
  migration: Put zero_pages in alphabetical order
  migration: Rename xbzrle_enabled xbzrle_started
  migration: Make RAM_SAVE_FLAG_HOOK a normal case entry
  migration/rdma: simplify ram_control_load_hook()
  migration/rdma: We can calculate the rioc from the QEMUFile
  migration/rdma: It makes no sense to receive that flag without RDMA
  migration/rdma: Check for postcopy sooner
  migration: max_postcopy_bandwidth is a size parameter
  migration: qemu_file_total_transferred() function is monotonic
  qemu-file: No need to check for shutdown in qemu_file_rate_limit
  qemu-file: remove shutdown member
  qemu-file: Make total_transferred an uint64_t
  qemu-file: Make ram_control_save_page() use accessors for rate_limit

 migration/block-dirty-bitmap.c | 14 ---
 migration/block.c  | 13 +++---
 migration/migration-stats.h| 45 -
 migration/migration.c  |  4 +--
 migration/options.c|  9 ++-
 migration/options.h|  4 ++-
 migration/qemu-file.c  | 35 +++---
 migration/qemu-file.h  |  4 +--
 migration/ram.c| 26 +--
 migration/rdma.c   | 46 +-
 migration/savevm.c |  6 ++---
 migration/vmstate.c|  2 +-
 12 files changed, 114 insertions(+), 94 deletions(-)

-- 
2.40.0




[PULL 07/16] migration/rdma: simplify ram_control_load_hook()

2023-05-04 Thread Juan Quintela
Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504114443.23891-3-quint...@redhat.com>
---
 migration/qemu-file.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index ee04240a21..b7afc8d498 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -340,10 +340,8 @@ void ram_control_after_iterate(QEMUFile *f, uint64_t flags)
 
 void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data)
 {
-int ret = -EINVAL;
-
 if (f->hooks && f->hooks->hook_ram_load) {
-ret = f->hooks->hook_ram_load(f, flags, data);
+int ret = f->hooks->hook_ram_load(f, flags, data);
 if (ret < 0) {
 qemu_file_set_error(f, ret);
 }
@@ -353,7 +351,7 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags, 
void *data)
  * that expects there to be a hook on the destination.
  */
 if (flags == RAM_CONTROL_HOOK) {
-qemu_file_set_error(f, ret);
+qemu_file_set_error(f, -EINVAL);
 }
 }
 }
-- 
2.40.0




[PULL 11/16] migration: max_postcopy_bandwidth is a size parameter

2023-05-04 Thread Juan Quintela
So make everything that uses it uint64_t, not int64_t.

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504113841.23130-2-quint...@redhat.com>
---
 migration/migration.c | 4 ++--
 migration/options.c   | 2 +-
 migration/options.h   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index feb5ab7493..232e387109 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2056,7 +2056,7 @@ static int postcopy_start(MigrationState *ms)
 QIOChannelBuffer *bioc;
 QEMUFile *fb;
 int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-int64_t bandwidth = migrate_max_postcopy_bandwidth();
+uint64_t bandwidth = migrate_max_postcopy_bandwidth();
 bool restart_block = false;
 int cur_state = MIGRATION_STATUS_ACTIVE;
 
@@ -3176,7 +3176,7 @@ fail:
 void migrate_fd_connect(MigrationState *s, Error *error_in)
 {
 Error *local_err = NULL;
-int64_t rate_limit;
+uint64_t rate_limit;
 bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
 
 /*
diff --git a/migration/options.c b/migration/options.c
index 7395787960..2e759cc306 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -717,7 +717,7 @@ uint64_t migrate_max_bandwidth(void)
 return s->parameters.max_bandwidth;
 }
 
-int64_t migrate_max_postcopy_bandwidth(void)
+uint64_t migrate_max_postcopy_bandwidth(void)
 {
 MigrationState *s = migrate_get_current();
 
diff --git a/migration/options.h b/migration/options.h
index 09841d6a63..5cca3326d6 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -85,7 +85,7 @@ int migrate_decompress_threads(void);
 uint64_t migrate_downtime_limit(void);
 uint8_t migrate_max_cpu_throttle(void);
 uint64_t migrate_max_bandwidth(void);
-int64_t migrate_max_postcopy_bandwidth(void);
+uint64_t migrate_max_postcopy_bandwidth(void);
 int migrate_multifd_channels(void);
 MultiFDCompression migrate_multifd_compression(void);
 int migrate_multifd_zlib_level(void);
-- 
2.40.0




[PULL 06/16] migration: Make RAM_SAVE_FLAG_HOOK a normal case entry

2023-05-04 Thread Juan Quintela
This fixes the commit below — clearly a bad merge after a rebase or
similar; it should have been its own case since that point.

commit 5b0e9dd46fbda5152566a4a26fd96bc0d0452bf7
Author: Peter Lieven 
Date:   Tue Jun 24 11:32:36 2014 +0200

migration: catch unknown flag combinations in ram_load

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504114443.23891-2-quint...@redhat.com>
---
 migration/ram.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 23ba1cefff..5e7bf20ca5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -4445,14 +4445,12 @@ static int ram_load_precopy(QEMUFile *f)
 multifd_recv_sync_main();
 }
 break;
+case RAM_SAVE_FLAG_HOOK:
+ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
+break;
 default:
-if (flags & RAM_SAVE_FLAG_HOOK) {
-ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
-} else {
-error_report("Unknown combination of migration flags: 0x%x",
- flags);
-ret = -EINVAL;
-}
+error_report("Unknown combination of migration flags: 0x%x", 
flags);
+ret = -EINVAL;
 }
 if (!ret) {
 ret = qemu_file_get_error(f);
-- 
2.40.0




[PULL 14/16] qemu-file: remove shutdown member

2023-05-04 Thread Juan Quintela
The first thing that we do after setting the shutdown value is set the
error as -EIO if there is not a previous error.

So this value is redundant.  Just remove it and use
qemu_file_get_error() in the places that it was tested.

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Peter Xu 
Message-Id: <20230504113841.23130-7-quint...@redhat.com>
---
 migration/qemu-file.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index a4ea808b15..11b510aa29 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -63,8 +63,6 @@ struct QEMUFile {
 
 int last_error;
 Error *last_error_obj;
-/* has the file has been shutdown */
-bool shutdown;
 };
 
 /*
@@ -78,8 +76,6 @@ int qemu_file_shutdown(QEMUFile *f)
 {
 int ret = 0;
 
-f->shutdown = true;
-
 /*
  * We must set qemufile error before the real shutdown(), otherwise
  * there can be a race window where we thought IO all went though
@@ -294,7 +290,7 @@ void qemu_fflush(QEMUFile *f)
 return;
 }
 
-if (f->shutdown) {
+if (qemu_file_get_error(f)) {
 return;
 }
 if (f->iovcnt > 0) {
@@ -397,7 +393,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile 
*f)
 f->buf_index = 0;
 f->buf_size = pending;
 
-if (f->shutdown) {
+if (qemu_file_get_error(f)) {
 return 0;
 }
 
@@ -486,7 +482,7 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, 
size_t size,
 } else {
 if (f->iovcnt >= MAX_IOV_SIZE) {
 /* Should only happen if a previous fflush failed */
-assert(f->shutdown || !qemu_file_is_writable(f));
+assert(qemu_file_get_error(f) || !qemu_file_is_writable(f));
 return 1;
 }
 if (may_free) {
-- 
2.40.0




[PULL 10/16] migration/rdma: Check for postcopy sooner

2023-05-04 Thread Juan Quintela
It makes no sense to first try to see if there is an rdma error and
then do nothing on the postcopy stage.  Change it so we check that we
are in postcopy before doing anything.

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504114443.23891-6-quint...@redhat.com>
---
 migration/rdma.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/migration/rdma.c b/migration/rdma.c
index 17c4b9206f..2cd8f1cc66 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -3234,19 +3234,19 @@ static size_t qemu_rdma_save_page(QEMUFile *f,
 RDMAContext *rdma;
 int ret;
 
-RCU_READ_LOCK_GUARD();
-rdma = qatomic_rcu_read(&rioc->rdmaout);
-
-if (!rdma) {
-return -EIO;
-}
-
-CHECK_ERROR_STATE();
-
 if (migration_in_postcopy()) {
 return RAM_SAVE_CONTROL_NOT_SUPP;
 }
 
+RCU_READ_LOCK_GUARD();
+rdma = qatomic_rcu_read(&rioc->rdmaout);
+
+if (!rdma) {
+return -EIO;
+}
+
+CHECK_ERROR_STATE();
+
 qemu_fflush(f);
 
 /*
@@ -3866,6 +3866,10 @@ static int qemu_rdma_registration_start(QEMUFile *f,
 QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
 RDMAContext *rdma;
 
+if (migration_in_postcopy()) {
+return 0;
+}
+
 RCU_READ_LOCK_GUARD();
 rdma = qatomic_rcu_read(&rioc->rdmaout);
 if (!rdma) {
@@ -3874,10 +3878,6 @@ static int qemu_rdma_registration_start(QEMUFile *f,
 
 CHECK_ERROR_STATE();
 
-if (migration_in_postcopy()) {
-return 0;
-}
-
 trace_qemu_rdma_registration_start(flags);
 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
 qemu_fflush(f);
@@ -3897,6 +3897,10 @@ static int qemu_rdma_registration_stop(QEMUFile *f,
 RDMAControlHeader head = { .len = 0, .repeat = 1 };
 int ret = 0;
 
+if (migration_in_postcopy()) {
+return 0;
+}
+
 RCU_READ_LOCK_GUARD();
 rdma = qatomic_rcu_read(&rioc->rdmaout);
 if (!rdma) {
@@ -3905,10 +3909,6 @@ static int qemu_rdma_registration_stop(QEMUFile *f,
 
 CHECK_ERROR_STATE();
 
-if (migration_in_postcopy()) {
-return 0;
-}
-
 qemu_fflush(f);
 ret = qemu_rdma_drain_cq(f, rdma);
 
-- 
2.40.0




[PULL 05/16] migration: Rename xbzrle_enabled xbzrle_started

2023-05-04 Thread Juan Quintela
Otherwise it is easily confused with the function xbzrle_enabled().

Suggested-by: Daniel P. Berrangé 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Peter Xu 
Signed-off-by: Juan Quintela 
Message-Id: <20230504115323.24407-1-quint...@redhat.com>
---
 migration/ram.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 7d81c4a39e..23ba1cefff 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -388,8 +388,8 @@ struct RAMState {
 uint64_t xbzrle_pages_prev;
 /* Amount of xbzrle encoded bytes since the beginning of the period */
 uint64_t xbzrle_bytes_prev;
-/* Start using XBZRLE (e.g., after the first round). */
-bool xbzrle_enabled;
+/* Are we really using XBZRLE (e.g., after the first round). */
+bool xbzrle_started;
 /* Are we on the last stage of migration */
 bool last_stage;
 /* compression statistics since the beginning of the period */
@@ -1420,7 +1420,7 @@ static int ram_save_page(RAMState *rs, PageSearchStatus 
*pss)
 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 
 XBZRLE_cache_lock();
-if (rs->xbzrle_enabled && !migration_in_postcopy()) {
+if (rs->xbzrle_started && !migration_in_postcopy()) {
 pages = save_xbzrle_page(rs, pss, &p, current_addr,
  block, offset);
 if (!rs->last_stage) {
@@ -1636,7 +1636,7 @@ static int find_dirty_block(RAMState *rs, 
PageSearchStatus *pss)
 pss->complete_round = true;
 /* After the first round, enable XBZRLE. */
 if (migrate_xbzrle()) {
-rs->xbzrle_enabled = true;
+rs->xbzrle_started = true;
 }
 }
 /* Didn't find anything this time, but try again on the new block */
@@ -2288,7 +2288,7 @@ static bool save_page_use_compression(RAMState *rs)
  * using the data compression. In theory, xbzrle can do better than
  * compression.
  */
-if (rs->xbzrle_enabled) {
+if (rs->xbzrle_started) {
 return false;
 }
 
@@ -2357,7 +2357,7 @@ static int ram_save_target_page_legacy(RAMState *rs, 
PageSearchStatus *pss)
 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
  * page would be stale
  */
-if (rs->xbzrle_enabled) {
+if (rs->xbzrle_started) {
 XBZRLE_cache_lock();
 xbzrle_cache_zero_page(rs, block->offset + offset);
 XBZRLE_cache_unlock();
@@ -2738,7 +2738,7 @@ static void ram_state_reset(RAMState *rs)
 rs->last_seen_block = NULL;
 rs->last_page = 0;
 rs->last_version = ram_list.version;
-rs->xbzrle_enabled = false;
+rs->xbzrle_started = false;
 }
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
-- 
2.40.0




[PULL 12/16] migration: qemu_file_total_transferred() function is monotonic

2023-05-04 Thread Juan Quintela
So delta_bytes can only be greater than or equal to zero, never negative.

Signed-off-by: Juan Quintela 
Reviewed-by: Daniel P. Berrangé 
Message-Id: <20230504113841.23130-3-quint...@redhat.com>
---
 migration/block.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index 6d532ac7a2..3499f75e37 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -801,13 +801,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
 
 qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 delta_bytes = qemu_file_total_transferred(f) - last_bytes;
-if (delta_bytes > 0) {
-return 1;
-} else if (delta_bytes < 0) {
-return -1;
-} else {
-return 0;
-}
+return (delta_bytes > 0);
 }
 
 /* Called with iothread lock taken.  */
-- 
2.40.0




[PULL 01/16] migration: Fix block_bitmap_mapping migration

2023-05-04 Thread Juan Quintela
It is valid that params->has_block_bitmap_mapping is true and
params->block_bitmap_mapping is NULL.  So we can't use the trick of
having a single function.

Move to two functions one for each value and the tests are fixed.

Fixes: b804b35b1c8a0edfd127ac20819c234be55ac7fc
   migration: Create migrate_block_bitmap_mapping() function

Reported-by: Kevin Wolf 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20230503181036.14890-1-quint...@redhat.com>
Signed-off-by: Juan Quintela 
---
 migration/block-dirty-bitmap.c | 14 +-
 migration/options.c|  7 +++
 migration/options.h|  2 ++
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
index 6624f39bc6..20f36e6bd8 100644
--- a/migration/block-dirty-bitmap.c
+++ b/migration/block-dirty-bitmap.c
@@ -606,11 +606,9 @@ static int init_dirty_bitmap_migration(DBMSaveState *s)
 GHashTable *handled_by_blk = g_hash_table_new(NULL, NULL);
 BlockBackend *blk;
 GHashTable *alias_map = NULL;
-const BitmapMigrationNodeAliasList *block_bitmap_mapping =
-migrate_block_bitmap_mapping();
 
-if (block_bitmap_mapping) {
-alias_map = construct_alias_map(block_bitmap_mapping, true,
+if (migrate_has_block_bitmap_mapping()) {
+alias_map = construct_alias_map(migrate_block_bitmap_mapping(), true,
 &error_abort);
 }
 
@@ -1159,8 +1157,6 @@ static int dirty_bitmap_load_header(QEMUFile *f, 
DBMLoadState *s,
 static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
 {
 GHashTable *alias_map = NULL;
-const BitmapMigrationNodeAliasList *block_bitmap_mapping =
-migrate_block_bitmap_mapping();
 DBMLoadState *s = &((DBMState *)opaque)->load;
 int ret = 0;
 
@@ -1172,9 +1168,9 @@ static int dirty_bitmap_load(QEMUFile *f, void *opaque, 
int version_id)
 return -EINVAL;
 }
 
-if (block_bitmap_mapping) {
-alias_map = construct_alias_map(block_bitmap_mapping,
-false, &error_abort);
+if (migrate_has_block_bitmap_mapping()) {
+alias_map = construct_alias_map(migrate_block_bitmap_mapping(), false,
+&error_abort);
 }
 
 do {
diff --git a/migration/options.c b/migration/options.c
index 53b7fc5d5d..7395787960 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -626,6 +626,13 @@ const BitmapMigrationNodeAliasList 
*migrate_block_bitmap_mapping(void)
 return s->parameters.block_bitmap_mapping;
 }
 
+bool migrate_has_block_bitmap_mapping(void)
+{
+MigrationState *s = migrate_get_current();
+
+return s->parameters.has_block_bitmap_mapping;
+}
+
 bool migrate_block_incremental(void)
 {
 MigrationState *s = migrate_get_current();
diff --git a/migration/options.h b/migration/options.h
index 3c322867cd..09841d6a63 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -71,6 +71,8 @@ bool migrate_cap_set(int cap, bool value, Error **errp);
 /* parameters */
 
 const BitmapMigrationNodeAliasList *migrate_block_bitmap_mapping(void);
+bool migrate_has_block_bitmap_mapping(void);
+
 bool migrate_block_incremental(void);
 uint32_t migrate_checkpoint_delay(void);
 int migrate_compress_level(void);
-- 
2.40.0




Re: [PATCH 3/9] qemu-file: make qemu_file_[sg]et_rate_limit() use an uint64_t

2023-05-04 Thread Juan Quintela
Juan Quintela  wrote:
> It is really size_t.  Everything else uses uint64_t, so move this to
> uint64_t as well.  A size can't be negative anyways.
>
> Signed-off-by: Juan Quintela 

Self-nack.

> -qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
> +qemu_file_set_rate_limit(ms->to_dst_file, UINT64_MAX);

1st: this should be zero.

>  } else {
>  qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / 
> XFER_LIMIT_RATIO);
>  }
> @@ -2301,7 +2301,7 @@ static void migration_completion(MigrationState *s)
>  }
>  if (ret >= 0) {
>  s->block_inactive = !migrate_colo();
> -qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
> +qemu_file_set_rate_limit(s->to_dst_file, UINT64_MAX);

Same here

>  ret = qemu_savevm_state_complete_precopy(s->to_dst_file, 
> false,
>   s->block_inactive);
>  }


> @@ -3049,7 +3049,7 @@ static void *bg_migration_thread(void *opaque)
>  rcu_register_thread();
>  object_ref(OBJECT(s));
>  
> -qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
> +qemu_file_set_rate_limit(s->to_dst_file, UINT64_MAX);

And here.

> @@ -748,18 +748,18 @@ int qemu_file_rate_limit(QEMUFile *f)
>  if (qemu_file_get_error(f)) {
>  return 1;
>  }
> -if (f->rate_limit_max > 0 && f->rate_limit_used > f->rate_limit_max) {
> +if (f->rate_limit_used > f->rate_limit_max) {

And this is wrong: f->rate_limit_max == 0 means that we don't do rate limiting.

Will resend this one later.

Sorry, Juan.

PD.  No, I have no clue how I have had this patch applied the whole day
 and no failures and now I get failures in migration-test.  The
 number of times that I run this test on the last two days have been
 in the hundreds.




Re: [PATCH v5 00/21] block: remove aio_disable_external() API

2023-05-04 Thread Kevin Wolf
Am 04.05.2023 um 21:53 hat Stefan Hajnoczi geschrieben:
> v5:
> - Use atomic accesses for in_flight counter in vhost-user-server.c [Kevin]
> - Stash SCSIDevice id/lun values for VIRTIO_SCSI_T_TRANSPORT_RESET event
>   before unrealizing the SCSIDevice [Kevin]
> - Keep vhost-user-blk export .detach() callback so ctx is set to NULL [Kevin]
> - Narrow BdrvChildClass and BlockDriver drained_{begin/end/poll} callbacks 
> from
>   IO_OR_GS_CODE() to GLOBAL_STATE_CODE() [Kevin]
> - Include Kevin's "block: Fix use after free in blockdev_mark_auto_del()" to
>   fix a latent bug that was exposed by this series

I only just finished reviewing v4 when you had already sent v5, but it
hadn't arrived yet. I had a few more comments on what are now patches
17, 18, 19 and 21 in v5. I think they all still apply.

Kevin




Re: [PATCH v4 20/20] aio: remove aio_disable_external() API

2023-05-04 Thread Kevin Wolf
Am 25.04.2023 um 19:27 hat Stefan Hajnoczi geschrieben:
> All callers now pass is_external=false to aio_set_fd_handler() and
> aio_set_event_notifier(). The aio_disable_external() API that
> temporarily disables fd handlers that were registered is_external=true
> is therefore dead code.
> 
> Remove aio_disable_external(), aio_enable_external(), and the
> is_external arguments to aio_set_fd_handler() and
> aio_set_event_notifier().
> 
> The entire test-fdmon-epoll test is removed because its sole purpose was
> testing aio_disable_external().
> 
> Parts of this patch were generated using the following coccinelle
> (https://coccinelle.lip6.fr/) semantic patch:
> 
>   @@
>   expression ctx, fd, is_external, io_read, io_write, io_poll, io_poll_ready, 
> opaque;
>   @@
>   - aio_set_fd_handler(ctx, fd, is_external, io_read, io_write, io_poll, 
> io_poll_ready, opaque)
>   + aio_set_fd_handler(ctx, fd, io_read, io_write, io_poll, io_poll_ready, 
> opaque)
> 
>   @@
>   expression ctx, notifier, is_external, io_read, io_poll, io_poll_ready;
>   @@
>   - aio_set_event_notifier(ctx, notifier, is_external, io_read, io_poll, 
> io_poll_ready)
>   + aio_set_event_notifier(ctx, notifier, io_read, io_poll, io_poll_ready)
> 
> Reviewed-by: Juan Quintela 
> Reviewed-by: Philippe Mathieu-Daudé 
> Signed-off-by: Stefan Hajnoczi 

> diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
> index 1683aa1105..6b6a1a91f8 100644
> --- a/util/fdmon-epoll.c
> +++ b/util/fdmon-epoll.c
> @@ -133,13 +128,12 @@ bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned 
> npfd)
>  return false;
>  }
>  
> -/* Do not upgrade while external clients are disabled */
> -if (qatomic_read(>external_disable_cnt)) {
> -return false;
> -}
> -
> -if (npfd < EPOLL_ENABLE_THRESHOLD) {
> -return false;
> +if (npfd >= EPOLL_ENABLE_THRESHOLD) {
> +if (fdmon_epoll_try_enable(ctx)) {
> +return true;
> +} else {
> +fdmon_epoll_disable(ctx);
> +}
>  }
>  
>  /* The list must not change while we add fds to epoll */

I don't understand this hunk. Why are you changing more than just
deleting the external_disable_cnt check?

Is this a mismerge with your own commit e62da985?

Kevin




Re: [PATCH v4 18/20] virtio-scsi: implement BlockDevOps->drained_begin()

2023-05-04 Thread Kevin Wolf
Am 25.04.2023 um 19:27 hat Stefan Hajnoczi geschrieben:
> The virtio-scsi Host Bus Adapter provides access to devices on a SCSI
> bus. Those SCSI devices typically have a BlockBackend. When the
> BlockBackend enters a drained section, the SCSI device must temporarily
> stop submitting new I/O requests.
> 
> Implement this behavior by temporarily stopping virtio-scsi virtqueue
> processing when one of the SCSI devices enters a drained section. The
> new scsi_device_drained_begin() API allows scsi-disk to message the
> virtio-scsi HBA.
> 
> scsi_device_drained_begin() uses a drain counter so that multiple SCSI
> devices can have overlapping drained sections. The HBA only sees one
> pair of .drained_begin/end() calls.
> 
> After this commit, virtio-scsi no longer depends on hw/virtio's
> ioeventfd aio_set_event_notifier(is_external=true). This commit is a
> step towards removing the aio_disable_external() API.
> 
> Signed-off-by: Stefan Hajnoczi 

> @@ -206,9 +208,11 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
>  }
>  s->dataplane_stopping = true;
>  
> -aio_context_acquire(s->ctx);
> -aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
> -aio_context_release(s->ctx);
> +if (s->bus.drain_count == 0) {
> +aio_context_acquire(s->ctx);
> +aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
> +aio_context_release(s->ctx);
> +}

Same question as for virtio-blk: We lose processing the virtqueue one
last time during drain. Is it okay, and if so, why do we need it outside
of drain?

Kevin




Re: [PATCH v4 17/20] virtio-blk: implement BlockDevOps->drained_begin()

2023-05-04 Thread Kevin Wolf
Am 25.04.2023 um 19:27 hat Stefan Hajnoczi geschrieben:
> Detach ioeventfds during drained sections to stop I/O submission from
> the guest. virtio-blk is no longer reliant on aio_disable_external()
> after this patch. This will allow us to remove the
> aio_disable_external() API once all other code that relies on it is
> converted.
> 
> Take extra care to avoid attaching/detaching ioeventfds if the data
> plane is started/stopped during a drained section. This should be rare,
> but maybe the mirror block job can trigger it.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  hw/block/dataplane/virtio-blk.c | 17 +--
>  hw/block/virtio-blk.c   | 38 -
>  2 files changed, 48 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index bd7cc6e76b..d77fc6028c 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -245,13 +245,15 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
>  }
>  
>  /* Get this show started by hooking up our callbacks */
> -aio_context_acquire(s->ctx);
> -for (i = 0; i < nvqs; i++) {
> -VirtQueue *vq = virtio_get_queue(s->vdev, i);
> +if (!blk_in_drain(s->conf->conf.blk)) {
> +aio_context_acquire(s->ctx);
> +for (i = 0; i < nvqs; i++) {
> +VirtQueue *vq = virtio_get_queue(s->vdev, i);
>  
> -virtio_queue_aio_attach_host_notifier(vq, s->ctx);
> +virtio_queue_aio_attach_host_notifier(vq, s->ctx);
> +}
> +aio_context_release(s->ctx);
>  }
> -aio_context_release(s->ctx);
>  return 0;
>  
>fail_aio_context:
> @@ -317,7 +319,10 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
>  trace_virtio_blk_data_plane_stop(s);
>  
>  aio_context_acquire(s->ctx);
> -aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
> +
> +if (!blk_in_drain(s->conf->conf.blk)) {
> +aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
> +}

So here we actually get a semantic change: What you described as the
second part in the previous patch, processing the virtqueue one last
time, isn't done any more if the device is drained.

If it's okay to just skip this during drain, why do we need to do it
outside of drain?

Kevin




Re: [PATCH v4 16/20] virtio: make it possible to detach host notifier from any thread

2023-05-04 Thread Kevin Wolf
Am 25.04.2023 um 19:27 hat Stefan Hajnoczi geschrieben:
> virtio_queue_aio_detach_host_notifier() does two things:
> 1. It removes the fd handler from the event loop.
> 2. It processes the virtqueue one last time.
> 
> The first step can be performed by any thread and without taking the
> AioContext lock.
> 
> The second step may need the AioContext lock (depending on the device
> implementation) and runs in the thread where request processing takes
> place. virtio-blk and virtio-scsi therefore call
> virtio_queue_aio_detach_host_notifier() from a BH that is scheduled in
> AioContext
> 
> Scheduling a BH is undesirable for .drained_begin() functions. The next
> patch will introduce a .drained_begin() function that needs to call
> virtio_queue_aio_detach_host_notifier().

Why is it undesirable? In my mental model, .drained_begin() is still
free to start as many asynchronous things as it likes. The only
important thing to take care of is that .drained_poll() returns true as
long as the BH (or other asynchronous operation) is still pending.

Of course, your way of doing things still seems to result in simpler
code because you don't have to deal with a BH at all if you only really
want the first part and not the second.

> Move the virtqueue processing out to the callers of
> virtio_queue_aio_detach_host_notifier() so that the function can be
> called from any thread. This is in preparation for the next patch.

Did you forget to remove it in virtio_queue_aio_detach_host_notifier()?
If it's unchanged, I don't think the AioContext requirement is lifted.

> Signed-off-by: Stefan Hajnoczi 
> ---
>  hw/block/dataplane/virtio-blk.c | 2 ++
>  hw/scsi/virtio-scsi-dataplane.c | 9 +
>  2 files changed, 11 insertions(+)
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index b28d81737e..bd7cc6e76b 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -286,8 +286,10 @@ static void virtio_blk_data_plane_stop_bh(void *opaque)
>  
>  for (i = 0; i < s->conf->num_queues; i++) {
>  VirtQueue *vq = virtio_get_queue(s->vdev, i);
> +EventNotifier *host_notifier = virtio_queue_get_host_notifier(vq);
>  
>  virtio_queue_aio_detach_host_notifier(vq, s->ctx);
> +virtio_queue_host_notifier_read(host_notifier);
>  }
>  }

The existing code in virtio_queue_aio_detach_host_notifier() has a
comment before the read:

/* Test and clear notifier before after disabling event,
 * in case poll callback didn't have time to run. */

Do we want to keep it around in the new places? (And also fix the
"before after", I suppose, or replace it with a similar, but better
comment that explains why we're reading here.)

Kevin




[PATCH v5 03/21] hw/qdev: introduce qdev_is_realized() helper

2023-05-04 Thread Stefan Hajnoczi
Add a helper function to check whether the device is realized without
requiring the Big QEMU Lock. The next patch adds a second caller. The
goal is to avoid spreading DeviceState field accesses throughout the
code.

Suggested-by: Philippe Mathieu-Daudé 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Stefan Hajnoczi 
---
 include/hw/qdev-core.h | 17 ++---
 hw/scsi/scsi-bus.c |  3 +--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 7623703943..f1070d6dc7 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -1,6 +1,7 @@
 #ifndef QDEV_CORE_H
 #define QDEV_CORE_H
 
+#include "qemu/atomic.h"
 #include "qemu/queue.h"
 #include "qemu/bitmap.h"
 #include "qemu/rcu.h"
@@ -168,9 +169,6 @@ typedef struct {
 
 /**
  * DeviceState:
- * @realized: Indicates whether the device has been fully constructed.
- *When accessed outside big qemu lock, must be accessed with
- *qatomic_load_acquire()
  * @reset: ResettableState for the device; handled by Resettable interface.
  *
  * This structure should not be accessed directly.  We declare it here
@@ -339,6 +337,19 @@ DeviceState *qdev_new(const char *name);
  */
 DeviceState *qdev_try_new(const char *name);
 
+/**
+ * qdev_is_realized:
+ * @dev: The device to check.
+ *
+ * May be called outside big qemu lock.
+ *
+ * Returns: %true% if the device has been fully constructed, %false% otherwise.
+ */
+static inline bool qdev_is_realized(DeviceState *dev)
+{
+return qatomic_load_acquire(>realized);
+}
+
 /**
  * qdev_realize: Realize @dev.
  * @dev: device to realize
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 3c20b47ad0..8857ff41f6 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -60,8 +60,7 @@ static SCSIDevice *do_scsi_device_find(SCSIBus *bus,
  * the user access the device.
  */
 
-if (retval && !include_unrealized &&
-!qatomic_load_acquire(>qdev.realized)) {
+if (retval && !include_unrealized && !qdev_is_realized(>qdev)) {
 retval = NULL;
 }
 
-- 
2.40.1




[PATCH v5 04/21] virtio-scsi: avoid race between unplug and transport event

2023-05-04 Thread Stefan Hajnoczi
Only report a transport reset event to the guest after the SCSIDevice
has been unrealized by qdev_simple_device_unplug_cb().

qdev_simple_device_unplug_cb() sets the SCSIDevice's qdev.realized field
to false so that scsi_device_find/get() no longer see it.

scsi_target_emulate_report_luns() also needs to be updated to filter out
SCSIDevices that are unrealized.

Change virtio_scsi_push_event() to take event information as an argument
instead of the SCSIDevice. This allows virtio_scsi_hotunplug() to emit a
VIRTIO_SCSI_T_TRANSPORT_RESET event after the SCSIDevice has already
been unrealized.

These changes ensure that the guest driver does not see the SCSIDevice
that's being unplugged if it responds very quickly to the transport
reset event.

Reviewed-by: Paolo Bonzini 
Reviewed-by: Michael S. Tsirkin 
Reviewed-by: Daniil Tatianin 
Signed-off-by: Stefan Hajnoczi 
---
v5:
- Stash SCSIDevice id/lun values for VIRTIO_SCSI_T_TRANSPORT_RESET event
  before unrealizing the SCSIDevice [Kevin]
---
 hw/scsi/scsi-bus.c|  3 +-
 hw/scsi/virtio-scsi.c | 86 ++-
 2 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 8857ff41f6..64013c8a24 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -487,7 +487,8 @@ static bool scsi_target_emulate_report_luns(SCSITargetReq 
*r)
 DeviceState *qdev = kid->child;
 SCSIDevice *dev = SCSI_DEVICE(qdev);
 
-if (dev->channel == channel && dev->id == id && dev->lun != 0) {
+if (dev->channel == channel && dev->id == id && dev->lun != 0 &&
+qdev_is_realized(>qdev)) {
 store_lun(tmp, dev->lun);
 g_byte_array_append(buf, tmp, 8);
 len += 8;
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 612c525d9d..ae314af3de 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -933,13 +933,27 @@ static void virtio_scsi_reset(VirtIODevice *vdev)
 s->events_dropped = false;
 }
 
-static void virtio_scsi_push_event(VirtIOSCSI *s, SCSIDevice *dev,
-   uint32_t event, uint32_t reason)
+typedef struct {
+uint32_t event;
+uint32_t reason;
+union {
+/* Used by messages specific to a device */
+struct {
+uint32_t id;
+uint32_t lun;
+} address;
+};
+} VirtIOSCSIEventInfo;
+
+static void virtio_scsi_push_event(VirtIOSCSI *s,
+   const VirtIOSCSIEventInfo *info)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
 VirtIOSCSIReq *req;
 VirtIOSCSIEvent *evt;
 VirtIODevice *vdev = VIRTIO_DEVICE(s);
+uint32_t event = info->event;
+uint32_t reason = info->reason;
 
 if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
 return;
@@ -965,27 +979,28 @@ static void virtio_scsi_push_event(VirtIOSCSI *s, 
SCSIDevice *dev,
 memset(evt, 0, sizeof(VirtIOSCSIEvent));
 evt->event = virtio_tswap32(vdev, event);
 evt->reason = virtio_tswap32(vdev, reason);
-if (!dev) {
-assert(event == VIRTIO_SCSI_T_EVENTS_MISSED);
-} else {
+if (event != VIRTIO_SCSI_T_EVENTS_MISSED) {
 evt->lun[0] = 1;
-evt->lun[1] = dev->id;
+evt->lun[1] = info->address.id;
 
 /* Linux wants us to keep the same encoding we use for REPORT LUNS.  */
-if (dev->lun >= 256) {
-evt->lun[2] = (dev->lun >> 8) | 0x40;
+if (info->address.lun >= 256) {
+evt->lun[2] = (info->address.lun >> 8) | 0x40;
 }
-evt->lun[3] = dev->lun & 0xFF;
+evt->lun[3] = info->address.lun & 0xFF;
 }
 trace_virtio_scsi_event(virtio_scsi_get_lun(evt->lun), event, reason);
- 
+
 virtio_scsi_complete_req(req);
 }
 
 static void virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
 {
 if (s->events_dropped) {
-virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
+VirtIOSCSIEventInfo info = {
+.event = VIRTIO_SCSI_T_NO_EVENT,
+};
+virtio_scsi_push_event(s, );
 }
 }
 
@@ -1009,9 +1024,17 @@ static void virtio_scsi_change(SCSIBus *bus, SCSIDevice 
*dev, SCSISense sense)
 
 if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_CHANGE) &&
 dev->type != TYPE_ROM) {
+VirtIOSCSIEventInfo info = {
+.event   = VIRTIO_SCSI_T_PARAM_CHANGE,
+.reason  = sense.asc | (sense.ascq << 8),
+.address = {
+.id  = dev->id,
+.lun = dev->lun,
+},
+};
+
 virtio_scsi_acquire(s);
-virtio_scsi_push_event(s, dev, VIRTIO_SCSI_T_PARAM_CHANGE,
-   sense.asc | (sense.ascq << 8));
+virtio_scsi_push_event(s, );
 virtio_scsi_release(s);
 }
 }
@@ -1046,10 +1069,17 @@ static void virtio_scsi_hotplug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 }
 
 if 

[PATCH v5 11/21] block: drain from main loop thread in bdrv_co_yield_to_drain()

2023-05-04 Thread Stefan Hajnoczi
For simplicity, always run BlockDevOps .drained_begin/end/poll()
callbacks in the main loop thread. This makes it easier to implement the
callbacks and avoids extra locks.

Move the function pointer declarations from the I/O Code section to the
Global State section for BlockDevOps, BdrvChildClass, and BlockDriver.

Narrow IO_OR_GS_CODE() to GLOBAL_STATE_CODE() where appropriate.

The test-bdrv-drain test case calls bdrv_drain() from an IOThread. This
is now only allowed from coroutine context, so update the test case to
run in a coroutine.

Signed-off-by: Stefan Hajnoczi 
---
 include/block/block_int-common.h  | 90 +--
 include/sysemu/block-backend-common.h | 25 
 block/io.c| 14 +++--
 tests/unit/test-bdrv-drain.c  | 14 +++--
 4 files changed, 76 insertions(+), 67 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 013d419444..f462a8be55 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -356,6 +356,21 @@ struct BlockDriver {
 void (*bdrv_attach_aio_context)(BlockDriverState *bs,
 AioContext *new_context);
 
+/**
+ * bdrv_drain_begin is called if implemented in the beginning of a
+ * drain operation to drain and stop any internal sources of requests in
+ * the driver.
+ * bdrv_drain_end is called if implemented at the end of the drain.
+ *
+ * They should be used by the driver to e.g. manage scheduled I/O
+ * requests, or toggle an internal state. After the end of the drain new
+ * requests will continue normally.
+ *
+ * Implementations of both functions must not call aio_poll().
+ */
+void (*bdrv_drain_begin)(BlockDriverState *bs);
+void (*bdrv_drain_end)(BlockDriverState *bs);
+
 /**
  * Try to get @bs's logical and physical block size.
  * On success, store them in @bsz and return zero.
@@ -743,21 +758,6 @@ struct BlockDriver {
 void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_unplug)(
 BlockDriverState *bs);
 
-/**
- * bdrv_drain_begin is called if implemented in the beginning of a
- * drain operation to drain and stop any internal sources of requests in
- * the driver.
- * bdrv_drain_end is called if implemented at the end of the drain.
- *
- * They should be used by the driver to e.g. manage scheduled I/O
- * requests, or toggle an internal state. After the end of the drain new
- * requests will continue normally.
- *
- * Implementations of both functions must not call aio_poll().
- */
-void (*bdrv_drain_begin)(BlockDriverState *bs);
-void (*bdrv_drain_end)(BlockDriverState *bs);
-
 bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
 
 bool coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_can_store_new_dirty_bitmap)(
@@ -920,36 +920,6 @@ struct BdrvChildClass {
 void GRAPH_WRLOCK_PTR (*attach)(BdrvChild *child);
 void GRAPH_WRLOCK_PTR (*detach)(BdrvChild *child);
 
-/*
- * Notifies the parent that the filename of its child has changed (e.g.
- * because the direct child was removed from the backing chain), so that it
- * can update its reference.
- */
-int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
-   const char *filename, Error **errp);
-
-bool (*change_aio_ctx)(BdrvChild *child, AioContext *ctx,
-   GHashTable *visited, Transaction *tran,
-   Error **errp);
-
-/*
- * I/O API functions. These functions are thread-safe.
- *
- * See include/block/block-io.h for more information about
- * the I/O API.
- */
-
-void (*resize)(BdrvChild *child);
-
-/*
- * Returns a name that is supposedly more useful for human users than the
- * node name for identifying the node in question (in particular, a BB
- * name), or NULL if the parent can't provide a better name.
- */
-const char *(*get_name)(BdrvChild *child);
-
-AioContext *(*get_parent_aio_context)(BdrvChild *child);
-
 /*
  * If this pair of functions is implemented, the parent doesn't issue new
  * requests after returning from .drained_begin() until .drained_end() is
@@ -970,6 +940,36 @@ struct BdrvChildClass {
  * activity on the child has stopped.
  */
 bool (*drained_poll)(BdrvChild *child);
+
+/*
+ * Notifies the parent that the filename of its child has changed (e.g.
+ * because the direct child was removed from the backing chain), so that it
+ * can update its reference.
+ */
+int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
+   const char *filename, Error **errp);
+
+bool (*change_aio_ctx)(BdrvChild *child, AioContext *ctx,
+   GHashTable *visited, Transaction *tran,
+   Error 

[PATCH v5 20/21] virtio: do not set is_external=true on host notifiers

2023-05-04 Thread Stefan Hajnoczi
Host notifiers can now use is_external=false since virtio-blk and
virtio-scsi no longer rely on is_external=true for drained sections.

Signed-off-by: Stefan Hajnoczi 
---
 hw/virtio/virtio.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 272d930721..9cdad7e550 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -3491,7 +3491,7 @@ static void 
virtio_queue_host_notifier_aio_poll_end(EventNotifier *n)
 
 void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx)
 {
-aio_set_event_notifier(ctx, >host_notifier, true,
+aio_set_event_notifier(ctx, >host_notifier, false,
virtio_queue_host_notifier_read,
virtio_queue_host_notifier_aio_poll,
virtio_queue_host_notifier_aio_poll_ready);
@@ -3508,14 +3508,14 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue 
*vq, AioContext *ctx)
  */
 void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext 
*ctx)
 {
-aio_set_event_notifier(ctx, >host_notifier, true,
+aio_set_event_notifier(ctx, >host_notifier, false,
virtio_queue_host_notifier_read,
NULL, NULL);
 }
 
 void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx)
 {
-aio_set_event_notifier(ctx, >host_notifier, true, NULL, NULL, NULL);
+aio_set_event_notifier(ctx, >host_notifier, false, NULL, NULL, NULL);
 /* Test and clear notifier before after disabling event,
  * in case poll callback didn't have time to run. */
 virtio_queue_host_notifier_read(>host_notifier);
-- 
2.40.1




[PATCH v5 13/21] hw/xen: do not set is_external=true on evtchn fds

2023-05-04 Thread Stefan Hajnoczi
is_external=true suspends fd handlers between aio_disable_external() and
aio_enable_external(). The block layer's drain operation uses this
mechanism to prevent new I/O from sneaking in between
bdrv_drained_begin() and bdrv_drained_end().

The previous commit converted the xen-block device to use BlockDevOps
.drained_begin/end() callbacks. It no longer relies on is_external=true
so it is safe to pass is_external=false.

This is part of ongoing work to remove the aio_disable_external() API.

Signed-off-by: Stefan Hajnoczi 
---
 hw/xen/xen-bus.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/xen/xen-bus.c b/hw/xen/xen-bus.c
index b8f408c9ed..bf256d4da2 100644
--- a/hw/xen/xen-bus.c
+++ b/hw/xen/xen-bus.c
@@ -842,14 +842,14 @@ void xen_device_set_event_channel_context(XenDevice 
*xendev,
 }
 
 if (channel->ctx)
-aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), 
true,
+aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), 
false,
NULL, NULL, NULL, NULL, NULL);
 
 channel->ctx = ctx;
 if (ctx) {
 aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh),
-   true, xen_device_event, NULL, xen_device_poll, NULL,
-   channel);
+   false, xen_device_event, NULL, xen_device_poll,
+   NULL, channel);
 }
 }
 
@@ -923,7 +923,7 @@ void xen_device_unbind_event_channel(XenDevice *xendev,
 
 QLIST_REMOVE(channel, list);
 
-aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), true,
+aio_set_fd_handler(channel->ctx, qemu_xen_evtchn_fd(channel->xeh), false,
NULL, NULL, NULL, NULL, NULL);
 
 if (qemu_xen_evtchn_unbind(channel->xeh, channel->local_port) < 0) {
-- 
2.40.1




[PATCH v5 15/21] block/export: don't require AioContext lock around blk_exp_ref/unref()

2023-05-04 Thread Stefan Hajnoczi
The FUSE export calls blk_exp_ref/unref() without the AioContext lock.
Instead of fixing the FUSE export, adjust blk_exp_ref/unref() so they
work without the AioContext lock. This way it's less error-prone.

Suggested-by: Paolo Bonzini 
Signed-off-by: Stefan Hajnoczi 
---
 include/block/export.h   |  2 ++
 block/export/export.c| 13 ++---
 block/export/vduse-blk.c |  4 
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/include/block/export.h b/include/block/export.h
index 7feb02e10d..f2fe0f8078 100644
--- a/include/block/export.h
+++ b/include/block/export.h
@@ -57,6 +57,8 @@ struct BlockExport {
  * Reference count for this block export. This includes strong references
  * both from the owner (qemu-nbd or the monitor) and clients connected to
  * the export.
+ *
+ * Use atomics to access this field.
  */
 int refcount;
 
diff --git a/block/export/export.c b/block/export/export.c
index 62c7c22d45..ab007e9d31 100644
--- a/block/export/export.c
+++ b/block/export/export.c
@@ -202,11 +202,10 @@ fail:
 return NULL;
 }
 
-/* Callers must hold exp->ctx lock */
 void blk_exp_ref(BlockExport *exp)
 {
-assert(exp->refcount > 0);
-exp->refcount++;
+assert(qatomic_read(>refcount) > 0);
+qatomic_inc(>refcount);
 }
 
 /* Runs in the main thread */
@@ -229,11 +228,10 @@ static void blk_exp_delete_bh(void *opaque)
 aio_context_release(aio_context);
 }
 
-/* Callers must hold exp->ctx lock */
 void blk_exp_unref(BlockExport *exp)
 {
-assert(exp->refcount > 0);
-if (--exp->refcount == 0) {
+assert(qatomic_read(>refcount) > 0);
+if (qatomic_fetch_dec(>refcount) == 1) {
 /* Touch the block_exports list only in the main thread */
 aio_bh_schedule_oneshot(qemu_get_aio_context(), blk_exp_delete_bh,
 exp);
@@ -341,7 +339,8 @@ void qmp_block_export_del(const char *id,
 if (!has_mode) {
 mode = BLOCK_EXPORT_REMOVE_MODE_SAFE;
 }
-if (mode == BLOCK_EXPORT_REMOVE_MODE_SAFE && exp->refcount > 1) {
+if (mode == BLOCK_EXPORT_REMOVE_MODE_SAFE &&
+qatomic_read(>refcount) > 1) {
 error_setg(errp, "export '%s' still in use", exp->id);
 error_append_hint(errp, "Use mode='hard' to force client "
   "disconnect\n");
diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index a25556fe04..e041f9 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -44,9 +44,7 @@ static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
 {
 if (qatomic_fetch_inc(_exp->inflight) == 0) {
 /* Prevent export from being deleted */
-aio_context_acquire(vblk_exp->export.ctx);
 blk_exp_ref(_exp->export);
-aio_context_release(vblk_exp->export.ctx);
 }
 }
 
@@ -57,9 +55,7 @@ static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
 aio_wait_kick();
 
 /* Now the export can be deleted */
-aio_context_acquire(vblk_exp->export.ctx);
 blk_exp_unref(_exp->export);
-aio_context_release(vblk_exp->export.ctx);
 }
 }
 
-- 
2.40.1




[PATCH v5 16/21] block/fuse: do not set is_external=true on FUSE fd

2023-05-04 Thread Stefan Hajnoczi
This is part of ongoing work to remove the aio_disable_external() API.

Use BlockDevOps .drained_begin/end/poll() instead of
aio_set_fd_handler(is_external=true).

As a side-effect the FUSE export now follows AioContext changes like the
other export types.

Signed-off-by: Stefan Hajnoczi 
---
 block/export/fuse.c | 56 +++--
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/block/export/fuse.c b/block/export/fuse.c
index 06fa41079e..adf3236b5a 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -50,6 +50,7 @@ typedef struct FuseExport {
 
 struct fuse_session *fuse_session;
 struct fuse_buf fuse_buf;
+unsigned int in_flight; /* atomic */
 bool mounted, fd_handler_set_up;
 
 char *mountpoint;
@@ -78,6 +79,42 @@ static void read_from_fuse_export(void *opaque);
 static bool is_regular_file(const char *path, Error **errp);
 
 
+static void fuse_export_drained_begin(void *opaque)
+{
+FuseExport *exp = opaque;
+
+aio_set_fd_handler(exp->common.ctx,
+   fuse_session_fd(exp->fuse_session), false,
+   NULL, NULL, NULL, NULL, NULL);
+exp->fd_handler_set_up = false;
+}
+
+static void fuse_export_drained_end(void *opaque)
+{
+FuseExport *exp = opaque;
+
+/* Refresh AioContext in case it changed */
+exp->common.ctx = blk_get_aio_context(exp->common.blk);
+
+aio_set_fd_handler(exp->common.ctx,
+   fuse_session_fd(exp->fuse_session), false,
+   read_from_fuse_export, NULL, NULL, NULL, exp);
+exp->fd_handler_set_up = true;
+}
+
+static bool fuse_export_drained_poll(void *opaque)
+{
+FuseExport *exp = opaque;
+
+return qatomic_read(>in_flight) > 0;
+}
+
+static const BlockDevOps fuse_export_blk_dev_ops = {
+.drained_begin = fuse_export_drained_begin,
+.drained_end   = fuse_export_drained_end,
+.drained_poll  = fuse_export_drained_poll,
+};
+
 static int fuse_export_create(BlockExport *blk_exp,
   BlockExportOptions *blk_exp_args,
   Error **errp)
@@ -101,6 +138,15 @@ static int fuse_export_create(BlockExport *blk_exp,
 }
 }
 
+blk_set_dev_ops(exp->common.blk, _export_blk_dev_ops, exp);
+
+/*
+ * We handle draining ourselves using an in-flight counter and by disabling
+ * the FUSE fd handler. Do not queue BlockBackend requests, they need to
+ * complete so the in-flight counter reaches zero.
+ */
+blk_set_disable_request_queuing(exp->common.blk, true);
+
 init_exports_table();
 
 /*
@@ -224,7 +270,7 @@ static int setup_fuse_export(FuseExport *exp, const char 
*mountpoint,
 g_hash_table_insert(exports, g_strdup(mountpoint), NULL);
 
 aio_set_fd_handler(exp->common.ctx,
-   fuse_session_fd(exp->fuse_session), true,
+   fuse_session_fd(exp->fuse_session), false,
read_from_fuse_export, NULL, NULL, NULL, exp);
 exp->fd_handler_set_up = true;
 
@@ -246,6 +292,8 @@ static void read_from_fuse_export(void *opaque)
 
 blk_exp_ref(&exp->common);

+qatomic_inc(&exp->in_flight);
+
 do {
 ret = fuse_session_receive_buf(exp->fuse_session, &exp->fuse_buf);
 } while (ret == -EINTR);
@@ -256,6 +304,10 @@ static void read_from_fuse_export(void *opaque)
 fuse_session_process_buf(exp->fuse_session, &exp->fuse_buf);
 
 out:
+if (qatomic_fetch_dec(&exp->in_flight) == 1) {
+aio_wait_kick(); /* wake AIO_WAIT_WHILE() */
+}
+
 blk_exp_unref(&exp->common);
 }
 
@@ -268,7 +320,7 @@ static void fuse_export_shutdown(BlockExport *blk_exp)
 
 if (exp->fd_handler_set_up) {
 aio_set_fd_handler(exp->common.ctx,
-   fuse_session_fd(exp->fuse_session), true,
+   fuse_session_fd(exp->fuse_session), false,
NULL, NULL, NULL, NULL, NULL);
 exp->fd_handler_set_up = false;
 }
-- 
2.40.1




[PATCH v5 09/21] hw/xen: do not use aio_set_fd_handler(is_external=true) in xen_xenstore

2023-05-04 Thread Stefan Hajnoczi
There is no need to suspend activity between aio_disable_external() and
aio_enable_external(), which is mainly used for the block layer's drain
operation.

This is part of ongoing work to remove the aio_disable_external() API.

Reviewed-by: David Woodhouse 
Reviewed-by: Paul Durrant 
Signed-off-by: Stefan Hajnoczi 
---
 hw/i386/kvm/xen_xenstore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/i386/kvm/xen_xenstore.c b/hw/i386/kvm/xen_xenstore.c
index 900679af8a..6e81bc8791 100644
--- a/hw/i386/kvm/xen_xenstore.c
+++ b/hw/i386/kvm/xen_xenstore.c
@@ -133,7 +133,7 @@ static void xen_xenstore_realize(DeviceState *dev, Error 
**errp)
 error_setg(errp, "Xenstore evtchn port init failed");
 return;
 }
-aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), true,
+aio_set_fd_handler(qemu_get_aio_context(), xen_be_evtchn_fd(s->eh), false,
xen_xenstore_event, NULL, NULL, NULL, s);
 
 s->impl = xs_impl_create(xen_domid);
-- 
2.40.1




[PATCH v5 14/21] block/export: rewrite vduse-blk drain code

2023-05-04 Thread Stefan Hajnoczi
vduse_blk_detach_ctx() waits for in-flight requests using
AIO_WAIT_WHILE(). This is not allowed according to a comment in
bdrv_set_aio_context_commit():

  /*
   * Take the old AioContex when detaching it from bs.
   * At this point, new_context lock is already acquired, and we are now
   * also taking old_context. This is safe as long as bdrv_detach_aio_context
   * does not call AIO_POLL_WHILE().
   */

Use this opportunity to rewrite the drain code in vduse-blk:

- Use the BlockExport refcount so that vduse_blk_exp_delete() is only
  called when there are no more requests in flight.

- Implement .drained_poll() so in-flight request coroutines are stopped
  by the time .bdrv_detach_aio_context() is called.

- Remove AIO_WAIT_WHILE() from vduse_blk_detach_ctx() to solve the
  .bdrv_detach_aio_context() constraint violation. It's no longer
  needed due to the previous changes.

- Always handle the VDUSE file descriptor, even in drained sections. The
  VDUSE file descriptor doesn't submit I/O, so it's safe to handle it in
  drained sections. This ensures that the VDUSE kernel code gets a fast
  response.

- Suspend virtqueue fd handlers in .drained_begin() and resume them in
  .drained_end(). This eliminates the need for the
  aio_set_fd_handler(is_external=true) flag, which is being removed from
  QEMU.

This is a long list but splitting it into individual commits would
probably lead to git bisect failures - the changes are all related.

Signed-off-by: Stefan Hajnoczi 
---
 block/export/vduse-blk.c | 132 +++
 1 file changed, 93 insertions(+), 39 deletions(-)

diff --git a/block/export/vduse-blk.c b/block/export/vduse-blk.c
index b53ef39da0..a25556fe04 100644
--- a/block/export/vduse-blk.c
+++ b/block/export/vduse-blk.c
@@ -31,7 +31,8 @@ typedef struct VduseBlkExport {
 VduseDev *dev;
 uint16_t num_queues;
 char *recon_file;
-unsigned int inflight;
+unsigned int inflight; /* atomic */
+bool vqs_started;
 } VduseBlkExport;
 
 typedef struct VduseBlkReq {
@@ -41,13 +42,24 @@ typedef struct VduseBlkReq {
 
 static void vduse_blk_inflight_inc(VduseBlkExport *vblk_exp)
 {
-vblk_exp->inflight++;
+if (qatomic_fetch_inc(&vblk_exp->inflight) == 0) {
+/* Prevent export from being deleted */
+aio_context_acquire(vblk_exp->export.ctx);
+blk_exp_ref(&vblk_exp->export);
+aio_context_release(vblk_exp->export.ctx);
+}
 }
 
 static void vduse_blk_inflight_dec(VduseBlkExport *vblk_exp)
 {
-if (--vblk_exp->inflight == 0) {
+if (qatomic_fetch_dec(&vblk_exp->inflight) == 1) {
+/* Wake AIO_WAIT_WHILE() */
 aio_wait_kick();
+
+/* Now the export can be deleted */
+aio_context_acquire(vblk_exp->export.ctx);
+blk_exp_unref(&vblk_exp->export);
+aio_context_release(vblk_exp->export.ctx);
 }
 }
 
@@ -124,8 +136,12 @@ static void vduse_blk_enable_queue(VduseDev *dev, 
VduseVirtq *vq)
 {
 VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
 
+if (!vblk_exp->vqs_started) {
+return; /* vduse_blk_drained_end() will start vqs later */
+}
+
 aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
-   true, on_vduse_vq_kick, NULL, NULL, NULL, vq);
+   false, on_vduse_vq_kick, NULL, NULL, NULL, vq);
 /* Make sure we don't miss any kick afer reconnecting */
 eventfd_write(vduse_queue_get_fd(vq), 1);
 }
@@ -133,9 +149,14 @@ static void vduse_blk_enable_queue(VduseDev *dev, 
VduseVirtq *vq)
 static void vduse_blk_disable_queue(VduseDev *dev, VduseVirtq *vq)
 {
 VduseBlkExport *vblk_exp = vduse_dev_get_priv(dev);
+int fd = vduse_queue_get_fd(vq);
 
-aio_set_fd_handler(vblk_exp->export.ctx, vduse_queue_get_fd(vq),
-   true, NULL, NULL, NULL, NULL, NULL);
+if (fd < 0) {
+return;
+}
+
+aio_set_fd_handler(vblk_exp->export.ctx, fd, false,
+   NULL, NULL, NULL, NULL, NULL);
 }
 
 static const VduseOps vduse_blk_ops = {
@@ -152,42 +173,19 @@ static void on_vduse_dev_kick(void *opaque)
 
 static void vduse_blk_attach_ctx(VduseBlkExport *vblk_exp, AioContext *ctx)
 {
-int i;
-
 aio_set_fd_handler(vblk_exp->export.ctx, vduse_dev_get_fd(vblk_exp->dev),
-   true, on_vduse_dev_kick, NULL, NULL, NULL,
+   false, on_vduse_dev_kick, NULL, NULL, NULL,
vblk_exp->dev);
 
-for (i = 0; i < vblk_exp->num_queues; i++) {
-VduseVirtq *vq = vduse_dev_get_queue(vblk_exp->dev, i);
-int fd = vduse_queue_get_fd(vq);
-
-if (fd < 0) {
-continue;
-}
-aio_set_fd_handler(vblk_exp->export.ctx, fd, true,
-   on_vduse_vq_kick, NULL, NULL, NULL, vq);
-}
+/* Virtqueues are handled by vduse_blk_drained_end() */
 }
 
 static void vduse_blk_detach_ctx(VduseBlkExport *vblk_exp)
 {
-int i;
-
-for (i = 0; i < vblk_exp->num_queues; i++) 

[PATCH v5 07/21] block/export: wait for vhost-user-blk requests when draining

2023-05-04 Thread Stefan Hajnoczi
Each vhost-user-blk request runs in a coroutine. When the BlockBackend
enters a drained section we need to enter a quiescent state. Currently
any in-flight requests race with bdrv_drained_begin() because it is
unaware of vhost-user-blk requests.

When blk_co_preadv/pwritev()/etc returns it wakes the
bdrv_drained_begin() thread but vhost-user-blk request processing has
not yet finished. The request coroutine continues executing while the
main loop thread thinks it is in a drained section.

One example where this is unsafe is for blk_set_aio_context() where
bdrv_drained_begin() is called before .aio_context_detached() and
.aio_context_attach(). If request coroutines are still running after
bdrv_drained_begin(), then the AioContext could change underneath them
and they race with new requests processed in the new AioContext. This
could lead to virtqueue corruption, for example.

(This example is theoretical, I came across this while reading the
code and have not tried to reproduce it.)

It's easy to make bdrv_drained_begin() wait for in-flight requests: add
a .drained_poll() callback that checks the VuServer's in-flight counter.
VuServer just needs an API that returns true when there are requests in
flight. The in-flight counter needs to be atomic.

Signed-off-by: Stefan Hajnoczi 
---
v5:
- Use atomic accesses for in_flight counter in vhost-user-server.c [Kevin]
---
 include/qemu/vhost-user-server.h |  4 +++-
 block/export/vhost-user-blk-server.c | 13 +
 util/vhost-user-server.c | 18 --
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/include/qemu/vhost-user-server.h b/include/qemu/vhost-user-server.h
index bc0ac9ddb6..b1c1cda886 100644
--- a/include/qemu/vhost-user-server.h
+++ b/include/qemu/vhost-user-server.h
@@ -40,8 +40,9 @@ typedef struct {
 int max_queues;
 const VuDevIface *vu_iface;
 
+unsigned int in_flight; /* atomic */
+
 /* Protected by ctx lock */
-unsigned int in_flight;
 bool wait_idle;
 VuDev vu_dev;
 QIOChannel *ioc; /* The I/O channel with the client */
@@ -62,6 +63,7 @@ void vhost_user_server_stop(VuServer *server);
 
 void vhost_user_server_inc_in_flight(VuServer *server);
 void vhost_user_server_dec_in_flight(VuServer *server);
+bool vhost_user_server_has_in_flight(VuServer *server);
 
 void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx);
 void vhost_user_server_detach_aio_context(VuServer *server);
diff --git a/block/export/vhost-user-blk-server.c 
b/block/export/vhost-user-blk-server.c
index 841acb36e3..f51a36a14f 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -272,7 +272,20 @@ static void vu_blk_exp_resize(void *opaque)
 vu_config_change_msg(&vexp->vu_server.vu_dev);
 }
 
+/*
+ * Ensures that bdrv_drained_begin() waits until in-flight requests complete.
+ *
+ * Called with vexp->export.ctx acquired.
+ */
+static bool vu_blk_drained_poll(void *opaque)
+{
+VuBlkExport *vexp = opaque;
+
+return vhost_user_server_has_in_flight(&vexp->vu_server);
+}
+
 static const BlockDevOps vu_blk_dev_ops = {
+.drained_poll  = vu_blk_drained_poll,
 .resize_cb = vu_blk_exp_resize,
 };
 
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
index 1622f8cfb3..68c3bf162f 100644
--- a/util/vhost-user-server.c
+++ b/util/vhost-user-server.c
@@ -78,17 +78,23 @@ static void panic_cb(VuDev *vu_dev, const char *buf)
 void vhost_user_server_inc_in_flight(VuServer *server)
 {
 assert(!server->wait_idle);
-server->in_flight++;
+qatomic_inc(&server->in_flight);
 }
 
 void vhost_user_server_dec_in_flight(VuServer *server)
 {
-server->in_flight--;
-if (server->wait_idle && !server->in_flight) {
-aio_co_wake(server->co_trip);
+if (qatomic_fetch_dec(&server->in_flight) == 1) {
+if (server->wait_idle) {
+aio_co_wake(server->co_trip);
+}
 }
 }
 
+bool vhost_user_server_has_in_flight(VuServer *server)
+{
+return qatomic_load_acquire(&server->in_flight) > 0;
+}
+
 static bool coroutine_fn
 vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
 {
@@ -192,13 +198,13 @@ static coroutine_fn void vu_client_trip(void *opaque)
 /* Keep running */
 }
 
-if (server->in_flight) {
+if (vhost_user_server_has_in_flight(server)) {
 /* Wait for requests to complete before we can unmap the memory */
 server->wait_idle = true;
 qemu_coroutine_yield();
 server->wait_idle = false;
 }
-assert(server->in_flight == 0);
+assert(!vhost_user_server_has_in_flight(server));
 
 vu_deinit(vu_dev);
 
-- 
2.40.1




[PATCH v5 21/21] aio: remove aio_disable_external() API

2023-05-04 Thread Stefan Hajnoczi
All callers now pass is_external=false to aio_set_fd_handler() and
aio_set_event_notifier(). The aio_disable_external() API that
temporarily disables fd handlers that were registered is_external=true
is therefore dead code.

Remove aio_disable_external(), aio_enable_external(), and the
is_external arguments to aio_set_fd_handler() and
aio_set_event_notifier().

The entire test-fdmon-epoll test is removed because its sole purpose was
testing aio_disable_external().

Parts of this patch were generated using the following coccinelle
(https://coccinelle.lip6.fr/) semantic patch:

  @@
  expression ctx, fd, is_external, io_read, io_write, io_poll, io_poll_ready, 
opaque;
  @@
  - aio_set_fd_handler(ctx, fd, is_external, io_read, io_write, io_poll, 
io_poll_ready, opaque)
  + aio_set_fd_handler(ctx, fd, io_read, io_write, io_poll, io_poll_ready, 
opaque)

  @@
  expression ctx, notifier, is_external, io_read, io_poll, io_poll_ready;
  @@
  - aio_set_event_notifier(ctx, notifier, is_external, io_read, io_poll, 
io_poll_ready)
  + aio_set_event_notifier(ctx, notifier, io_read, io_poll, io_poll_ready)

Reviewed-by: Juan Quintela 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Stefan Hajnoczi 
---
 include/block/aio.h   | 57 ---
 util/aio-posix.h  |  1 -
 block.c   |  7 
 block/blkio.c | 15 +++
 block/curl.c  | 10 ++---
 block/export/fuse.c   |  8 ++--
 block/export/vduse-blk.c  | 10 ++---
 block/io.c|  2 -
 block/io_uring.c  |  4 +-
 block/iscsi.c |  3 +-
 block/linux-aio.c |  4 +-
 block/nfs.c   |  5 +--
 block/nvme.c  |  8 ++--
 block/ssh.c   |  4 +-
 block/win32-aio.c |  6 +--
 hw/i386/kvm/xen_xenstore.c|  2 +-
 hw/virtio/virtio.c|  6 +--
 hw/xen/xen-bus.c  |  8 ++--
 io/channel-command.c  |  6 +--
 io/channel-file.c |  3 +-
 io/channel-socket.c   |  3 +-
 migration/rdma.c  | 16 
 tests/unit/test-aio.c | 27 +
 tests/unit/test-bdrv-drain.c  |  1 -
 tests/unit/test-fdmon-epoll.c | 73 ---
 util/aio-posix.c  | 20 +++---
 util/aio-win32.c  |  8 +---
 util/async.c  |  3 +-
 util/fdmon-epoll.c| 18 +++--
 util/fdmon-io_uring.c |  8 +---
 util/fdmon-poll.c |  3 +-
 util/main-loop.c  |  7 ++--
 util/qemu-coroutine-io.c  |  7 ++--
 util/vhost-user-server.c  | 11 +++---
 tests/unit/meson.build|  3 --
 35 files changed, 82 insertions(+), 295 deletions(-)
 delete mode 100644 tests/unit/test-fdmon-epoll.c

diff --git a/include/block/aio.h b/include/block/aio.h
index 89bbc536f9..32042e8905 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -225,8 +225,6 @@ struct AioContext {
  */
 QEMUTimerListGroup tlg;
 
-int external_disable_cnt;
-
 /* Number of AioHandlers without .io_poll() */
 int poll_disable_cnt;
 
@@ -481,7 +479,6 @@ bool aio_poll(AioContext *ctx, bool blocking);
  */
 void aio_set_fd_handler(AioContext *ctx,
 int fd,
-bool is_external,
 IOHandler *io_read,
 IOHandler *io_write,
 AioPollFn *io_poll,
@@ -497,7 +494,6 @@ void aio_set_fd_handler(AioContext *ctx,
  */
 void aio_set_event_notifier(AioContext *ctx,
 EventNotifier *notifier,
-bool is_external,
 EventNotifierHandler *io_read,
 AioPollFn *io_poll,
 EventNotifierHandler *io_poll_ready);
@@ -626,59 +622,6 @@ static inline void aio_timer_init(AioContext *ctx,
  */
 int64_t aio_compute_timeout(AioContext *ctx);
 
-/**
- * aio_disable_external:
- * @ctx: the aio context
- *
- * Disable the further processing of external clients.
- */
-static inline void aio_disable_external(AioContext *ctx)
-{
-qatomic_inc(&ctx->external_disable_cnt);
-}
-
-/**
- * aio_enable_external:
- * @ctx: the aio context
- *
- * Enable the processing of external clients.
- */
-static inline void aio_enable_external(AioContext *ctx)
-{
-int old;
-
-old = qatomic_fetch_dec(&ctx->external_disable_cnt);
-assert(old > 0);
-if (old == 1) {
-/* Kick event loop so it re-arms file descriptors */
-aio_notify(ctx);
-}
-}
-
-/**
- * aio_external_disabled:
- * @ctx: the aio context
- *
- * Return true if the external clients are disabled.
- */
-static inline bool aio_external_disabled(AioContext *ctx)
-{
-return qatomic_read(&ctx->external_disable_cnt);
-}
-
-/**
- * aio_node_check:
- * @ctx: the aio context
- * @is_external: Whether or not the checked node is an external event source.
- *
- * Check if the node's 

[PATCH v5 00/21] block: remove aio_disable_external() API

2023-05-04 Thread Stefan Hajnoczi
v5:
- Use atomic accesses for in_flight counter in vhost-user-server.c [Kevin]
- Stash SCSIDevice id/lun values for VIRTIO_SCSI_T_TRANSPORT_RESET event
  before unrealizing the SCSIDevice [Kevin]
- Keep vhost-user-blk export .detach() callback so ctx is set to NULL [Kevin]
- Narrow BdrvChildClass and BlockDriver drained_{begin/end/poll} callbacks from
  IO_OR_GS_CODE() to GLOBAL_STATE_CODE() [Kevin]
- Include Kevin's "block: Fix use after free in blockdev_mark_auto_del()" to
  fix a latent bug that was exposed by this series

v4:
- Remove external_disable_cnt variable [Philippe]
- Add Patch 1 to fix assertion failure in .drained_end() -> 
blk_get_aio_context()
v3:
- Resend full patch series. v2 was sent in the middle of a git rebase and was
  missing patches. [Eric]
- Apply Reviewed-by tags.
v2:
- Do not rely on BlockBackend request queuing, implement .drained_begin/end()
  instead in xen-block, virtio-blk, and virtio-scsi [Paolo]
- Add qdev_is_realized() API [Philippe]
- Add patch to avoid AioContext lock around blk_exp_ref/unref() [Paolo]
- Add patch to call .drained_begin/end() from main loop thread to simplify
  callback implementations

The aio_disable_external() API temporarily suspends file descriptor monitoring
in the event loop. The block layer uses this to prevent new I/O requests being
submitted from the guest and elsewhere between bdrv_drained_begin() and
bdrv_drained_end().

While the block layer still needs to prevent new I/O requests in drained
sections, the aio_disable_external() API can be replaced with
.drained_begin/end/poll() callbacks that have been added to BdrvChildClass and
BlockDevOps.

This newer .drained_begin/end/poll() approach is attractive because it works
without specifying a specific AioContext. The block layer is moving towards
multi-queue and that means multiple AioContexts may be processing I/O
simultaneously.

The aio_disable_external() was always somewhat hacky. It suspends all file
descriptors that were registered with is_external=true, even if they have
nothing to do with the BlockDriverState graph nodes that are being drained.
It's better to solve a block layer problem in the block layer than to have an
odd event loop API solution.

The approach in this patch series is to implement BlockDevOps
.drained_begin/end() callbacks that temporarily stop file descriptor handlers.
This ensures that new I/O requests are not submitted in drained sections.

Kevin Wolf (1):
  block: Fix use after free in blockdev_mark_auto_del()

Stefan Hajnoczi (20):
  block-backend: split blk_do_set_aio_context()
  hw/qdev: introduce qdev_is_realized() helper
  virtio-scsi: avoid race between unplug and transport event
  virtio-scsi: stop using aio_disable_external() during unplug
  util/vhost-user-server: rename refcount to in_flight counter
  block/export: wait for vhost-user-blk requests when draining
  block/export: stop using is_external in vhost-user-blk server
  hw/xen: do not use aio_set_fd_handler(is_external=true) in
xen_xenstore
  block: add blk_in_drain() API
  block: drain from main loop thread in bdrv_co_yield_to_drain()
  xen-block: implement BlockDevOps->drained_begin()
  hw/xen: do not set is_external=true on evtchn fds
  block/export: rewrite vduse-blk drain code
  block/export: don't require AioContext lock around blk_exp_ref/unref()
  block/fuse: do not set is_external=true on FUSE fd
  virtio: make it possible to detach host notifier from any thread
  virtio-blk: implement BlockDevOps->drained_begin()
  virtio-scsi: implement BlockDevOps->drained_begin()
  virtio: do not set is_external=true on host notifiers
  aio: remove aio_disable_external() API

 hw/block/dataplane/xen-block.h  |   2 +
 include/block/aio.h |  57 -
 include/block/block_int-common.h|  90 +++---
 include/block/export.h  |   2 +
 include/hw/qdev-core.h  |  17 ++-
 include/hw/scsi/scsi.h  |  14 +++
 include/qemu/vhost-user-server.h|   8 +-
 include/sysemu/block-backend-common.h   |  25 ++--
 include/sysemu/block-backend-global-state.h |   1 +
 util/aio-posix.h|   1 -
 block.c |   7 --
 block/blkio.c   |  15 +--
 block/block-backend.c   |  78 ++--
 block/curl.c|  10 +-
 block/export/export.c   |  13 +-
 block/export/fuse.c |  56 -
 block/export/vduse-blk.c| 128 ++--
 block/export/vhost-user-blk-server.c|  52 +++-
 block/io.c  |  16 ++-
 block/io_uring.c|   4 +-
 block/iscsi.c   |   3 +-
 block/linux-aio.c   |   4 +-
 block/nfs.c |   5 +-
 block/nvme.c   

[PATCH v5 05/21] virtio-scsi: stop using aio_disable_external() during unplug

2023-05-04 Thread Stefan Hajnoczi
This patch is part of an effort to remove the aio_disable_external()
API because it does not fit in a multi-queue block layer world where
many AioContexts may be submitting requests to the same disk.

The SCSI emulation code is already in good shape to stop using
aio_disable_external(). It was only used by commit 9c5aad84da1c
("virtio-scsi: fixed virtio_scsi_ctx_check failed when detaching scsi
disk") to ensure that virtio_scsi_hotunplug() works while the guest
driver is submitting I/O.

Ensure virtio_scsi_hotunplug() is safe as follows:

1. qdev_simple_device_unplug_cb() -> qdev_unrealize() ->
   device_set_realized() calls qatomic_set(>realized, false) so
   that future scsi_device_get() calls return NULL because they exclude
   SCSIDevices with realized=false.

   That means virtio-scsi will reject new I/O requests to this
   SCSIDevice with VIRTIO_SCSI_S_BAD_TARGET even while
   virtio_scsi_hotunplug() is still executing. We are protected against
   new requests!

2. scsi_device_unrealize() already contains a call to
   scsi_device_purge_requests() so that in-flight requests are cancelled
   synchronously. This ensures that no in-flight requests remain once
   qdev_simple_device_unplug_cb() returns.

Thanks to these two conditions we don't need aio_disable_external()
anymore.

Cc: Zhengui Li 
Reviewed-by: Paolo Bonzini 
Reviewed-by: Daniil Tatianin 
Signed-off-by: Stefan Hajnoczi 
---
 hw/scsi/virtio-scsi.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index ae314af3de..c1a7ea9ae2 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -1091,7 +1091,6 @@ static void virtio_scsi_hotunplug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
 VirtIOSCSI *s = VIRTIO_SCSI(vdev);
 SCSIDevice *sd = SCSI_DEVICE(dev);
-AioContext *ctx = s->ctx ?: qemu_get_aio_context();
 VirtIOSCSIEventInfo info = {
 .event   = VIRTIO_SCSI_T_TRANSPORT_RESET,
 .reason  = VIRTIO_SCSI_EVT_RESET_REMOVED,
@@ -1101,9 +1100,7 @@ static void virtio_scsi_hotunplug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 },
 };
 
-aio_disable_external(ctx);
 qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
-aio_enable_external(ctx);
 
 if (s->ctx) {
 virtio_scsi_acquire(s);
-- 
2.40.1




[PATCH v5 17/21] virtio: make it possible to detach host notifier from any thread

2023-05-04 Thread Stefan Hajnoczi
virtio_queue_aio_detach_host_notifier() does two things:
1. It removes the fd handler from the event loop.
2. It processes the virtqueue one last time.

The first step can be performed by any thread and without taking the
AioContext lock.

The second step may need the AioContext lock (depending on the device
implementation) and runs in the thread where request processing takes
place. virtio-blk and virtio-scsi therefore call
virtio_queue_aio_detach_host_notifier() from a BH that is scheduled in
AioContext

Scheduling a BH is undesirable for .drained_begin() functions. The next
patch will introduce a .drained_begin() function that needs to call
virtio_queue_aio_detach_host_notifier().

Move the virtqueue processing out to the callers of
virtio_queue_aio_detach_host_notifier() so that the function can be
called from any thread. This is in preparation for the next patch.

Signed-off-by: Stefan Hajnoczi 
---
 hw/block/dataplane/virtio-blk.c | 2 ++
 hw/scsi/virtio-scsi-dataplane.c | 9 +
 2 files changed, 11 insertions(+)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index a6202997ee..27eafa6c92 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -287,8 +287,10 @@ static void virtio_blk_data_plane_stop_bh(void *opaque)
 
 for (i = 0; i < s->conf->num_queues; i++) {
 VirtQueue *vq = virtio_get_queue(s->vdev, i);
+EventNotifier *host_notifier = virtio_queue_get_host_notifier(vq);
 
 virtio_queue_aio_detach_host_notifier(vq, s->ctx);
+virtio_queue_host_notifier_read(host_notifier);
 }
 }
 
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index 20bb91766e..81643445ed 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -71,12 +71,21 @@ static void virtio_scsi_dataplane_stop_bh(void *opaque)
 {
 VirtIOSCSI *s = opaque;
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+EventNotifier *host_notifier;
 int i;
 
 virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx);
+host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq);
+virtio_queue_host_notifier_read(host_notifier);
+
 virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
+host_notifier = virtio_queue_get_host_notifier(vs->event_vq);
+virtio_queue_host_notifier_read(host_notifier);
+
 for (i = 0; i < vs->conf.num_queues; i++) {
 virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
+host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]);
+virtio_queue_host_notifier_read(host_notifier);
 }
 }
 
-- 
2.40.1




[PATCH v5 02/21] block-backend: split blk_do_set_aio_context()

2023-05-04 Thread Stefan Hajnoczi
blk_set_aio_context() is not fully transactional because
blk_do_set_aio_context() updates blk->ctx outside the transaction. Most
of the time this goes unnoticed but a BlockDevOps.drained_end() callback
that invokes blk_get_aio_context() fails assert(ctx == blk->ctx). This
happens because blk->ctx is only assigned after
BlockDevOps.drained_end() is called and we're in an intermediate state
where BlockDrvierState nodes already have the new context and the
BlockBackend still has the old context.

Making blk_set_aio_context() fully transactional solves this assertion
failure because the BlockBackend's context is updated as part of the
transaction (before BlockDevOps.drained_end() is called).

Split blk_do_set_aio_context() in order to solve this assertion failure.
This helper function actually serves two different purposes:
1. It drives blk_set_aio_context().
2. It responds to BdrvChildClass->change_aio_ctx().

Get rid of the helper function. Do #1 inside blk_set_aio_context() and
do #2 inside blk_root_set_aio_ctx_commit(). This simplifies the code.

The only drawback of the fully transactional approach is that
blk_set_aio_context() must contend with blk_root_set_aio_ctx_commit()
being invoked as part of the AioContext change propagation. This can be
solved by temporarily setting blk->allow_aio_context_change to true.

Future patches call blk_get_aio_context() from
BlockDevOps->drained_end(), so this patch will become necessary.

Signed-off-by: Stefan Hajnoczi 
---
 block/block-backend.c | 71 +--
 1 file changed, 28 insertions(+), 43 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index fc530ded6a..68d38635bc 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2205,52 +2205,31 @@ static AioContext *blk_aiocb_get_aio_context(BlockAIOCB 
*acb)
 return blk_get_aio_context(blk_acb->blk);
 }
 
-static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
-  bool update_root_node, Error **errp)
-{
-BlockDriverState *bs = blk_bs(blk);
-ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
-int ret;
-
-if (bs) {
-bdrv_ref(bs);
-
-if (update_root_node) {
-/*
- * update_root_node MUST be false for 
blk_root_set_aio_ctx_commit(),
- * as we are already in the commit function of a transaction.
- */
-ret = bdrv_try_change_aio_context(bs, new_context, blk->root, 
errp);
-if (ret < 0) {
-bdrv_unref(bs);
-return ret;
-}
-}
-/*
- * Make blk->ctx consistent with the root node before we invoke any
- * other operations like drain that might inquire blk->ctx
- */
-blk->ctx = new_context;
-if (tgm->throttle_state) {
-bdrv_drained_begin(bs);
-throttle_group_detach_aio_context(tgm);
-throttle_group_attach_aio_context(tgm, new_context);
-bdrv_drained_end(bs);
-}
-
-bdrv_unref(bs);
-} else {
-blk->ctx = new_context;
-}
-
-return 0;
-}
-
 int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
 Error **errp)
 {
+bool old_allow_change;
+BlockDriverState *bs = blk_bs(blk);
+int ret;
+
 GLOBAL_STATE_CODE();
-return blk_do_set_aio_context(blk, new_context, true, errp);
+
+if (!bs) {
+blk->ctx = new_context;
+return 0;
+}
+
+bdrv_ref(bs);
+
+old_allow_change = blk->allow_aio_context_change;
+blk->allow_aio_context_change = true;
+
+ret = bdrv_try_change_aio_context(bs, new_context, NULL, errp);
+
+blk->allow_aio_context_change = old_allow_change;
+
+bdrv_unref(bs);
+return ret;
 }
 
 typedef struct BdrvStateBlkRootContext {
@@ -2262,8 +2241,14 @@ static void blk_root_set_aio_ctx_commit(void *opaque)
 {
 BdrvStateBlkRootContext *s = opaque;
 BlockBackend *blk = s->blk;
+AioContext *new_context = s->new_ctx;
+ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 
-blk_do_set_aio_context(blk, s->new_ctx, false, &error_abort);
+blk->ctx = new_context;
+if (tgm->throttle_state) {
+throttle_group_detach_aio_context(tgm);
+throttle_group_attach_aio_context(tgm, new_context);
+}
 }
 
 static TransactionActionDrv set_blk_root_context = {
-- 
2.40.1




[PATCH v5 18/21] virtio-blk: implement BlockDevOps->drained_begin()

2023-05-04 Thread Stefan Hajnoczi
Detach ioeventfds during drained sections to stop I/O submission from
the guest. virtio-blk is no longer reliant on aio_disable_external()
after this patch. This will allow us to remove the
aio_disable_external() API once all other code that relies on it is
converted.

Take extra care to avoid attaching/detaching ioeventfds if the data
plane is started/stopped during a drained section. This should be rare,
but maybe the mirror block job can trigger it.

Signed-off-by: Stefan Hajnoczi 
---
 hw/block/dataplane/virtio-blk.c | 17 +--
 hw/block/virtio-blk.c   | 38 -
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 27eafa6c92..c0d2663abc 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -246,13 +246,15 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
 }
 
 /* Get this show started by hooking up our callbacks */
-aio_context_acquire(s->ctx);
-for (i = 0; i < nvqs; i++) {
-VirtQueue *vq = virtio_get_queue(s->vdev, i);
+if (!blk_in_drain(s->conf->conf.blk)) {
+aio_context_acquire(s->ctx);
+for (i = 0; i < nvqs; i++) {
+VirtQueue *vq = virtio_get_queue(s->vdev, i);
 
-virtio_queue_aio_attach_host_notifier(vq, s->ctx);
+virtio_queue_aio_attach_host_notifier(vq, s->ctx);
+}
+aio_context_release(s->ctx);
 }
-aio_context_release(s->ctx);
 return 0;
 
   fail_aio_context:
@@ -318,7 +320,10 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
 trace_virtio_blk_data_plane_stop(s);
 
 aio_context_acquire(s->ctx);
-aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
+
+if (!blk_in_drain(s->conf->conf.blk)) {
+aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
+}
 
 /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
 blk_drain(s->conf->conf.blk);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index cefca93b31..d8dedc575c 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -1109,8 +1109,44 @@ static void virtio_blk_resize(void *opaque)
 aio_bh_schedule_oneshot(qemu_get_aio_context(), virtio_resize_cb, vdev);
 }
 
+/* Suspend virtqueue ioeventfd processing during drain */
+static void virtio_blk_drained_begin(void *opaque)
+{
+VirtIOBlock *s = opaque;
+VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
+AioContext *ctx = blk_get_aio_context(s->conf.conf.blk);
+
+if (!s->dataplane || !s->dataplane_started) {
+return;
+}
+
+for (uint16_t i = 0; i < s->conf.num_queues; i++) {
+VirtQueue *vq = virtio_get_queue(vdev, i);
+virtio_queue_aio_detach_host_notifier(vq, ctx);
+}
+}
+
+/* Resume virtqueue ioeventfd processing after drain */
+static void virtio_blk_drained_end(void *opaque)
+{
+VirtIOBlock *s = opaque;
+VirtIODevice *vdev = VIRTIO_DEVICE(opaque);
+AioContext *ctx = blk_get_aio_context(s->conf.conf.blk);
+
+if (!s->dataplane || !s->dataplane_started) {
+return;
+}
+
+for (uint16_t i = 0; i < s->conf.num_queues; i++) {
+VirtQueue *vq = virtio_get_queue(vdev, i);
+virtio_queue_aio_attach_host_notifier(vq, ctx);
+}
+}
+
 static const BlockDevOps virtio_block_ops = {
-.resize_cb = virtio_blk_resize,
+.resize_cb = virtio_blk_resize,
+.drained_begin = virtio_blk_drained_begin,
+.drained_end   = virtio_blk_drained_end,
 };
 
 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
-- 
2.40.1




[PATCH v5 19/21] virtio-scsi: implement BlockDevOps->drained_begin()

2023-05-04 Thread Stefan Hajnoczi
The virtio-scsi Host Bus Adapter provides access to devices on a SCSI
bus. Those SCSI devices typically have a BlockBackend. When the
BlockBackend enters a drained section, the SCSI device must temporarily
stop submitting new I/O requests.

Implement this behavior by temporarily stopping virtio-scsi virtqueue
processing when one of the SCSI devices enters a drained section. The
new scsi_device_drained_begin() API allows scsi-disk to message the
virtio-scsi HBA.

scsi_device_drained_begin() uses a drain counter so that multiple SCSI
devices can have overlapping drained sections. The HBA only sees one
pair of .drained_begin/end() calls.

After this commit, virtio-scsi no longer depends on hw/virtio's
ioeventfd aio_set_event_notifier(is_external=true). This commit is a
step towards removing the aio_disable_external() API.

Signed-off-by: Stefan Hajnoczi 
---
 include/hw/scsi/scsi.h  | 14 
 hw/scsi/scsi-bus.c  | 40 +
 hw/scsi/scsi-disk.c | 27 +-
 hw/scsi/virtio-scsi-dataplane.c | 22 ++
 hw/scsi/virtio-scsi.c   | 38 +++
 hw/scsi/trace-events|  2 ++
 6 files changed, 129 insertions(+), 14 deletions(-)

diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 6f23a7a73e..e2bb1a2fbf 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -133,6 +133,16 @@ struct SCSIBusInfo {
 void (*save_request)(QEMUFile *f, SCSIRequest *req);
 void *(*load_request)(QEMUFile *f, SCSIRequest *req);
 void (*free_request)(SCSIBus *bus, void *priv);
+
+/*
+ * Temporarily stop submitting new requests between drained_begin() and
+ * drained_end(). Called from the main loop thread with the BQL held.
+ *
+ * Implement these callbacks if request processing is triggered by a file
+ * descriptor like an EventNotifier. Otherwise set them to NULL.
+ */
+void (*drained_begin)(SCSIBus *bus);
+void (*drained_end)(SCSIBus *bus);
 };
 
 #define TYPE_SCSI_BUS "SCSI"
@@ -144,6 +154,8 @@ struct SCSIBus {
 
 SCSISense unit_attention;
 const SCSIBusInfo *info;
+
+int drain_count; /* protected by BQL */
 };
 
 /**
@@ -213,6 +225,8 @@ void scsi_req_cancel_complete(SCSIRequest *req);
 void scsi_req_cancel(SCSIRequest *req);
 void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier);
 void scsi_req_retry(SCSIRequest *req);
+void scsi_device_drained_begin(SCSIDevice *sdev);
+void scsi_device_drained_end(SCSIDevice *sdev);
 void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense);
 void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense);
 void scsi_device_report_change(SCSIDevice *dev, SCSISense sense);
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 64013c8a24..f80f4cb4fc 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1669,6 +1669,46 @@ void scsi_device_purge_requests(SCSIDevice *sdev, 
SCSISense sense)
 scsi_device_set_ua(sdev, sense);
 }
 
+void scsi_device_drained_begin(SCSIDevice *sdev)
+{
+SCSIBus *bus = DO_UPCAST(SCSIBus, qbus, sdev->qdev.parent_bus);
+if (!bus) {
+return;
+}
+
+assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+assert(bus->drain_count < INT_MAX);
+
+/*
+ * Multiple BlockBackends can be on a SCSIBus and each may begin/end
+ * draining at any time. Keep a counter so HBAs only see begin/end once.
+ */
+if (bus->drain_count++ == 0) {
+trace_scsi_bus_drained_begin(bus, sdev);
+if (bus->info->drained_begin) {
+bus->info->drained_begin(bus);
+}
+}
+}
+
+void scsi_device_drained_end(SCSIDevice *sdev)
+{
+SCSIBus *bus = DO_UPCAST(SCSIBus, qbus, sdev->qdev.parent_bus);
+if (!bus) {
+return;
+}
+
+assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+assert(bus->drain_count > 0);
+
+if (bus->drain_count-- == 1) {
+trace_scsi_bus_drained_end(bus, sdev);
+if (bus->info->drained_end) {
+bus->info->drained_end(bus);
+}
+}
+}
+
 static char *scsibus_get_dev_path(DeviceState *dev)
 {
 SCSIDevice *d = SCSI_DEVICE(dev);
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 97c9b1c8cd..e0d79c7966 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2360,6 +2360,20 @@ static void scsi_disk_reset(DeviceState *dev)
 s->qdev.scsi_version = s->qdev.default_scsi_version;
 }
 
+static void scsi_disk_drained_begin(void *opaque)
+{
+SCSIDiskState *s = opaque;
+
+scsi_device_drained_begin(&s->qdev);
+}
+
+static void scsi_disk_drained_end(void *opaque)
+{
+SCSIDiskState *s = opaque;
+
+scsi_device_drained_end(&s->qdev);
+}
+
 static void scsi_disk_resize_cb(void *opaque)
 {
 SCSIDiskState *s = opaque;
@@ -2414,16 +2428,19 @@ static bool scsi_cd_is_medium_locked(void *opaque)
 }
 
 static const BlockDevOps scsi_disk_removable_block_ops = {
-

[PATCH v5 12/21] xen-block: implement BlockDevOps->drained_begin()

2023-05-04 Thread Stefan Hajnoczi
Detach event channels during drained sections to stop I/O submission
from the ring. xen-block is no longer reliant on aio_disable_external()
after this patch. This will allow us to remove the
aio_disable_external() API once all other code that relies on it is
converted.

Extend xen_device_set_event_channel_context() to allow ctx=NULL. The
event channel still exists but the event loop does not monitor the file
descriptor. Event channel processing can resume by calling
xen_device_set_event_channel_context() with a non-NULL ctx.

Factor out xen_device_set_event_channel_context() calls in
hw/block/dataplane/xen-block.c into attach/detach helper functions.
Incidentally, these don't require the AioContext lock because
aio_set_fd_handler() is thread-safe.

It's safer to register BlockDevOps after the dataplane instance has been
created. The BlockDevOps .drained_begin/end() callbacks depend on the
dataplane instance, so move the blk_set_dev_ops() call after
xen_block_dataplane_create().

Signed-off-by: Stefan Hajnoczi 
---
 hw/block/dataplane/xen-block.h |  2 ++
 hw/block/dataplane/xen-block.c | 42 +-
 hw/block/xen-block.c   | 24 ---
 hw/xen/xen-bus.c   |  7 --
 4 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/hw/block/dataplane/xen-block.h b/hw/block/dataplane/xen-block.h
index 76dcd51c3d..7b8e9df09f 100644
--- a/hw/block/dataplane/xen-block.h
+++ b/hw/block/dataplane/xen-block.h
@@ -26,5 +26,7 @@ void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
unsigned int protocol,
Error **errp);
 void xen_block_dataplane_stop(XenBlockDataPlane *dataplane);
+void xen_block_dataplane_attach(XenBlockDataPlane *dataplane);
+void xen_block_dataplane_detach(XenBlockDataPlane *dataplane);
 
 #endif /* HW_BLOCK_DATAPLANE_XEN_BLOCK_H */
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
index d8bc39d359..2597f38805 100644
--- a/hw/block/dataplane/xen-block.c
+++ b/hw/block/dataplane/xen-block.c
@@ -664,6 +664,30 @@ void xen_block_dataplane_destroy(XenBlockDataPlane 
*dataplane)
 g_free(dataplane);
 }
 
+void xen_block_dataplane_detach(XenBlockDataPlane *dataplane)
+{
+if (!dataplane || !dataplane->event_channel) {
+return;
+}
+
+/* Only reason for failure is a NULL channel */
+xen_device_set_event_channel_context(dataplane->xendev,
+ dataplane->event_channel,
+ NULL, &error_abort);
+}
+
+void xen_block_dataplane_attach(XenBlockDataPlane *dataplane)
+{
+if (!dataplane || !dataplane->event_channel) {
+return;
+}
+
+/* Only reason for failure is a NULL channel */
+xen_device_set_event_channel_context(dataplane->xendev,
+ dataplane->event_channel,
+ dataplane->ctx, &error_abort);
+}
+
 void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
 {
 XenDevice *xendev;
@@ -674,13 +698,11 @@ void xen_block_dataplane_stop(XenBlockDataPlane 
*dataplane)
 
 xendev = dataplane->xendev;
 
-aio_context_acquire(dataplane->ctx);
-if (dataplane->event_channel) {
-/* Only reason for failure is a NULL channel */
-xen_device_set_event_channel_context(xendev, dataplane->event_channel,
- qemu_get_aio_context(),
- &error_abort);
+if (!blk_in_drain(dataplane->blk)) {
+xen_block_dataplane_detach(dataplane);
 }
+
+aio_context_acquire(dataplane->ctx);
 /* Xen doesn't have multiple users for nodes, so this can't fail */
 blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), _abort);
 aio_context_release(dataplane->ctx);
@@ -819,11 +841,9 @@ void xen_block_dataplane_start(XenBlockDataPlane 
*dataplane,
 blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
 aio_context_release(old_context);
 
-/* Only reason for failure is a NULL channel */
-aio_context_acquire(dataplane->ctx);
-xen_device_set_event_channel_context(xendev, dataplane->event_channel,
- dataplane->ctx, &error_abort);
-aio_context_release(dataplane->ctx);
+if (!blk_in_drain(dataplane->blk)) {
+xen_block_dataplane_attach(dataplane);
+}
 
 return;
 
diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index f5a744589d..f099914831 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -189,8 +189,26 @@ static void xen_block_resize_cb(void *opaque)
 xen_device_backend_printf(xendev, "state", "%u", state);
 }
 
+/* Suspend request handling */
+static void xen_block_drained_begin(void *opaque)
+{
+XenBlockDevice *blockdev = opaque;
+
+xen_block_dataplane_detach(blockdev->dataplane);
+}
+
+/* Resume request handling */
+static void xen_block_drained_end(void *opaque)

[PATCH v5 10/21] block: add blk_in_drain() API

2023-05-04 Thread Stefan Hajnoczi
The BlockBackend quiesce_counter is greater than zero during drained
sections. Add an API to check whether the BlockBackend is in a drained
section.

The next patch will use this API.

Signed-off-by: Stefan Hajnoczi 
---
 include/sysemu/block-backend-global-state.h | 1 +
 block/block-backend.c   | 7 +++
 2 files changed, 8 insertions(+)

diff --git a/include/sysemu/block-backend-global-state.h 
b/include/sysemu/block-backend-global-state.h
index 2b6d27db7c..ac7cbd6b5e 100644
--- a/include/sysemu/block-backend-global-state.h
+++ b/include/sysemu/block-backend-global-state.h
@@ -78,6 +78,7 @@ void blk_activate(BlockBackend *blk, Error **errp);
 int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags);
 void blk_aio_cancel(BlockAIOCB *acb);
 int blk_commit_all(void);
+bool blk_in_drain(BlockBackend *blk);
 void blk_drain(BlockBackend *blk);
 void blk_drain_all(void);
 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
diff --git a/block/block-backend.c b/block/block-backend.c
index 68d38635bc..96f03cae95 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1270,6 +1270,13 @@ blk_check_byte_request(BlockBackend *blk, int64_t 
offset, int64_t bytes)
 return 0;
 }
 
+/* Are we currently in a drained section? */
+bool blk_in_drain(BlockBackend *blk)
+{
+GLOBAL_STATE_CODE(); /* change to IO_OR_GS_CODE(), if necessary */
+return qatomic_read(&blk->quiesce_counter);
+}
+
 /* To be called between exactly one pair of blk_inc/dec_in_flight() */
 static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
 {
-- 
2.40.1




[PATCH v5 08/21] block/export: stop using is_external in vhost-user-blk server

2023-05-04 Thread Stefan Hajnoczi
vhost-user activity must be suspended during bdrv_drained_begin/end().
This prevents new requests from interfering with whatever is happening
in the drained section.

Previously this was done using aio_set_fd_handler()'s is_external
argument. In a multi-queue block layer world the aio_disable_external()
API cannot be used since multiple AioContext may be processing I/O, not
just one.

Switch to BlockDevOps->drained_begin/end() callbacks.

Signed-off-by: Stefan Hajnoczi 
---
 block/export/vhost-user-blk-server.c | 28 ++--
 util/vhost-user-server.c | 10 +-
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/block/export/vhost-user-blk-server.c 
b/block/export/vhost-user-blk-server.c
index f51a36a14f..81b59761e3 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -212,15 +212,21 @@ static void blk_aio_attached(AioContext *ctx, void 
*opaque)
 {
 VuBlkExport *vexp = opaque;
 
+/*
+ * The actual attach will happen in vu_blk_drained_end() and we just
+ * restore ctx here.
+ */
 vexp->export.ctx = ctx;
-vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
 }
 
 static void blk_aio_detach(void *opaque)
 {
 VuBlkExport *vexp = opaque;
 
-vhost_user_server_detach_aio_context(&vexp->vu_server);
+/*
+ * The actual detach already happened in vu_blk_drained_begin() but from
+ * this point on we must not access ctx anymore.
+ */
 vexp->export.ctx = NULL;
 }
 
@@ -272,6 +278,22 @@ static void vu_blk_exp_resize(void *opaque)
vu_config_change_msg(&vexp->vu_server.vu_dev);
 }
 
+/* Called with vexp->export.ctx acquired */
+static void vu_blk_drained_begin(void *opaque)
+{
+VuBlkExport *vexp = opaque;
+
+vhost_user_server_detach_aio_context(&vexp->vu_server);
+}
+
+/* Called with vexp->export.blk AioContext acquired */
+static void vu_blk_drained_end(void *opaque)
+{
+VuBlkExport *vexp = opaque;
+
+vhost_user_server_attach_aio_context(&vexp->vu_server, vexp->export.ctx);
+}
+
 /*
  * Ensures that bdrv_drained_begin() waits until in-flight requests complete.
  *
@@ -285,6 +307,8 @@ static bool vu_blk_drained_poll(void *opaque)
 }
 
 static const BlockDevOps vu_blk_dev_ops = {
+.drained_begin = vu_blk_drained_begin,
+.drained_end   = vu_blk_drained_end,
 .drained_poll  = vu_blk_drained_poll,
 .resize_cb = vu_blk_exp_resize,
 };
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
index 68c3bf162f..a12b2d1bba 100644
--- a/util/vhost-user-server.c
+++ b/util/vhost-user-server.c
@@ -278,7 +278,7 @@ set_watch(VuDev *vu_dev, int fd, int vu_evt,
 vu_fd_watch->fd = fd;
 vu_fd_watch->cb = cb;
 qemu_socket_set_nonblock(fd);
-aio_set_fd_handler(server->ioc->ctx, fd, true, kick_handler,
+aio_set_fd_handler(server->ioc->ctx, fd, false, kick_handler,
NULL, NULL, NULL, vu_fd_watch);
 vu_fd_watch->vu_dev = vu_dev;
 vu_fd_watch->pvt = pvt;
@@ -299,7 +299,7 @@ static void remove_watch(VuDev *vu_dev, int fd)
 if (!vu_fd_watch) {
 return;
 }
-aio_set_fd_handler(server->ioc->ctx, fd, true,
+aio_set_fd_handler(server->ioc->ctx, fd, false,
NULL, NULL, NULL, NULL, NULL);
 
 QTAILQ_REMOVE(>vu_fd_watches, vu_fd_watch, next);
@@ -362,7 +362,7 @@ void vhost_user_server_stop(VuServer *server)
 VuFdWatch *vu_fd_watch;
 
QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
-aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
+aio_set_fd_handler(server->ctx, vu_fd_watch->fd, false,
NULL, NULL, NULL, NULL, vu_fd_watch);
 }
 
@@ -403,7 +403,7 @@ void vhost_user_server_attach_aio_context(VuServer *server, 
AioContext *ctx)
 qio_channel_attach_aio_context(server->ioc, ctx);
 
QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
-aio_set_fd_handler(ctx, vu_fd_watch->fd, true, kick_handler, NULL,
+aio_set_fd_handler(ctx, vu_fd_watch->fd, false, kick_handler, NULL,
NULL, NULL, vu_fd_watch);
 }
 
@@ -417,7 +417,7 @@ void vhost_user_server_detach_aio_context(VuServer *server)
 VuFdWatch *vu_fd_watch;
 
QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
-aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
+aio_set_fd_handler(server->ctx, vu_fd_watch->fd, false,
NULL, NULL, NULL, NULL, vu_fd_watch);
 }
 
-- 
2.40.1




[PATCH v5 06/21] util/vhost-user-server: rename refcount to in_flight counter

2023-05-04 Thread Stefan Hajnoczi
The VuServer object has a refcount field and ref/unref APIs. The name is
confusing because it's actually an in-flight request counter instead of
a refcount.

Normally a refcount destroys the object upon reaching zero. The VuServer
counter is used to wake up the vhost-user coroutine when there are no
more requests.

Avoid confusing by renaming refcount and ref/unref to in_flight and
inc/dec.

Reviewed-by: Paolo Bonzini 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Stefan Hajnoczi 
---
 include/qemu/vhost-user-server.h |  6 +++---
 block/export/vhost-user-blk-server.c | 11 +++
 util/vhost-user-server.c | 14 +++---
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/include/qemu/vhost-user-server.h b/include/qemu/vhost-user-server.h
index 25c72433ca..bc0ac9ddb6 100644
--- a/include/qemu/vhost-user-server.h
+++ b/include/qemu/vhost-user-server.h
@@ -41,7 +41,7 @@ typedef struct {
 const VuDevIface *vu_iface;
 
 /* Protected by ctx lock */
-unsigned int refcount;
+unsigned int in_flight;
 bool wait_idle;
 VuDev vu_dev;
 QIOChannel *ioc; /* The I/O channel with the client */
@@ -60,8 +60,8 @@ bool vhost_user_server_start(VuServer *server,
 
 void vhost_user_server_stop(VuServer *server);
 
-void vhost_user_server_ref(VuServer *server);
-void vhost_user_server_unref(VuServer *server);
+void vhost_user_server_inc_in_flight(VuServer *server);
+void vhost_user_server_dec_in_flight(VuServer *server);
 
 void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx);
 void vhost_user_server_detach_aio_context(VuServer *server);
diff --git a/block/export/vhost-user-blk-server.c 
b/block/export/vhost-user-blk-server.c
index e56b92f2e2..841acb36e3 100644
--- a/block/export/vhost-user-blk-server.c
+++ b/block/export/vhost-user-blk-server.c
@@ -50,7 +50,10 @@ static void vu_blk_req_complete(VuBlkReq *req, size_t in_len)
 free(req);
 }
 
-/* Called with server refcount increased, must decrease before returning */
+/*
+ * Called with server in_flight counter increased, must decrease before
+ * returning.
+ */
 static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
 {
 VuBlkReq *req = opaque;
@@ -68,12 +71,12 @@ static void coroutine_fn vu_blk_virtio_process_req(void 
*opaque)
 in_num, out_num);
 if (in_len < 0) {
 free(req);
-vhost_user_server_unref(server);
+vhost_user_server_dec_in_flight(server);
 return;
 }
 
 vu_blk_req_complete(req, in_len);
-vhost_user_server_unref(server);
+vhost_user_server_dec_in_flight(server);
 }
 
 static void vu_blk_process_vq(VuDev *vu_dev, int idx)
@@ -95,7 +98,7 @@ static void vu_blk_process_vq(VuDev *vu_dev, int idx)
 Coroutine *co =
 qemu_coroutine_create(vu_blk_virtio_process_req, req);
 
-vhost_user_server_ref(server);
+vhost_user_server_inc_in_flight(server);
 qemu_coroutine_enter(co);
 }
 }
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
index 5b6216069c..1622f8cfb3 100644
--- a/util/vhost-user-server.c
+++ b/util/vhost-user-server.c
@@ -75,16 +75,16 @@ static void panic_cb(VuDev *vu_dev, const char *buf)
 error_report("vu_panic: %s", buf);
 }
 
-void vhost_user_server_ref(VuServer *server)
+void vhost_user_server_inc_in_flight(VuServer *server)
 {
 assert(!server->wait_idle);
-server->refcount++;
+server->in_flight++;
 }
 
-void vhost_user_server_unref(VuServer *server)
+void vhost_user_server_dec_in_flight(VuServer *server)
 {
-server->refcount--;
-if (server->wait_idle && !server->refcount) {
+server->in_flight--;
+if (server->wait_idle && !server->in_flight) {
 aio_co_wake(server->co_trip);
 }
 }
@@ -192,13 +192,13 @@ static coroutine_fn void vu_client_trip(void *opaque)
 /* Keep running */
 }
 
-if (server->refcount) {
+if (server->in_flight) {
 /* Wait for requests to complete before we can unmap the memory */
 server->wait_idle = true;
 qemu_coroutine_yield();
 server->wait_idle = false;
 }
-assert(server->refcount == 0);
+assert(server->in_flight == 0);
 
 vu_deinit(vu_dev);
 
-- 
2.40.1




[PATCH v5 01/21] block: Fix use after free in blockdev_mark_auto_del()

2023-05-04 Thread Stefan Hajnoczi
From: Kevin Wolf 

job_cancel_locked() drops the job list lock temporarily and it may call
aio_poll(). We must assume that the list has changed after this call.
Also, with unlucky timing, it can end up freeing the job during
job_completed_txn_abort_locked(), making the job pointer invalid, too.

For both reasons, we can't just continue at block_job_next_locked(job).
Instead, start at the head of the list again after job_cancel_locked()
and skip those jobs that we already cancelled (or that are completing
anyway).

Signed-off-by: Kevin Wolf 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Stefan Hajnoczi 
Message-Id: <20230503140142.474404-1-kw...@redhat.com>
---
 blockdev.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index d7b5c18f0a..2c1752a403 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -153,12 +153,22 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 
 JOB_LOCK_GUARD();
 
-for (job = block_job_next_locked(NULL); job;
- job = block_job_next_locked(job)) {
-if (block_job_has_bdrv(job, blk_bs(blk))) {
+do {
+job = block_job_next_locked(NULL);
+while (job && (job->job.cancelled ||
+   job->job.deferred_to_main_loop ||
+   !block_job_has_bdrv(job, blk_bs(blk))))
+{
+job = block_job_next_locked(job);
+}
+if (job) {
+/*
+ * This drops the job lock temporarily and polls, so we need to
+ * restart processing the list from the start after this.
+ */
job_cancel_locked(&job->job, false);
 }
-}
+} while (job);
 
 dinfo->auto_del = 1;
 }
-- 
2.40.1




Re: [PATCH 9/9] qemu-file: Account for rate_limit usage on qemu_fflush()

2023-05-04 Thread Juan Quintela
Daniel P. Berrangé  wrote:
> On Thu, May 04, 2023 at 01:38:41PM +0200, Juan Quintela wrote:
>> That is the moment we know we have transferred something.
>> 
>> Signed-off-by: Juan Quintela 
>> ---
>>  migration/qemu-file.c | 7 +++
>>  1 file changed, 3 insertions(+), 4 deletions(-)
>> 
>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> index ddebfac847..309b4c56f4 100644
>> --- a/migration/qemu-file.c
>> +++ b/migration/qemu-file.c
>> @@ -300,7 +300,9 @@ void qemu_fflush(QEMUFile *f)
 &local_error) < 0) {
>>  qemu_file_set_error_obj(f, -EIO, local_error);
>>  } else {
>> -f->total_transferred += iov_size(f->iov, f->iovcnt);
>> +uint64_t size = iov_size(f->iov, f->iovcnt);
>> +qemu_file_acct_rate_limit(f, size);
>> +f->total_transferred += size;
>>  }
>>  
>>  qemu_iovec_release_ram(f);
>> @@ -527,7 +529,6 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t 
>> *buf, size_t size,
>>  return;
>>  }
>>  
>> -f->rate_limit_used += size;
>>  add_to_iovec(f, buf, size, may_free);
>>  }
>>  
>> @@ -545,7 +546,6 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, 
>> size_t size)
>>  l = size;
>>  }
>>  memcpy(f->buf + f->buf_index, buf, l);
>> -f->rate_limit_used += l;
>>  add_buf_to_iovec(f, l);
>>  if (qemu_file_get_error(f)) {
>>  break;
>> @@ -562,7 +562,6 @@ void qemu_put_byte(QEMUFile *f, int v)
>>  }
>>  
>>  f->buf[f->buf_index] = v;
>> -f->rate_limit_used++;
>>  add_buf_to_iovec(f, 1);
>>  }
>
> This has a slight semantic behavioural change.

Yeap.

See the answer to Peter.  But three things came to mind:

a - the size of the buffer is small (between 32KB and 256KB depending
how you count it).  So we are going to call qemu_fflush() really
soon.

b - We are using this value to calculate how much we can send through
the wire.  Here we are saything how much we have accepted to send.

c - When using multifd the number of bytes that we send through the qemu
file is even smaller. migration-test multifd test send 300MB of data
through multifd channels and around 300KB on the qemu_file channel.


>
> By accounting for rate limit in the qemu_put functions, we ensure
> that we stop growing the iovec when rate limiting activates.
>
> If we only apply rate limit in the the flush function, that will
> let the  f->iov continue to accumulate buffers, while we have
> rate limited the actual transfer.

256KB maximum.  Our accounting has bigger errors than that.


> This makes me uneasy - it feels like a bad idea to continue to
> accumulate buffers if we're not ready to send them

I still think that the change is correct.  But as you and Peter have
concerns about it, I will think a bit more about it.

Thanks, Juan.




Re: [PATCH v2 2/2] tests/lcitool: Add mtools and xorriso and remove genisoimage as dependencies

2023-05-04 Thread Thomas Huth

On 04/05/2023 17.46, Ani Sinha wrote:

Bios bits avocado tests need mformat (provided by the mtools package) and
xorriso tools in order to run within gitlab CI containers. Add those
dependencies within the Dockerfiles so that containers can be built with
those tools present and bios bits avocado tests can be run there.

xorriso package conflicts with genisoimage package on some distributions.
Therefore, it is not possible to have both the packages at the same time
in the container image uniformly for all distribution flavors. Further,
on some distributions like RHEL, both xorriso and genisoimage
packages provide /usr/bin/genisoimage and on some other distributions like
Fedora, only genisoimage package provides the same utility.
Therefore, this change removes the dependency on genisoimage for building
container images altogether keeping only xorriso package. At the same time,
cdrom-test.c is updated to use and check for existence of only xorrisofs.

CC: m...@redhat.com
CC: berra...@redhat.com
Signed-off-by: Ani Sinha 
---



Reviewed-by: Thomas Huth 




Re: [PATCH 9/9] qemu-file: Account for rate_limit usage on qemu_fflush()

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:41PM +0200, Juan Quintela wrote:
> That is the moment we know we have transferred something.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/qemu-file.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> index ddebfac847..309b4c56f4 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -300,7 +300,9 @@ void qemu_fflush(QEMUFile *f)
> &local_error) < 0) {
>  qemu_file_set_error_obj(f, -EIO, local_error);
>  } else {
> -f->total_transferred += iov_size(f->iov, f->iovcnt);
> +uint64_t size = iov_size(f->iov, f->iovcnt);
> +qemu_file_acct_rate_limit(f, size);
> +f->total_transferred += size;
>  }
>  
>  qemu_iovec_release_ram(f);
> @@ -527,7 +529,6 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t 
> *buf, size_t size,
>  return;
>  }
>  
> -f->rate_limit_used += size;
>  add_to_iovec(f, buf, size, may_free);
>  }
>  
> @@ -545,7 +546,6 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, 
> size_t size)
>  l = size;
>  }
>  memcpy(f->buf + f->buf_index, buf, l);
> -f->rate_limit_used += l;
>  add_buf_to_iovec(f, l);
>  if (qemu_file_get_error(f)) {
>  break;
> @@ -562,7 +562,6 @@ void qemu_put_byte(QEMUFile *f, int v)
>  }
>  
>  f->buf[f->buf_index] = v;
> -f->rate_limit_used++;
>  add_buf_to_iovec(f, 1);
>  }

This has a slight semantic behavioural change.

By accounting for rate limit in the qemu_put functions, we ensure
that we stop growing the iovec when rate limiting activates.

If we only apply rate limit in the the flush function, that will
let the  f->iov continue to accumulate buffers, while we have
rate limited the actual transfer. This makes me uneasy - it feels
like a bad idea to continue to accumulate buffers if we're not
ready to send them

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 5/9] qemu-file: No need to check for shutdown in qemu_file_rate_limit

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:37PM +0200, Juan Quintela wrote:
> After calling qemu_file_shutdown() we set the error as -EIO if there
> is no another previous error, so no need to check it here.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/qemu-file.c | 3 ---
>  1 file changed, 3 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 8/9] qemu-file: Make ram_control_save_page() use accessors for rate_limit

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:40PM +0200, Juan Quintela wrote:
> Signed-off-by: Juan Quintela 
> ---
>  migration/qemu-file.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 6/9] qemu-file: remove shutdown member

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:38PM +0200, Juan Quintela wrote:
> The first thing that we do after setting the shutdown value is set the
> error as -EIO if there is not a previous error.
> 
> So this value is reduntant.  Just remove it and use

  s/reduntant/redundant/

> qemu_file_get_error() in the places that it was tested.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/qemu-file.c | 10 +++---
>  1 file changed, 3 insertions(+), 7 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 3/9] qemu-file: make qemu_file_[sg]et_rate_limit() use an uint64_t

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:35PM +0200, Juan Quintela wrote:
> It is really size_t.  Everything else uses uint64_t, so move this to
> uint64_t as well.  A size can't be negative anyways.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/migration.c | 6 +++---
>  migration/qemu-file.c | 8 
>  migration/qemu-file.h | 4 ++--
>  3 files changed, 9 insertions(+), 9 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 2/9] migration: qemu_file_total_transferred() function is monotonic

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:34PM +0200, Juan Quintela wrote:
> So delta_bytes can only be greater or equal to zero.  Never negative.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/block.c | 8 +---
>  1 file changed, 1 insertion(+), 7 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 7/9] qemu-file: Make total_transferred an uint64_t

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:39PM +0200, Juan Quintela wrote:
> Change all the functions that use it.  It was already passed as
> uint64_t.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/block.c | 5 ++---
>  migration/qemu-file.c | 8 
>  migration/qemu-file.h | 4 ++--
>  migration/savevm.c| 6 ++
>  migration/vmstate.c   | 2 +-
>  5 files changed, 11 insertions(+), 14 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 4/9] qemu-file: Make rate_limit_used an uint64_t

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:36PM +0200, Juan Quintela wrote:
> Change all the functions that use it.  It was already passed as
> uint64_t.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/qemu-file.c | 4 ++--
>  migration/qemu-file.h | 2 +-
>  2 files changed, 3 insertions(+), 3 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH 1/9] migration: max_postcopy_bandwidth is a size parameter

2023-05-04 Thread Daniel P . Berrangé
On Thu, May 04, 2023 at 01:38:33PM +0200, Juan Quintela wrote:
> So make everything that uses it uint64_t no int64_t.
> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/migration.c | 4 ++--
>  migration/options.c   | 2 +-
>  migration/options.h   | 2 +-
>  3 files changed, 4 insertions(+), 4 deletions(-)

Reviewed-by: Daniel P. Berrangé 

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




[PATCH v2 2/2] tests/lcitool: Add mtools and xorriso and remove genisoimage as dependencies

2023-05-04 Thread Ani Sinha
Bios bits avocado tests need mformat (provided by the mtools package) and
xorriso tools in order to run within gitlab CI containers. Add those
dependencies within the Dockerfiles so that containers can be built with
those tools present and bios bits avocado tests can be run there.

xorriso package conflicts with genisoimage package on some distributions.
Therefore, it is not possible to have both the packages at the same time
in the container image uniformly for all distribution flavors. Further,
on some distributions like RHEL, both xorriso and genisoimage
packages provide /usr/bin/genisoimage and on some other distributions like
Fedora, only genisoimage package provides the same utility.
Therefore, this change removes the dependency on genisoimage for building
container images altogether keeping only xorriso package. At the same time,
cdrom-test.c is updated to use and check for existence of only xorrisofs.

CC: m...@redhat.com
CC: berra...@redhat.com
Signed-off-by: Ani Sinha 
---
 .gitlab-ci.d/cirrus/freebsd-13.vars|  2 +-
 .gitlab-ci.d/cirrus/macos-12.vars  |  2 +-
 tests/docker/dockerfiles/alpine.docker |  3 ++-
 tests/docker/dockerfiles/centos8.docker|  3 ++-
 tests/docker/dockerfiles/debian-amd64-cross.docker |  3 ++-
 tests/docker/dockerfiles/debian-amd64.docker   |  3 ++-
 tests/docker/dockerfiles/debian-arm64-cross.docker |  3 ++-
 tests/docker/dockerfiles/debian-armel-cross.docker |  3 ++-
 tests/docker/dockerfiles/debian-armhf-cross.docker |  3 ++-
 .../dockerfiles/debian-mips64el-cross.docker   |  3 ++-
 .../docker/dockerfiles/debian-mipsel-cross.docker  |  3 ++-
 .../docker/dockerfiles/debian-ppc64el-cross.docker |  3 ++-
 tests/docker/dockerfiles/debian-s390x-cross.docker |  3 ++-
 tests/docker/dockerfiles/fedora-win32-cross.docker |  3 ++-
 tests/docker/dockerfiles/fedora-win64-cross.docker |  3 ++-
 tests/docker/dockerfiles/fedora.docker |  3 ++-
 tests/docker/dockerfiles/opensuse-leap.docker  |  3 ++-
 tests/docker/dockerfiles/ubuntu2004.docker |  3 ++-
 tests/docker/dockerfiles/ubuntu2204.docker |  3 ++-
 tests/lcitool/projects/qemu.yml|  3 ++-
 tests/qtest/cdrom-test.c   | 14 +++---
 21 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/.gitlab-ci.d/cirrus/freebsd-13.vars 
b/.gitlab-ci.d/cirrus/freebsd-13.vars
index 7622c849b2..facb649f5b 100644
--- a/.gitlab-ci.d/cirrus/freebsd-13.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-13.vars
@@ -11,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
 NINJA='/usr/local/bin/ninja'
 PACKAGING_COMMAND='pkg'
 PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bison bzip2 ca_root_nss capstone4 ccache 
cdrkit-genisoimage cmocka ctags curl cyrus-sasl dbus diffutils dtc flex 
fusefs-libs3 gettext git glib gmake gnutls gsed gtk3 json-c libepoxy libffi 
libgcrypt libjpeg-turbo libnfs libslirp libspice-server libssh libtasn1 llvm 
lzo2 meson ncurses nettle ninja opencv pixman pkgconf png py39-numpy 
py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme py39-yaml python3 
rpm2cpio sdl2 sdl2_image snappy sndio socat spice-protocol tesseract usbredir 
virglrenderer vte3 zstd'
+PKGS='alsa-lib bash bison bzip2 ca_root_nss capstone4 ccache cmocka ctags curl 
cyrus-sasl dbus diffutils dtc flex fusefs-libs3 gettext git glib gmake gnutls 
gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs libslirp 
libspice-server libssh libtasn1 llvm lzo2 meson mtools ncurses nettle ninja 
opencv pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx 
py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy sndio 
socat spice-protocol tesseract usbredir virglrenderer vte3 xorriso zstd'
 PYPI_PKGS=''
 PYTHON='/usr/local/bin/python3'
diff --git a/.gitlab-ci.d/cirrus/macos-12.vars 
b/.gitlab-ci.d/cirrus/macos-12.vars
index da6aa6469b..ceb294e153 100644
--- a/.gitlab-ci.d/cirrus/macos-12.vars
+++ b/.gitlab-ci.d/cirrus/macos-12.vars
@@ -11,6 +11,6 @@ MAKE='/opt/homebrew/bin/gmake'
 NINJA='/opt/homebrew/bin/ninja'
 PACKAGING_COMMAND='brew'
 PIP3='/opt/homebrew/bin/pip3'
-PKGS='bash bc bison bzip2 capstone ccache cmocka ctags curl dbus diffutils dtc 
flex gcovr gettext git glib gnu-sed gnutls gtk+3 jemalloc jpeg-turbo json-c 
libepoxy libffi libgcrypt libiscsi libnfs libpng libslirp libssh libtasn1 
libusb llvm lzo make meson ncurses nettle ninja pixman pkg-config python3 
rpm2cpio sdl2 sdl2_image snappy socat sparse spice-protocol tesseract usbredir 
vde vte3 zlib zstd'
+PKGS='bash bc bison bzip2 capstone ccache cmocka ctags curl dbus diffutils dtc 
flex gcovr gettext git glib gnu-sed gnutls gtk+3 jemalloc jpeg-turbo json-c 
libepoxy libffi libgcrypt libiscsi libnfs libpng libslirp libssh libtasn1 
libusb llvm lzo make meson mtools ncurses nettle ninja pixman pkg-config 
python3 rpm2cpio sdl2 sdl2_image snappy socat sparse spice-protocol tesseract 
usbredir vde vte3 xorriso zlib zstd'
 PYPI_PKGS='PyYAML numpy 

Re: [PATCH 0/9] QEMU file cleanups

2023-05-04 Thread Juan Quintela
Peter Xu  wrote:
> On Thu, May 04, 2023 at 04:56:46PM +0200, Juan Quintela wrote:
>> Peter Xu  wrote:
>> > On Thu, May 04, 2023 at 01:38:32PM +0200, Juan Quintela wrote:
>> >> - convince and review code to see that everything is uint64_t.
>> >
>> > One general question to patches regarding this - what's the major benefit
>> > of using uint64_t?
>> >
>> > It doubles the possible numbers to hold, but it's already 64bits so I don't
>> > think it matters a lot.
>> 
>> We were checking for negatives even when that can't be.
>> And we are doing this dance of
>> 
>> int64_t x, y;
>> uint64_t a, b;
>> 
>> x = a;
>> b = y;
>> 
>> This is always confusing and not always right.
>
> Yeah this is confusing, but if anything can go wrong with this I assume we
> could have some bigger problem anyway..
>
>> 
>> > The thing is we're removing some code trying to
>> > detect negative which seems to be still helpful to detect e.g. overflows
>> > (even though I don't think it'll happen).  I just still think it's good to
>> > know when overflow happens, and not sure what I missed on benefits of using
>> > unsigned here.
>> 
>> If you grep through the code, you see that half of the things are
>> int64_t and the other half is uint64_t.  I find it always confusing.
>
> Right, I'm personally curious whether we should just use int64_t always
> unless necessary. :) Another good thing with int64_t is it's also suitable
> for error report when used in retvals.

It is used for sizes.  We don't return errors right now.
And if the type can be negative, we need to check whether the value is
negative, even when no code uses the negative functionality.

> But no strong opinion here, I don't think that's a huge deal for now.
> Having such an alignment on types makes sense to me.

Thanks, Juan.




Re: [PATCH 0/9] QEMU file cleanups

2023-05-04 Thread Peter Xu
On Thu, May 04, 2023 at 04:56:46PM +0200, Juan Quintela wrote:
> Peter Xu  wrote:
> > On Thu, May 04, 2023 at 01:38:32PM +0200, Juan Quintela wrote:
> >> - convince and review code to see that everything is uint64_t.
> >
> > One general question to patches regarding this - what's the major benefit
> > of using uint64_t?
> >
> > It doubles the possible numbers to hold, but it's already 64bits so I don't
> > think it matters a lot.
> 
> We were checking for negatives even when that can't be.
> And we are doing this dance of
> 
> int64_t x, y;
> uint64_t a, b;
> 
> x = a;
> b = y;
> 
> This is always confusing and not always right.

Yeah this is confusing, but if anything can go wrong with this I assume we
could have some bigger problem anyway..

> 
> > The thing is we're removing some code trying to
> > detect negative which seems to be still helpful to detect e.g. overflows
> > (even though I don't think it'll happen).  I just still think it's good to
> > know when overflow happens, and not sure what I missed on benefits of using
> > unsigned here.
> 
> If you grep through the code, you see that half of the things are
> int64_t and the other half is uint64_t.  I find it always confusing.

Right, I'm personally curious whether we should just use int64_t always
unless necessary. :) Another good thing with int64_t is it's also suitable
for error report when used in retvals.

But no strong opinion here, I don't think that's a huge deal for now.
Having such an alignment on types makes sense to me.

Thanks,

-- 
Peter Xu




Re: [PATCH v2 12/20] mirror: Require GRAPH_RDLOCK for accessing a node's parent list

2023-05-04 Thread Kevin Wolf
Am 04.05.2023 um 15:17 hat Eric Blake geschrieben:
> On Thu, May 04, 2023 at 01:57:42PM +0200, Kevin Wolf wrote:
> > This adds GRAPH_RDLOCK annotations to declare that functions accessing
> > the parent list of a node need to hold a reader lock for the graph. As
> > it happens, they already do.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >  block/mirror.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> Reviewed-by: Eric Blake 

Oops, sorry for failing to pick up your R-b for this patch in v1 (seems
I only changed the subject line and forgot about the reviews), and
thanks for renewing it. :-)

Kevin




Re: [PATCH 9/9] qemu-file: Account for rate_limit usage on qemu_fflush()

2023-05-04 Thread Juan Quintela
Peter Xu  wrote:
> On Thu, May 04, 2023 at 01:38:41PM +0200, Juan Quintela wrote:
>> That is the moment we know we have transferred something.
>> 
>> Signed-off-by: Juan Quintela 
>
> There'll be a slight side effect that qemu_file_rate_limit() can be
> triggered later than before because data cached in the qemufile won't be
> accounted until flushed.
>
> Two limits here:
>
> - IO_BUF_SIZE==32K, the real buffer
> - MAX_IOV_SIZE==64 (I think), the async buffer to put guest page ptrs
>   directly, on x86 it's 64*4K=256K
>
> So the impact should be no more than 288KB on x86.  Looks still fine..

This was on purpose.  We are counting data as sent when it has not been
sent yet.

With this change, we account for the data written when we do the real
write.

And yes, I fully agree that with the size that we have it shouldn't make
much of a difference in the speed calculation.

The difference here is that I will move that counter somewhere else, and
the less places that I have to use it the better O:-)

> Reviewed-by: Peter Xu 

Thanks.




Re: [PATCH 0/9] QEMU file cleanups

2023-05-04 Thread Juan Quintela
Peter Xu  wrote:
> On Thu, May 04, 2023 at 01:38:32PM +0200, Juan Quintela wrote:
>> - convince and review code to see that everything is uint64_t.
>
> One general question to patches regarding this - what's the major benefit
> of using uint64_t?
>
> It doubles the possible numbers to hold, but it's already 64bits so I don't
> think it matters a lot.

We were checking for negatives even when that can't be.
And we are doing this dance of

int64_t x, y;
uint64_t a, b;

x = a;
b = y;

This is always confusing and not always right.

> The thing is we're removing some code trying to
> detect negative which seems to be still helpful to detect e.g. overflows
> (even though I don't think it'll happen).  I just still think it's good to
> know when overflow happens, and not sure what I missed on benefits of using
> unsigned here.

If you grep through the code, you see that half of the things are
int64_t and the other half is uint64_t.  I find it always confusing.


> I've reviewed all the rest patches and all look good here.

Thanks very much.




Re: [PATCH 0/9] QEMU file cleanups

2023-05-04 Thread Peter Xu
On Thu, May 04, 2023 at 01:38:32PM +0200, Juan Quintela wrote:
> - convince and review code to see that everything is uint64_t.

One general question to patches regarding this - what's the major benefit
of using uint64_t?

It doubles the possible numbers to hold, but it's already 64bits so I don't
think it matters a lot.  The thing is we're removing some code trying to
detect negative which seems to be still helpful to detect e.g. overflows
(even though I don't think it'll happen).  I just still think it's good to
know when overflow happens, and not sure what I missed on benefits of using
unsigned here.

I've reviewed all the rest patches and all look good here.

Thanks,

-- 
Peter Xu




Re: [PATCH 9/9] qemu-file: Account for rate_limit usage on qemu_fflush()

2023-05-04 Thread Peter Xu
On Thu, May 04, 2023 at 01:38:41PM +0200, Juan Quintela wrote:
> That is the moment we know we have transferred something.
> 
> Signed-off-by: Juan Quintela 

There'll be a slight side effect that qemu_file_rate_limit() can be
triggered later than before because data cached in the qemufile won't be
accounted until flushed.

Two limits here:

- IO_BUF_SIZE==32K, the real buffer
- MAX_IOV_SIZE==64 (I think), the async buffer to put guest page ptrs
  directly, on x86 it's 64*4K=256K

So the impact should be no more than 288KB on x86.  Looks still fine..

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH 8/9] qemu-file: Make ram_control_save_page() use accessors for rate_limit

2023-05-04 Thread Peter Xu
On Thu, May 04, 2023 at 01:38:40PM +0200, Juan Quintela wrote:
> Signed-off-by: Juan Quintela 

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH 6/9] qemu-file: remove shutdown member

2023-05-04 Thread Peter Xu
On Thu, May 04, 2023 at 01:38:38PM +0200, Juan Quintela wrote:
> The first thing that we do after setting the shutdown value is set the
> error as -EIO if there is not a previous error.
> 
> So this value is redundant.  Just remove it and use
> qemu_file_get_error() in the places that it was tested.
> 
> Signed-off-by: Juan Quintela 

Nit: I'd squash this with previous.

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH 5/9] qemu-file: No need to check for shutdown in qemu_file_rate_limit

2023-05-04 Thread Peter Xu
On Thu, May 04, 2023 at 01:38:37PM +0200, Juan Quintela wrote:
> After calling qemu_file_shutdown() we set the error as -EIO if there
> is no other previous error, so no need to check it here.
> 
> Signed-off-by: Juan Quintela 

Reviewed-by: Peter Xu 

-- 
Peter Xu




Re: [PATCH v2 12/20] mirror: Require GRAPH_RDLOCK for accessing a node's parent list

2023-05-04 Thread Eric Blake
On Thu, May 04, 2023 at 01:57:42PM +0200, Kevin Wolf wrote:
> This adds GRAPH_RDLOCK annotations to declare that functions accessing
> the parent list of a node need to hold a reader lock for the graph. As
> it happens, they already do.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/mirror.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Eric Blake 

> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 80fa345071..b5c4ae31f3 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -1416,7 +1416,7 @@ static MirrorOp *coroutine_fn 
> active_write_prepare(MirrorBlockJob *s,
>  return op;
>  }
>  
> -static void coroutine_fn active_write_settle(MirrorOp *op)
> +static void coroutine_fn GRAPH_RDLOCK active_write_settle(MirrorOp *op)
>  {
>  uint64_t start_chunk = op->offset / op->s->granularity;
>  uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
> -- 
> 2.40.1
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v4 3/5] parallels: Add checking and repairing duplicate offsets in BAT

2023-05-04 Thread Alexander Ivanov

Yes, there is a bug. Thank you.

On 4/29/23 00:15, Mike Maslenkin wrote:

There is another issue with host_cluster_index() function.
After this patchset applied `qemu-img check -f parallels  some_disk`
aborts for  empty (just created) disk image.
The problem is that host_cluster_index() returns 0 and then
bitmap_new(0) rises an abort.

For default empty disk s->header->data_off=2048, and
res->image_end_offset = 1048576, so:
static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t 1048576)
{
   off = 1048576  - le32_to_cpu(2048) << 9;
   return 0 / 1048576;
}

Could you check this case?

Regards,
Mike.

On Thu, Apr 27, 2023 at 3:29 PM Alexander Ivanov
 wrote:

Good point. Thank you.

Best regards,
Alexander Ivanov

On 4/26/23 23:56, Mike Maslenkin wrote:

On Mon, Apr 24, 2023 at 12:44 PM Alexander Ivanov
 wrote:

Cluster offsets must be unique among all the BAT entries. Find duplicate
offsets in the BAT and fix it by copying the content of the relevant
cluster to a newly allocated cluster and set the new cluster offset to the
duplicated entry.

Add host_cluster_index() helper to deduplicate the code.

Move parallels_fix_leak() call to parallels_co_check() to fix both types
of leak: real corruption and a leak produced by allocate_clusters()
during deduplication.

Signed-off-by: Alexander Ivanov 
---
   block/parallels.c | 134 --
   1 file changed, 129 insertions(+), 5 deletions(-)

diff --git a/block/parallels.c b/block/parallels.c
index ec89ed894b..3b992e8173 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -136,6 +136,12 @@ static int cluster_remainder(BDRVParallelsState *s, 
int64_t sector_num,
   return MIN(nb_sectors, ret);
   }

+static uint32_t host_cluster_index(BDRVParallelsState *s, int64_t off)
+{
+off -= s->header->data_off << BDRV_SECTOR_BITS;
+return off / s->cluster_size;
+}
+

I guess  there should be:
off -= le32_to_cpu(s->header->data_off) << BDRV_SECTOR_BITS

Regards,
Mike.


--
Best regards,
Alexander Ivanov




[PATCH v2 13/20] block: Mark bdrv_co_get_allocated_file_size() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_co_get_allocated_file_size() need to hold a reader lock for the
graph.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
---
 include/block/block-io.h | 7 +--
 include/block/block_int-common.h | 2 +-
 block.c  | 4 +++-
 block/vmdk.c | 2 +-
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index 5dab88521d..fb2adb31c7 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -84,8 +84,11 @@ int64_t coroutine_mixed_fn bdrv_nb_sectors(BlockDriverState 
*bs);
 int64_t coroutine_fn GRAPH_RDLOCK bdrv_co_getlength(BlockDriverState *bs);
 int64_t co_wrapper_mixed_bdrv_rdlock bdrv_getlength(BlockDriverState *bs);
 
-int64_t coroutine_fn bdrv_co_get_allocated_file_size(BlockDriverState *bs);
-int64_t co_wrapper bdrv_get_allocated_file_size(BlockDriverState *bs);
+int64_t coroutine_fn GRAPH_RDLOCK
+bdrv_co_get_allocated_file_size(BlockDriverState *bs);
+
+int64_t co_wrapper_bdrv_rdlock
+bdrv_get_allocated_file_size(BlockDriverState *bs);
 
 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
BlockDriverState *in_bs, Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 6fb28cd8fa..6e0365d8f2 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -685,7 +685,7 @@ struct BlockDriver {
 int64_t coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_getlength)(
 BlockDriverState *bs);
 
-int64_t coroutine_fn (*bdrv_co_get_allocated_file_size)(
+int64_t coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_get_allocated_file_size)(
 BlockDriverState *bs);
 
 BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
diff --git a/block.c b/block.c
index abec940867..3ccb935950 100644
--- a/block.c
+++ b/block.c
@@ -5750,7 +5750,8 @@ exit:
  * sums the size of all data-bearing children.  (This excludes backing
  * children.)
  */
-static int64_t coroutine_fn bdrv_sum_allocated_file_size(BlockDriverState *bs)
+static int64_t coroutine_fn GRAPH_RDLOCK
+bdrv_sum_allocated_file_size(BlockDriverState *bs)
 {
 BdrvChild *child;
 int64_t child_size, sum = 0;
@@ -5778,6 +5779,7 @@ int64_t coroutine_fn 
bdrv_co_get_allocated_file_size(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
 IO_CODE();
+assert_bdrv_graph_readable();
 
 if (!drv) {
 return -ENOMEDIUM;
diff --git a/block/vmdk.c b/block/vmdk.c
index 11b553ef25..fddbd1c86c 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -2845,7 +2845,7 @@ static void vmdk_close(BlockDriverState *bs)
 error_free(s->migration_blocker);
 }
 
-static int64_t coroutine_fn
+static int64_t coroutine_fn GRAPH_RDLOCK
 vmdk_co_get_allocated_file_size(BlockDriverState *bs)
 {
 int i;
-- 
2.40.1




[PATCH v2 16/20] block: Mark BlockDriver callbacks for amend job GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

This adds GRAPH_RDLOCK annotations to declare that callers of amend
callbacks in BlockDriver need to hold a reader lock for the graph.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int-common.h | 12 ++--
 block/amend.c|  8 +++-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 88ce7f9d9e..37d094796e 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -204,12 +204,13 @@ struct BlockDriver {
  * to allow driver-specific initialization code that requires
  * the BQL, like setting up specific permission flags.
  */
-int (*bdrv_amend_pre_run)(BlockDriverState *bs, Error **errp);
+int GRAPH_RDLOCK_PTR (*bdrv_amend_pre_run)(
+BlockDriverState *bs, Error **errp);
 /*
  * This function is invoked under BQL after .bdrv_co_amend()
  * to allow cleaning up what was done in .bdrv_amend_pre_run().
  */
-void (*bdrv_amend_clean)(BlockDriverState *bs);
+void GRAPH_RDLOCK_PTR (*bdrv_amend_clean)(BlockDriverState *bs);
 
 /*
  * Return true if @to_replace can be replaced by a BDS with the
@@ -463,10 +464,9 @@ struct BlockDriver {
 
 int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
 
-int coroutine_fn (*bdrv_co_amend)(BlockDriverState *bs,
-  BlockdevAmendOptions *opts,
-  bool force,
-  Error **errp);
+int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_amend)(
+BlockDriverState *bs, BlockdevAmendOptions *opts, bool force,
+Error **errp);
 
 /* aio */
 BlockAIOCB * GRAPH_RDLOCK_PTR (*bdrv_aio_preadv)(BlockDriverState *bs,
diff --git a/block/amend.c b/block/amend.c
index bc4bb7b416..53a410247c 100644
--- a/block/amend.c
+++ b/block/amend.c
@@ -46,6 +46,7 @@ static int coroutine_fn blockdev_amend_run(Job *job, Error 
**errp)
 {
 BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);
 int ret;
+GRAPH_RDLOCK_GUARD();
 
 job_progress_set_remaining(&s->common, 1);
 ret = s->bs->drv->bdrv_co_amend(s->bs, s->opts, s->force, errp);
@@ -54,7 +55,8 @@ static int coroutine_fn blockdev_amend_run(Job *job, Error 
**errp)
 return ret;
 }
 
-static int blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
+static int GRAPH_RDLOCK
+blockdev_amend_pre_run(BlockdevAmendJob *s, Error **errp)
 {
 if (s->bs->drv->bdrv_amend_pre_run) {
 return s->bs->drv->bdrv_amend_pre_run(s->bs, errp);
@@ -67,9 +69,11 @@ static void blockdev_amend_free(Job *job)
 {
 BlockdevAmendJob *s = container_of(job, BlockdevAmendJob, common);
 
+bdrv_graph_rdlock_main_loop();
 if (s->bs->drv->bdrv_amend_clean) {
 s->bs->drv->bdrv_amend_clean(s->bs);
 }
+bdrv_graph_rdunlock_main_loop();
 
 bdrv_unref(s->bs);
 }
@@ -93,6 +97,8 @@ void qmp_x_blockdev_amend(const char *job_id,
 BlockDriver *drv = bdrv_find_format(fmt);
 BlockDriverState *bs;
 
+GRAPH_RDLOCK_GUARD_MAINLOOP();
+
 bs = bdrv_lookup_bs(NULL, node_name, errp);
 if (!bs) {
 return;
-- 
2.40.1




[PATCH v2 17/20] block: Mark bdrv_query_bds_stats() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_query_bds_stats() need to hold a reader lock for the graph because
it accesses the children list of a node.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qapi.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/qapi.c b/block/qapi.c
index c84147849d..71f2751257 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -604,8 +604,8 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, 
BlockBackend *blk)
 = bdrv_latency_histogram_stats([BLOCK_ACCT_FLUSH]);
 }
 
-static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
-bool blk_level)
+static BlockStats * GRAPH_RDLOCK
+bdrv_query_bds_stats(BlockDriverState *bs, bool blk_level)
 {
 BdrvChild *parent_child;
 BlockDriverState *filter_or_cow_bs;
@@ -713,6 +713,8 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes,
 BlockBackend *blk;
 BlockDriverState *bs;
 
+GRAPH_RDLOCK_GUARD_MAINLOOP();
+
 /* Just to be safe if query_nodes is not always initialized */
 if (has_query_nodes && query_nodes) {
 for (bs = bdrv_next_node(NULL); bs; bs = bdrv_next_node(bs)) {
-- 
2.40.1




[PATCH v2 11/20] vhdx: Require GRAPH_RDLOCK for accessing a node's parent list

2023-05-04 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that functions accessing
the parent list of a node need to hold a reader lock for the graph. As
it happens, they already do.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/vhdx.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/vhdx.c b/block/vhdx.c
index 00777da91a..b20b1edf11 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1506,8 +1506,9 @@ exit:
  * There are 2 headers, and the highest sequence number will represent
  * the active header
  */
-static int vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
-   uint32_t log_size)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_create_new_headers(BlockBackend *blk, uint64_t image_size,
+uint32_t log_size)
 {
 BlockDriverState *bs = blk_bs(blk);
 BdrvChild *child;
@@ -1897,8 +1898,8 @@ exit:
  *. ~ --- ~  ~  ~ ---.
  *   1MB
  */
-static int coroutine_fn vhdx_co_create(BlockdevCreateOptions *opts,
-   Error **errp)
+static int coroutine_fn GRAPH_RDLOCK
+vhdx_co_create(BlockdevCreateOptions *opts, Error **errp)
 {
 BlockdevCreateOptionsVhdx *vhdx_opts;
 BlockBackend *blk = NULL;
-- 
2.40.1




[PATCH v2 08/20] block: .bdrv_open is non-coroutine and unlocked

2023-05-04 Thread Kevin Wolf
Drivers were a bit confused about whether .bdrv_open can run in a
coroutine and whether or not it holds a graph lock.

It cannot keep a graph lock from the caller across the whole function
because it both changes the graph (requires a writer lock) and does I/O
(requires a reader lock). Therefore, it should take these locks
internally as needed.

The functions used to be called in coroutine context during image
creation. This was buggy for other reasons, and as of commit 32192301,
all block drivers go through no_co_wrappers. So it is not called in
coroutine context any more.

Fix qcow2 and qed to work with the correct assumptions: The graph lock
needs to be taken internally instead of just assuming it's already
there, and the coroutine path is dead code that can be removed.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int-common.h |  8 
 block.c  |  6 +++---
 block/qcow2.c| 15 ++-
 block/qed.c  | 18 --
 4 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 013d419444..6fb28cd8fa 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -236,12 +236,12 @@ struct BlockDriver {
 void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
 void (*bdrv_join_options)(QDict *options, QDict *old_options);
 
-int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
- Error **errp);
+int GRAPH_UNLOCKED_PTR (*bdrv_open)(
+BlockDriverState *bs, QDict *options, int flags, Error **errp);
 
 /* Protocol drivers should implement this instead of bdrv_open */
-int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
-  Error **errp);
+int GRAPH_UNLOCKED_PTR (*bdrv_file_open)(
+BlockDriverState *bs, QDict *options, int flags, Error **errp);
 void (*bdrv_close)(BlockDriverState *bs);
 
 int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_create)(
diff --git a/block.c b/block.c
index 20d5ee0959..abec940867 100644
--- a/block.c
+++ b/block.c
@@ -1610,9 +1610,9 @@ out:
  * bdrv_refresh_total_sectors() which polls when called from non-coroutine
  * context.
  */
-static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
-const char *node_name, QDict *options,
-int open_flags, Error **errp)
+static int no_coroutine_fn GRAPH_UNLOCKED
+bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, const char *node_name,
+ QDict *options, int open_flags, Error **errp)
 {
 Error *local_err = NULL;
 int i, ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index 01742b3ebe..5bde3b8401 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1891,7 +1891,7 @@ static void coroutine_fn qcow2_open_entry(void *opaque)
 QCow2OpenCo *qoc = opaque;
 BDRVQcow2State *s = qoc->bs->opaque;
 
-assume_graph_lock(); /* FIXME */
+GRAPH_RDLOCK_GUARD();
 
 qemu_co_mutex_lock(&s->lock);
 qoc->ret = qcow2_do_open(qoc->bs, qoc->options, qoc->flags, true,
@@ -1920,14 +1920,11 @@ static int qcow2_open(BlockDriverState *bs, QDict 
*options, int flags,
 /* Initialise locks */
 qemu_co_mutex_init(>lock);
 
-if (qemu_in_coroutine()) {
-/* From bdrv_co_create.  */
-qcow2_open_entry();
-} else {
-assert(qemu_get_current_aio_context() == qemu_get_aio_context());
-qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
-BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
-}
+assert(!qemu_in_coroutine());
+assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+qemu_coroutine_enter(qemu_coroutine_create(qcow2_open_entry, &qoc));
+BDRV_POLL_WHILE(bs, qoc.ret == -EINPROGRESS);
+
 return qoc.ret;
 }
 
diff --git a/block/qed.c b/block/qed.c
index aff2a2076e..be9ff0fb34 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -557,11 +557,13 @@ typedef struct QEDOpenCo {
 int ret;
 } QEDOpenCo;
 
-static void coroutine_fn GRAPH_RDLOCK bdrv_qed_open_entry(void *opaque)
+static void coroutine_fn bdrv_qed_open_entry(void *opaque)
 {
 QEDOpenCo *qoc = opaque;
 BDRVQEDState *s = qoc->bs->opaque;
 
+GRAPH_RDLOCK_GUARD();
+
 qemu_co_mutex_lock(&s->table_lock);
 qoc->ret = bdrv_qed_do_open(qoc->bs, qoc->options, qoc->flags, qoc->errp);
 qemu_co_mutex_unlock(&s->table_lock);
@@ -579,21 +581,17 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict 
*options, int flags,
 };
 int ret;
 
-assume_graph_lock(); /* FIXME */
-
 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
 if (ret < 0) {
 return ret;
 }
 
 bdrv_qed_init_state(bs);
-if (qemu_in_coroutine()) {
-bdrv_qed_open_entry(&qoc);
-} else {
-assert(qemu_get_current_aio_context() == 

[PATCH v2 20/20] block: Mark bdrv_refresh_limits() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_refresh_limits() need to hold a reader lock for the graph because
it accesses the children list of a node.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h | 5 -
 include/block/block_int-common.h   | 3 ++-
 block.c| 9 +
 block/io.c | 1 -
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index f234bca0b6..2d93423d35 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -133,7 +133,10 @@ int bdrv_reopen_set_read_only(BlockDriverState *bs, bool 
read_only,
 BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
   const char *backing_file);
 void bdrv_refresh_filename(BlockDriverState *bs);
-void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error 
**errp);
+
+void GRAPH_RDLOCK
+bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp);
+
 int bdrv_commit(BlockDriverState *bs);
 int bdrv_make_empty(BdrvChild *c, Error **errp);
 int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 024ded4fc2..4909876756 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -334,7 +334,8 @@ struct BlockDriver {
 int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
 bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
 
-void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
+void GRAPH_RDLOCK_PTR (*bdrv_refresh_limits)(
+BlockDriverState *bs, Error **errp);
 
 /*
  * Returns 1 if newly created images are guaranteed to contain only
diff --git a/block.c b/block.c
index 1bc766c778..dad9a4fa43 100644
--- a/block.c
+++ b/block.c
@@ -1667,7 +1667,10 @@ bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv, 
const char *node_name,
 return ret;
 }
 
+bdrv_graph_rdlock_main_loop();
 bdrv_refresh_limits(bs, NULL, &local_err);
+bdrv_graph_rdunlock_main_loop();
+
 if (local_err) {
 error_propagate(errp, local_err);
 return -EINVAL;
@@ -3419,7 +3422,9 @@ static int 
bdrv_set_file_or_backing_noperm(BlockDriverState *parent_bs,
 }
 
 out:
+bdrv_graph_rdlock_main_loop();
 bdrv_refresh_limits(parent_bs, tran, NULL);
+bdrv_graph_rdunlock_main_loop();
 
 return 0;
 }
@@ -4917,7 +4922,9 @@ static void bdrv_reopen_commit(BDRVReopenState 
*reopen_state)
 qdict_del(bs->explicit_options, "backing");
 qdict_del(bs->options, "backing");
 
+bdrv_graph_rdlock_main_loop();
 bdrv_refresh_limits(bs, NULL, NULL);
+bdrv_graph_rdunlock_main_loop();
 bdrv_refresh_total_sectors(bs, bs->total_sectors);
 }
 
@@ -5316,7 +5323,9 @@ int bdrv_append(BlockDriverState *bs_new, 
BlockDriverState *bs_top,
 out:
 tran_finalize(tran, ret);
 
+bdrv_graph_rdlock_main_loop();
 bdrv_refresh_limits(bs_top, NULL, NULL);
+bdrv_graph_rdunlock_main_loop();
 
 if (new_context && old_context != new_context) {
 aio_context_release(new_context);
diff --git a/block/io.c b/block/io.c
index 3bf9ef9d87..58557f2f96 100644
--- a/block/io.c
+++ b/block/io.c
@@ -160,7 +160,6 @@ void bdrv_refresh_limits(BlockDriverState *bs, Transaction 
*tran, Error **errp)
 bool have_limits;
 
 GLOBAL_STATE_CODE();
-assume_graph_lock(); /* FIXME */
 
 if (tran) {
 BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
-- 
2.40.1




[PATCH v2 15/20] block: Mark bdrv_co_debug_event() GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_co_debug_event() need to hold a reader lock for the graph.

Unfortunately we cannot use a co_wrapper_bdrv_rdlock (i.e. make the
coroutine wrapper a no_coroutine_fn), because the function is called
(using the BLKDBG_EVENT macro) by mixed functions that run both in
coroutine and non-coroutine context (for example many of the functions
in qcow2-cluster.c and qcow2-refcount.c).

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-io.h | 9 +
 include/block/block_int-common.h | 4 ++--
 block.c  | 2 ++
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index bba7f957e1..1f612ec5bd 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -205,10 +205,11 @@ void *qemu_try_blockalign0(BlockDriverState *bs, size_t 
size);
 void bdrv_enable_copy_on_read(BlockDriverState *bs);
 void bdrv_disable_copy_on_read(BlockDriverState *bs);
 
-void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs,
-  BlkdebugEvent event);
-void co_wrapper_mixed bdrv_debug_event(BlockDriverState *bs,
-   BlkdebugEvent event);
+void coroutine_fn GRAPH_RDLOCK
+bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent event);
+
+void co_wrapper_mixed_bdrv_rdlock
+bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);
 
 #define BLKDBG_EVENT(child, evt) \
 do { \
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index ee77903f72..88ce7f9d9e 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -735,8 +735,8 @@ struct BlockDriver {
 int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_check)(
 BlockDriverState *bs, BdrvCheckResult *result, BdrvCheckMode fix);
 
-void coroutine_fn (*bdrv_co_debug_event)(BlockDriverState *bs,
- BlkdebugEvent event);
+void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_debug_event)(
+BlockDriverState *bs, BlkdebugEvent event);
 
 /* io queue for linux-aio */
 void coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_io_plug)(BlockDriverState 
*bs);
diff --git a/block.c b/block.c
index a6deaf8ad1..1bc766c778 100644
--- a/block.c
+++ b/block.c
@@ -6399,6 +6399,8 @@ BlockStatsSpecific 
*bdrv_get_specific_stats(BlockDriverState *bs)
 void coroutine_fn bdrv_co_debug_event(BlockDriverState *bs, BlkdebugEvent 
event)
 {
 IO_CODE();
+assert_bdrv_graph_readable();
+
 if (!bs || !bs->drv || !bs->drv->bdrv_co_debug_event) {
 return;
 }
-- 
2.40.1




[PATCH v2 07/20] graph-lock: Fix GRAPH_RDLOCK_GUARD*() to be reader lock

2023-05-04 Thread Kevin Wolf
GRAPH_RDLOCK_GUARD() and GRAPH_RDLOCK_GUARD_MAINLOOP() only take a
reader lock for the graph, so the correct annotation for them to use is
TSA_ASSERT_SHARED rather than TSA_ASSERT.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
---
 include/block/graph-lock.h | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h
index f17d1588e7..7574a2de5b 100644
--- a/include/block/graph-lock.h
+++ b/include/block/graph-lock.h
@@ -205,12 +205,12 @@ typedef struct GraphLockable { } GraphLockable;
 #define GML_OBJ_() (&(GraphLockable) { })
 
 /*
- * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the
+ * This is not marked as TSA_ACQUIRE_SHARED() because TSA doesn't understand 
the
  * cleanup attribute and would therefore complain that the graph is never
- * unlocked. TSA_ASSERT() makes sure that the following calls know that we
- * hold the lock while unlocking is left unchecked.
+ * unlocked. TSA_ASSERT_SHARED() makes sure that the following calls know that
+ * we hold the lock while unlocking is left unchecked.
  */
-static inline GraphLockable * TSA_ASSERT(graph_lock) TSA_NO_TSA coroutine_fn
+static inline GraphLockable * TSA_ASSERT_SHARED(graph_lock) TSA_NO_TSA 
coroutine_fn
 graph_lockable_auto_lock(GraphLockable *x)
 {
 bdrv_graph_co_rdlock();
@@ -249,12 +249,12 @@ typedef struct GraphLockableMainloop { } 
GraphLockableMainloop;
 #define GMLML_OBJ_() (&(GraphLockableMainloop) { })
 
 /*
- * This is not marked as TSA_ACQUIRE() because TSA doesn't understand the
+ * This is not marked as TSA_ACQUIRE_SHARED() because TSA doesn't understand 
the
  * cleanup attribute and would therefore complain that the graph is never
- * unlocked. TSA_ASSERT() makes sure that the following calls know that we
- * hold the lock while unlocking is left unchecked.
+ * unlocked. TSA_ASSERT_SHARED() makes sure that the following calls know that
+ * we hold the lock while unlocking is left unchecked.
  */
-static inline GraphLockableMainloop * TSA_ASSERT(graph_lock) TSA_NO_TSA
+static inline GraphLockableMainloop * TSA_ASSERT_SHARED(graph_lock) TSA_NO_TSA
 graph_lockable_auto_lock_mainloop(GraphLockableMainloop *x)
 {
 bdrv_graph_rdlock_main_loop();
-- 
2.40.1




[PATCH v2 01/20] qcow2: Don't call bdrv_getlength() in coroutine_fns

2023-05-04 Thread Kevin Wolf
There is a bdrv_co_getlength() now, which should be used in coroutine
context.

This requires adding GRAPH_RDLOCK to some functions so that this still
compiles with TSA because bdrv_co_getlength() is GRAPH_RDLOCK.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qcow2.h  |  4 +++-
 block/qcow2-refcount.c |  2 +-
 block/qcow2.c  | 19 +--
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index c75decc38a..4f67eb912a 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -895,7 +895,9 @@ int qcow2_change_refcount_order(BlockDriverState *bs, int 
refcount_order,
 void *cb_opaque, Error **errp);
 int coroutine_fn GRAPH_RDLOCK qcow2_shrink_reftable(BlockDriverState *bs);
 int64_t coroutine_fn qcow2_get_last_cluster(BlockDriverState *bs, int64_t 
size);
-int coroutine_fn qcow2_detect_metadata_preallocation(BlockDriverState *bs);
+
+int coroutine_fn GRAPH_RDLOCK
+qcow2_detect_metadata_preallocation(BlockDriverState *bs);
 
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index b2a81ff707..4cf91bd955 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -3715,7 +3715,7 @@ int coroutine_fn 
qcow2_detect_metadata_preallocation(BlockDriverState *bs)
 
 qemu_co_mutex_assert_locked(&s->lock);
 
-file_length = bdrv_getlength(bs->file->bs);
+file_length = bdrv_co_getlength(bs->file->bs);
 if (file_length < 0) {
 return file_length;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index fe5def438e..94cf59af8b 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2089,11 +2089,10 @@ static void qcow2_join_options(QDict *options, QDict 
*old_options)
 }
 }
 
-static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
-  bool want_zero,
-  int64_t offset, int64_t count,
-  int64_t *pnum, int64_t *map,
-  BlockDriverState **file)
+static int coroutine_fn GRAPH_RDLOCK
+qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+  int64_t count, int64_t *pnum, int64_t *map,
+  BlockDriverState **file)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t host_offset;
@@ -3235,7 +3234,7 @@ preallocate_co(BlockDriverState *bs, uint64_t offset, 
uint64_t new_length,
  * all of the allocated clusters (otherwise we get failing reads after
  * EOF). Extend the image to the last allocated sector.
  */
-file_length = bdrv_getlength(s->data_file->bs);
+file_length = bdrv_co_getlength(s->data_file->bs);
 if (file_length < 0) {
 error_setg_errno(errp, -file_length, "Could not get file size");
 ret = file_length;
@@ -4098,7 +4097,7 @@ qcow2_co_copy_range_from(BlockDriverState *bs,
 case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
 case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
 if (bs->backing && bs->backing->bs) {
-int64_t backing_length = bdrv_getlength(bs->backing->bs);
+int64_t backing_length = bdrv_co_getlength(bs->backing->bs);
 if (src_offset >= backing_length) {
 cur_write_flags |= BDRV_REQ_ZERO_WRITE;
 } else {
@@ -4293,7 +4292,7 @@ qcow2_co_truncate(BlockDriverState *bs, int64_t offset, 
bool exact,
 goto fail;
 }
 
-old_file_size = bdrv_getlength(bs->file->bs);
+old_file_size = bdrv_co_getlength(bs->file->bs);
 if (old_file_size < 0) {
 error_setg_errno(errp, -old_file_size,
  "Failed to inquire current file length");
@@ -4386,7 +4385,7 @@ qcow2_co_truncate(BlockDriverState *bs, int64_t offset, 
bool exact,
 break;
 }
 
-old_file_size = bdrv_getlength(bs->file->bs);
+old_file_size = bdrv_co_getlength(bs->file->bs);
 if (old_file_size < 0) {
 error_setg_errno(errp, -old_file_size,
  "Failed to inquire current file length");
@@ -4694,7 +4693,7 @@ qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
  * align end of file to a sector boundary to ease reading with
  * sector based I/Os
  */
-int64_t len = bdrv_getlength(bs->file->bs);
+int64_t len = bdrv_co_getlength(bs->file->bs);
 if (len < 0) {
 return len;
 }
-- 
2.40.1




[PATCH v2 19/20] block: Mark bdrv_recurse_can_replace() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_recurse_can_replace() need to hold a reader lock for the graph
because it accesses the children list of a node.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h | 5 +++--
 include/block/block_int-common.h   | 4 ++--
 include/block/block_int-global-state.h | 4 ++--
 block/blkverify.c  | 5 +++--
 block/mirror.c | 4 
 block/quorum.c | 4 ++--
 blockdev.c | 3 +++
 7 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index ec3ddb17a8..f234bca0b6 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -163,8 +163,9 @@ int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts 
*opts,
Error **errp);
 
 /* check if a named node can be replaced when doing drive-mirror */
-BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
-const char *node_name, Error **errp);
+BlockDriverState * GRAPH_RDLOCK
+check_to_replace_node(BlockDriverState *parent_bs, const char *node_name,
+  Error **errp);
 
 int no_coroutine_fn bdrv_activate(BlockDriverState *bs, Error **errp);
 
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 37d094796e..024ded4fc2 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -217,8 +217,8 @@ struct BlockDriver {
  * same data as @bs without it affecting @bs's behavior (that is,
  * without it being visible to @bs's parents).
  */
-bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
- BlockDriverState *to_replace);
+bool GRAPH_RDLOCK_PTR (*bdrv_recurse_can_replace)(
+BlockDriverState *bs, BlockDriverState *to_replace);
 
 int (*bdrv_probe_device)(const char *filename);
 
diff --git a/include/block/block_int-global-state.h 
b/include/block/block_int-global-state.h
index 902406eb99..da5fb31089 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -225,8 +225,8 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
  */
 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
 
-bool bdrv_recurse_can_replace(BlockDriverState *bs,
-  BlockDriverState *to_replace);
+bool GRAPH_RDLOCK bdrv_recurse_can_replace(BlockDriverState *bs,
+   BlockDriverState *to_replace);
 
 /*
  * Default implementation for BlockDriver.bdrv_child_perm() that can
diff --git a/block/blkverify.c b/block/blkverify.c
index 1c16f86b2e..7326461f30 100644
--- a/block/blkverify.c
+++ b/block/blkverify.c
@@ -265,8 +265,9 @@ static int coroutine_fn GRAPH_RDLOCK 
blkverify_co_flush(BlockDriverState *bs)
 return bdrv_co_flush(s->test_file->bs);
 }
 
-static bool blkverify_recurse_can_replace(BlockDriverState *bs,
-  BlockDriverState *to_replace)
+static bool GRAPH_RDLOCK
+blkverify_recurse_can_replace(BlockDriverState *bs,
+  BlockDriverState *to_replace)
 {
 BDRVBlkverifyState *s = bs->opaque;
 
diff --git a/block/mirror.c b/block/mirror.c
index e48ed0af31..717442ca4d 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -747,7 +747,10 @@ static int mirror_exit_common(Job *job)
  * Cannot use check_to_replace_node() here, because that would
  * check for an op blocker on @to_replace, and we have our own
  * there.
+ *
+ * TODO Pull out the writer lock from bdrv_replace_node() to here
  */
+bdrv_graph_rdlock_main_loop();
 if (bdrv_recurse_can_replace(src, to_replace)) {
 bdrv_replace_node(to_replace, target_bs, _err);
 } else {
@@ -756,6 +759,7 @@ static int mirror_exit_common(Job *job)
"would not lead to an abrupt change of visible data",
to_replace->node_name, target_bs->node_name);
 }
+bdrv_graph_rdunlock_main_loop();
 bdrv_drained_end(target_bs);
 if (local_err) {
 error_report_err(local_err);
diff --git a/block/quorum.c b/block/quorum.c
index ff5a0a2da3..f28758cf2b 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -825,8 +825,8 @@ static coroutine_fn GRAPH_RDLOCK int 
quorum_co_flush(BlockDriverState *bs)
 return result;
 }
 
-static bool quorum_recurse_can_replace(BlockDriverState *bs,
-   BlockDriverState *to_replace)
+static bool GRAPH_RDLOCK
+quorum_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace)
 {
 BDRVQuorumState *s = bs->opaque;
 int i;
diff --git a/blockdev.c 

[PATCH v2 06/20] graph-lock: Add GRAPH_UNLOCKED(_PTR)

2023-05-04 Thread Kevin Wolf
For some functions, it is part of their interface to be called without
holding the graph lock. Add a new macro to document this.

The macro expands to TSA_EXCLUDES(), which is a relatively weak check
because it passes in cases where the compiler just doesn't know if the
lock is held. Function pointers can't be checked at all. Therefore, its
primary purpose is documentation.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/graph-lock.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/block/graph-lock.h b/include/block/graph-lock.h
index ac0fef8605..f17d1588e7 100644
--- a/include/block/graph-lock.h
+++ b/include/block/graph-lock.h
@@ -73,6 +73,7 @@ extern BdrvGraphLock graph_lock;
  */
 #define GRAPH_WRLOCK TSA_REQUIRES(graph_lock)
 #define GRAPH_RDLOCK TSA_REQUIRES_SHARED(graph_lock)
+#define GRAPH_UNLOCKED TSA_EXCLUDES(graph_lock)
 
 /*
  * TSA annotations are not part of function types, so checks are defeated when
@@ -83,6 +84,7 @@ extern BdrvGraphLock graph_lock;
  */
 #define GRAPH_RDLOCK_PTR TSA_GUARDED_BY(graph_lock)
 #define GRAPH_WRLOCK_PTR TSA_GUARDED_BY(graph_lock)
+#define GRAPH_UNLOCKED_PTR
 
 /*
  * register_aiocontext:
-- 
2.40.1




[PATCH v2 18/20] block: Mark bdrv_query_block_graph_info() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_query_block_graph_info() need to hold a reader lock for the graph
because it accesses the children list of a node.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/qapi.h | 7 ---
 qemu-img.c   | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/block/qapi.h b/include/block/qapi.h
index 8773b9b191..18d48ddb70 100644
--- a/include/block/qapi.h
+++ b/include/block/qapi.h
@@ -25,6 +25,7 @@
 #ifndef BLOCK_QAPI_H
 #define BLOCK_QAPI_H
 
+#include "block/graph-lock.h"
 #include "block/snapshot.h"
 #include "qapi/qapi-types-block-core.h"
 
@@ -43,9 +44,9 @@ void bdrv_query_image_info(BlockDriverState *bs,
bool flat,
bool skip_implicit_filters,
Error **errp);
-void bdrv_query_block_graph_info(BlockDriverState *bs,
- BlockGraphInfo **p_info,
- Error **errp);
+void GRAPH_RDLOCK
+bdrv_query_block_graph_info(BlockDriverState *bs, BlockGraphInfo **p_info,
+Error **errp);
 
 void bdrv_snapshot_dump(QEMUSnapshotInfo *sn);
 void bdrv_image_info_specific_dump(ImageInfoSpecific *info_spec,
diff --git a/qemu-img.c b/qemu-img.c
index 9aeac69fa6..9f9f0a7629 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2938,6 +2938,8 @@ static BlockGraphInfoList *collect_image_info_list(bool 
image_opts,
 }
 bs = blk_bs(blk);
 
+GRAPH_RDLOCK_GUARD_MAINLOOP();
+
 /*
  * Note that the returned BlockGraphInfo object will not have
  * information about this image's backing node, because we have opened
-- 
2.40.1




[PATCH v2 14/20] block: Mark bdrv_co_get_info() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

This adds GRAPH_RDLOCK annotations to declare that callers of
bdrv_co_get_info() need to hold a reader lock for the graph.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-io.h |  7 +--
 include/block/block_int-common.h |  4 ++--
 block.c  |  2 ++
 block/crypto.c   |  2 +-
 block/io.c   | 11 +--
 block/mirror.c   |  8 ++--
 block/raw-format.c   |  2 +-
 7 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/include/block/block-io.h b/include/block/block-io.h
index fb2adb31c7..bba7f957e1 100644
--- a/include/block/block-io.h
+++ b/include/block/block-io.h
@@ -167,8 +167,11 @@ const char *bdrv_get_node_name(const BlockDriverState *bs);
 const char *bdrv_get_device_name(const BlockDriverState *bs);
 const char *bdrv_get_device_or_node_name(const BlockDriverState *bs);
 
-int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
-int co_wrapper_mixed bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+int coroutine_fn GRAPH_RDLOCK
+bdrv_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+
+int co_wrapper_mixed_bdrv_rdlock
+bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
 
 ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
   Error **errp);
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 6e0365d8f2..ee77903f72 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -699,8 +699,8 @@ struct BlockDriver {
 BlockDriverState *bs, int64_t offset, int64_t bytes,
 QEMUIOVector *qiov, size_t qiov_offset);
 
-int coroutine_fn (*bdrv_co_get_info)(BlockDriverState *bs,
- BlockDriverInfo *bdi);
+int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_get_info)(
+BlockDriverState *bs, BlockDriverInfo *bdi);
 
 ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs,
  Error **errp);
diff --git a/block.c b/block.c
index 3ccb935950..a6deaf8ad1 100644
--- a/block.c
+++ b/block.c
@@ -6349,6 +6349,8 @@ int coroutine_fn bdrv_co_get_info(BlockDriverState *bs, 
BlockDriverInfo *bdi)
 int ret;
 BlockDriver *drv = bs->drv;
 IO_CODE();
+assert_bdrv_graph_readable();
+
 /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
 if (!drv) {
 return -ENOMEDIUM;
diff --git a/block/crypto.c b/block/crypto.c
index 8fd3ad0054..30093cff9b 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -736,7 +736,7 @@ fail:
 return ret;
 }
 
-static int coroutine_fn
+static int coroutine_fn GRAPH_RDLOCK
 block_crypto_co_get_info_luks(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
 BlockDriverInfo subbdi;
diff --git a/block/io.c b/block/io.c
index 6fa1993374..3bf9ef9d87 100644
--- a/block/io.c
+++ b/block/io.c
@@ -727,10 +727,9 @@ BdrvTrackedRequest *coroutine_fn 
bdrv_co_get_self_request(BlockDriverState *bs)
 /**
  * Round a region to cluster boundaries
  */
-void coroutine_fn bdrv_round_to_clusters(BlockDriverState *bs,
-int64_t offset, int64_t bytes,
-int64_t *cluster_offset,
-int64_t *cluster_bytes)
+void coroutine_fn GRAPH_RDLOCK
+bdrv_round_to_clusters(BlockDriverState *bs, int64_t offset, int64_t bytes,
+   int64_t *cluster_offset, int64_t *cluster_bytes)
 {
 BlockDriverInfo bdi;
 IO_CODE();
@@ -744,7 +743,7 @@ void coroutine_fn bdrv_round_to_clusters(BlockDriverState 
*bs,
 }
 }
 
-static coroutine_fn int bdrv_get_cluster_size(BlockDriverState *bs)
+static int coroutine_fn GRAPH_RDLOCK bdrv_get_cluster_size(BlockDriverState 
*bs)
 {
 BlockDriverInfo bdi;
 int ret;
@@ -1800,7 +1799,7 @@ fail:
 return ret;
 }
 
-static inline int coroutine_fn
+static inline int coroutine_fn GRAPH_RDLOCK
 bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
   BdrvTrackedRequest *req, int flags)
 {
diff --git a/block/mirror.c b/block/mirror.c
index b5c4ae31f3..e48ed0af31 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -576,8 +576,10 @@ static uint64_t coroutine_fn 
mirror_iteration(MirrorBlockJob *s)
 } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
 int64_t target_offset;
 int64_t target_bytes;
-bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
-   &target_offset, &target_bytes);
+WITH_GRAPH_RDLOCK_GUARD() {
+bdrv_round_to_clusters(blk_bs(s->target), offset, io_bytes,
+   &target_offset, &target_bytes);
+}
 if (target_offset == offset &&

[PATCH v2 04/20] block: Don't call no_coroutine_fns in qmp_block_resize()

2023-05-04 Thread Kevin Wolf
This QMP handler runs in a coroutine, so it must use the corresponding
no_co_wrappers instead.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 blockdev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 2c1752a403..e464daea58 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2440,7 +2440,7 @@ void coroutine_fn qmp_block_resize(const char *device, 
const char *node_name,
 return;
 }
 
-blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp);
+blk = blk_co_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL, errp);
 if (!blk) {
 return;
 }
@@ -2455,7 +2455,7 @@ void coroutine_fn qmp_block_resize(const char *device, 
const char *node_name,
 
 bdrv_co_lock(bs);
 bdrv_drained_end(bs);
-blk_unref(blk);
+blk_co_unref(blk);
 bdrv_co_unlock(bs);
 }
 
-- 
2.40.1




[PATCH v2 12/20] mirror: Require GRAPH_RDLOCK for accessing a node's parent list

2023-05-04 Thread Kevin Wolf
This adds GRAPH_RDLOCK annotations to declare that functions accessing
the parent list of a node need to hold a reader lock for the graph. As
it happens, they already do.

Signed-off-by: Kevin Wolf 
---
 block/mirror.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 80fa345071..b5c4ae31f3 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1416,7 +1416,7 @@ static MirrorOp *coroutine_fn 
active_write_prepare(MirrorBlockJob *s,
 return op;
 }
 
-static void coroutine_fn active_write_settle(MirrorOp *op)
+static void coroutine_fn GRAPH_RDLOCK active_write_settle(MirrorOp *op)
 {
 uint64_t start_chunk = op->offset / op->s->granularity;
 uint64_t end_chunk = DIV_ROUND_UP(op->offset + op->bytes,
-- 
2.40.1




[PATCH v2 10/20] nbd: Mark nbd_co_do_establish_connection() and callers GRAPH_RDLOCK

2023-05-04 Thread Kevin Wolf
From: Emanuele Giuseppe Esposito 

This adds GRAPH_RDLOCK annotations to declare that callers of
nbd_co_do_establish_connection() need to hold a reader lock for the
graph.

Signed-off-by: Emanuele Giuseppe Esposito 
Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/coroutines.h |  5 +++--
 block/nbd.c| 39 +--
 2 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/block/coroutines.h b/block/coroutines.h
index dd9f3d449b..f3226682d6 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -61,7 +61,7 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos);
 int coroutine_fn GRAPH_RDLOCK
 bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
 
-int coroutine_fn
+int coroutine_fn GRAPH_RDLOCK
 nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking,
Error **errp);
 
@@ -85,7 +85,8 @@ bdrv_common_block_status_above(BlockDriverState *bs,
int64_t *map,
BlockDriverState **file,
int *depth);
-int co_wrapper_mixed
+
+int co_wrapper_mixed_bdrv_rdlock
 nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp);
 
 #endif /* BLOCK_COROUTINES_H */
diff --git a/block/nbd.c b/block/nbd.c
index d3ee256844..a3f8f8a9d5 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -322,6 +322,7 @@ int coroutine_fn 
nbd_co_do_establish_connection(BlockDriverState *bs,
 int ret;
 IO_CODE();
 
+assert_bdrv_graph_readable();
 assert(!s->ioc);
 
 s->ioc = nbd_co_establish_connection(s->conn, &s->info, blocking, errp);
@@ -369,7 +370,7 @@ static bool nbd_client_connecting(BDRVNBDState *s)
 }
 
 /* Called with s->requests_lock taken.  */
-static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
+static void coroutine_fn GRAPH_RDLOCK nbd_reconnect_attempt(BDRVNBDState *s)
 {
 int ret;
 bool blocking = s->state == NBD_CLIENT_CONNECTING_WAIT;
@@ -480,9 +481,9 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState 
*s, uint64_t handle)
 }
 }
 
-static int coroutine_fn nbd_co_send_request(BlockDriverState *bs,
-NBDRequest *request,
-QEMUIOVector *qiov)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_co_send_request(BlockDriverState *bs, NBDRequest *request,
+QEMUIOVector *qiov)
 {
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 int rc, i = -1;
@@ -1171,8 +1172,9 @@ static int coroutine_fn 
nbd_co_receive_blockstatus_reply(BDRVNBDState *s,
 return iter.ret;
 }
 
-static int coroutine_fn nbd_co_request(BlockDriverState *bs, NBDRequest 
*request,
-   QEMUIOVector *write_qiov)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_co_request(BlockDriverState *bs, NBDRequest *request,
+   QEMUIOVector *write_qiov)
 {
 int ret, request_ret;
 Error *local_err = NULL;
@@ -1208,9 +1210,9 @@ static int coroutine_fn nbd_co_request(BlockDriverState 
*bs, NBDRequest *request
 return ret ? ret : request_ret;
 }
 
-static int coroutine_fn nbd_client_co_preadv(BlockDriverState *bs, int64_t 
offset,
- int64_t bytes, QEMUIOVector *qiov,
- BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
 int ret, request_ret;
 Error *local_err = NULL;
@@ -1266,9 +1268,9 @@ static int coroutine_fn 
nbd_client_co_preadv(BlockDriverState *bs, int64_t offse
 return ret ? ret : request_ret;
 }
 
-static int coroutine_fn nbd_client_co_pwritev(BlockDriverState *bs, int64_t 
offset,
-  int64_t bytes, QEMUIOVector 
*qiov,
-  BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pwritev(BlockDriverState *bs, int64_t offset, int64_t bytes,
+  QEMUIOVector *qiov, BdrvRequestFlags flags)
 {
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 NBDRequest request = {
@@ -1291,8 +1293,9 @@ static int coroutine_fn 
nbd_client_co_pwritev(BlockDriverState *bs, int64_t offs
 return nbd_co_request(bs, &request, qiov);
 }
 
-static int coroutine_fn nbd_client_co_pwrite_zeroes(BlockDriverState *bs, 
int64_t offset,
-int64_t bytes, 
BdrvRequestFlags flags)
+static int coroutine_fn GRAPH_RDLOCK
+nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t 
bytes,
+BdrvRequestFlags flags)
 {
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 NBDRequest request = {
@@ -1326,7 +1329,7 @@ static int coroutine_fn 

[PATCH v2 05/20] test-bdrv-drain: Don't modify the graph in coroutines

2023-05-04 Thread Kevin Wolf
test-bdrv-drain contains a few test cases that are run both in coroutine
and non-coroutine context. Running the entire code including the setup
and shutdown in coroutines is incorrect because graph modifications can
generally not happen in coroutines.

Change the test so that creating and destroying the test nodes and
BlockBackends always happens outside of coroutine context.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
---
 tests/unit/test-bdrv-drain.c | 112 +++
 1 file changed, 75 insertions(+), 37 deletions(-)

diff --git a/tests/unit/test-bdrv-drain.c b/tests/unit/test-bdrv-drain.c
index d9d3807062..9a4c5e59d6 100644
--- a/tests/unit/test-bdrv-drain.c
+++ b/tests/unit/test-bdrv-drain.c
@@ -188,6 +188,25 @@ static void do_drain_begin_unlocked(enum drain_type 
drain_type, BlockDriverState
 }
 }
 
+static BlockBackend * no_coroutine_fn test_setup(void)
+{
+BlockBackend *blk;
+BlockDriverState *bs, *backing;
+
+blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
+bs = bdrv_new_open_driver(_test, "test-node", BDRV_O_RDWR,
+  _abort);
+blk_insert_bs(blk, bs, _abort);
+
+backing = bdrv_new_open_driver(_test, "backing", 0, _abort);
+bdrv_set_backing_hd(bs, backing, _abort);
+
+bdrv_unref(backing);
+bdrv_unref(bs);
+
+return blk;
+}
+
 static void do_drain_end_unlocked(enum drain_type drain_type, BlockDriverState 
*bs)
 {
 if (drain_type != BDRV_DRAIN_ALL) {
@@ -199,25 +218,19 @@ static void do_drain_end_unlocked(enum drain_type 
drain_type, BlockDriverState *
 }
 }
 
-static void test_drv_cb_common(enum drain_type drain_type, bool recursive)
+static void test_drv_cb_common(BlockBackend *blk, enum drain_type drain_type,
+   bool recursive)
 {
-BlockBackend *blk;
-BlockDriverState *bs, *backing;
+BlockDriverState *bs = blk_bs(blk);
+BlockDriverState *backing = bs->backing->bs;
 BDRVTestState *s, *backing_s;
 BlockAIOCB *acb;
 int aio_ret;
 
 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, 0);
 
-blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
-bs = bdrv_new_open_driver(_test, "test-node", BDRV_O_RDWR,
-  _abort);
 s = bs->opaque;
-blk_insert_bs(blk, bs, _abort);
-
-backing = bdrv_new_open_driver(_test, "backing", 0, _abort);
 backing_s = backing->opaque;
-bdrv_set_backing_hd(bs, backing, _abort);
 
 /* Simple bdrv_drain_all_begin/end pair, check that CBs are called */
 g_assert_cmpint(s->drain_count, ==, 0);
@@ -252,44 +265,53 @@ static void test_drv_cb_common(enum drain_type 
drain_type, bool recursive)
 
 g_assert_cmpint(s->drain_count, ==, 0);
 g_assert_cmpint(backing_s->drain_count, ==, 0);
-
-bdrv_unref(backing);
-bdrv_unref(bs);
-blk_unref(blk);
 }
 
 static void test_drv_cb_drain_all(void)
 {
-test_drv_cb_common(BDRV_DRAIN_ALL, true);
+BlockBackend *blk = test_setup();
+test_drv_cb_common(blk, BDRV_DRAIN_ALL, true);
+blk_unref(blk);
 }
 
 static void test_drv_cb_drain(void)
 {
-test_drv_cb_common(BDRV_DRAIN, false);
+BlockBackend *blk = test_setup();
+test_drv_cb_common(blk, BDRV_DRAIN, false);
+blk_unref(blk);
+}
+
+static void coroutine_fn test_drv_cb_co_drain_all_entry(void)
+{
+BlockBackend *blk = blk_all_next(NULL);
+test_drv_cb_common(blk, BDRV_DRAIN_ALL, true);
 }
 
 static void test_drv_cb_co_drain_all(void)
 {
-call_in_coroutine(test_drv_cb_drain_all);
+BlockBackend *blk = test_setup();
+call_in_coroutine(test_drv_cb_co_drain_all_entry);
+blk_unref(blk);
 }
 
-static void test_drv_cb_co_drain(void)
+static void coroutine_fn test_drv_cb_co_drain_entry(void)
 {
-call_in_coroutine(test_drv_cb_drain);
+BlockBackend *blk = blk_all_next(NULL);
+test_drv_cb_common(blk, BDRV_DRAIN, false);
 }
 
-static void test_quiesce_common(enum drain_type drain_type, bool recursive)
+static void test_drv_cb_co_drain(void)
 {
-BlockBackend *blk;
-BlockDriverState *bs, *backing;
-
-blk = blk_new(qemu_get_aio_context(), BLK_PERM_ALL, BLK_PERM_ALL);
-bs = bdrv_new_open_driver(_test, "test-node", BDRV_O_RDWR,
-  _abort);
-blk_insert_bs(blk, bs, _abort);
+BlockBackend *blk = test_setup();
+call_in_coroutine(test_drv_cb_co_drain_entry);
+blk_unref(blk);
+}
 
-backing = bdrv_new_open_driver(_test, "backing", 0, _abort);
-bdrv_set_backing_hd(bs, backing, _abort);
+static void test_quiesce_common(BlockBackend *blk, enum drain_type drain_type,
+bool recursive)
+{
+BlockDriverState *bs = blk_bs(blk);
+BlockDriverState *backing = bs->backing->bs;
 
 g_assert_cmpint(bs->quiesce_counter, ==, 0);
 g_assert_cmpint(backing->quiesce_counter, ==, 0);
@@ -307,30 +329,46 @@ static void test_quiesce_common(enum drain_type 
drain_type, bool recursive)
 
 

[PATCH v2 09/20] nbd: Remove nbd_co_flush() wrapper function

2023-05-04 Thread Kevin Wolf
The only thing nbd_co_flush() does is call nbd_client_co_flush(). Just
use that function directly in the BlockDriver definitions and remove the
wrapper.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/nbd.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index bf2894ad5c..d3ee256844 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -1920,11 +1920,6 @@ fail:
 return ret;
 }
 
-static int coroutine_fn nbd_co_flush(BlockDriverState *bs)
-{
-return nbd_client_co_flush(bs);
-}
-
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
@@ -2120,7 +2115,7 @@ static BlockDriver bdrv_nbd = {
 .bdrv_co_pwritev= nbd_client_co_pwritev,
 .bdrv_co_pwrite_zeroes  = nbd_client_co_pwrite_zeroes,
 .bdrv_close = nbd_close,
-.bdrv_co_flush_to_os= nbd_co_flush,
+.bdrv_co_flush_to_os= nbd_client_co_flush,
 .bdrv_co_pdiscard   = nbd_client_co_pdiscard,
 .bdrv_refresh_limits= nbd_refresh_limits,
 .bdrv_co_truncate   = nbd_co_truncate,
@@ -2148,7 +2143,7 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_co_pwritev= nbd_client_co_pwritev,
 .bdrv_co_pwrite_zeroes  = nbd_client_co_pwrite_zeroes,
 .bdrv_close = nbd_close,
-.bdrv_co_flush_to_os= nbd_co_flush,
+.bdrv_co_flush_to_os= nbd_client_co_flush,
 .bdrv_co_pdiscard   = nbd_client_co_pdiscard,
 .bdrv_refresh_limits= nbd_refresh_limits,
 .bdrv_co_truncate   = nbd_co_truncate,
@@ -2176,7 +2171,7 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_co_pwritev= nbd_client_co_pwritev,
 .bdrv_co_pwrite_zeroes  = nbd_client_co_pwrite_zeroes,
 .bdrv_close = nbd_close,
-.bdrv_co_flush_to_os= nbd_co_flush,
+.bdrv_co_flush_to_os= nbd_client_co_flush,
 .bdrv_co_pdiscard   = nbd_client_co_pdiscard,
 .bdrv_refresh_limits= nbd_refresh_limits,
 .bdrv_co_truncate   = nbd_co_truncate,
-- 
2.40.1




[PATCH v2 00/20] Graph locking, part 3 (more block drivers)

2023-05-04 Thread Kevin Wolf
The first few patches in this series fix coroutine correctness problems
that have existed for a while, but only actually start to make things
fail in practice with stricter checks that we're going to introduce with
the graph locking work.

The rest of the series makes sure that the graph lock is held in more
block driver functions that access the children or parents list of a
block node. This is incremental work and more patches are yet to come.

v2:
- Rebased on current git master
- Improved some commit messages
- Patch 5: Added missing coroutine_fn annotations in test
- Patch 7: Updated comments, too

I didn't remove the assertion in patch 13 yet which Stefan was
questioning. I can remove it while applying if we decide that we really
don't want it.

Emanuele Giuseppe Esposito (5):
  nbd: Mark nbd_co_do_establish_connection() and callers GRAPH_RDLOCK
  block: Mark bdrv_co_get_allocated_file_size() and callers GRAPH_RDLOCK
  block: Mark bdrv_co_get_info() and callers GRAPH_RDLOCK
  block: Mark bdrv_co_debug_event() GRAPH_RDLOCK
  block: Mark BlockDriver callbacks for amend job GRAPH_RDLOCK

Kevin Wolf (15):
  qcow2: Don't call bdrv_getlength() in coroutine_fns
  block: Consistently call bdrv_activate() outside coroutine
  block: bdrv/blk_co_unref() for calls in coroutine context
  block: Don't call no_coroutine_fns in qmp_block_resize()
  test-bdrv-drain: Don't modify the graph in coroutines
  graph-lock: Add GRAPH_UNLOCKED(_PTR)
  graph-lock: Fix GRAPH_RDLOCK_GUARD*() to be reader lock
  block: .bdrv_open is non-coroutine and unlocked
  nbd: Remove nbd_co_flush() wrapper function
  vhdx: Require GRAPH_RDLOCK for accessing a node's parent list
  mirror: Require GRAPH_RDLOCK for accessing a node's parent list
  block: Mark bdrv_query_bds_stats() and callers GRAPH_RDLOCK
  block: Mark bdrv_query_block_graph_info() and callers GRAPH_RDLOCK
  block: Mark bdrv_recurse_can_replace() and callers GRAPH_RDLOCK
  block: Mark bdrv_refresh_limits() and callers GRAPH_RDLOCK

 block/coroutines.h  |   5 +-
 block/qcow2.h   |   4 +-
 include/block/block-global-state.h  |  19 +++-
 include/block/block-io.h|  23 ++--
 include/block/block_int-common.h|  37 +++
 include/block/block_int-global-state.h  |   4 +-
 include/block/graph-lock.h  |  18 ++--
 include/block/qapi.h|   7 +-
 include/sysemu/block-backend-global-state.h |   5 +-
 block.c |  25 -
 block/amend.c   |   8 +-
 block/blkverify.c   |   5 +-
 block/block-backend.c   |  10 +-
 block/crypto.c  |   8 +-
 block/io.c  |  12 +--
 block/mirror.c  |  14 ++-
 block/nbd.c |  50 +
 block/parallels.c   |   6 +-
 block/qapi.c|   6 +-
 block/qcow.c|   6 +-
 block/qcow2-refcount.c  |   2 +-
 block/qcow2.c   |  48 -
 block/qed.c |  24 ++---
 block/quorum.c  |   4 +-
 block/raw-format.c  |   2 +-
 block/vdi.c |   6 +-
 block/vhdx.c|  15 +--
 block/vmdk.c|  20 ++--
 block/vpc.c |   6 +-
 blockdev.c  |   7 +-
 qemu-img.c  |   2 +
 tests/unit/test-bdrv-drain.c| 112 +---
 32 files changed, 310 insertions(+), 210 deletions(-)

-- 
2.40.1




[PATCH v2 02/20] block: Consistently call bdrv_activate() outside coroutine

2023-05-04 Thread Kevin Wolf
Migration code can call bdrv_activate() in coroutine context, whereas
other callers call it outside of coroutines. As it calls other code that
is not supposed to run in coroutines, standardise on running outside of
coroutines.

This adds a no_co_wrapper to switch to the main loop before calling
bdrv_activate().

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h |  6 +-
 block/block-backend.c  | 10 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index 399200a9a3..2c312cc774 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -166,7 +166,11 @@ int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts 
*opts,
 BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
 const char *node_name, Error **errp);
 
-int bdrv_activate(BlockDriverState *bs, Error **errp);
+int no_coroutine_fn bdrv_activate(BlockDriverState *bs, Error **errp);
+
+int coroutine_fn no_co_wrapper
+bdrv_co_activate(BlockDriverState *bs, Error **errp);
+
 void bdrv_activate_all(Error **errp);
 int bdrv_inactivate_all(void);
 
diff --git a/block/block-backend.c b/block/block-backend.c
index fc530ded6a..e37d55d3e9 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2024,7 +2024,15 @@ void blk_activate(BlockBackend *blk, Error **errp)
 return;
 }
 
-bdrv_activate(bs, errp);
+/*
+ * Migration code can call this function in coroutine context, so leave
+ * coroutine context if necessary.
+ */
+if (qemu_in_coroutine()) {
+bdrv_co_activate(bs, errp);
+} else {
+bdrv_activate(bs, errp);
+}
 }
 
 bool coroutine_fn blk_co_is_inserted(BlockBackend *blk)
-- 
2.40.1




[PATCH v2 03/20] block: bdrv/blk_co_unref() for calls in coroutine context

2023-05-04 Thread Kevin Wolf
These functions must not be called in coroutine context, because they
need write access to the graph.

Signed-off-by: Kevin Wolf 
Reviewed-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block-global-state.h  |  3 ++-
 include/sysemu/block-backend-global-state.h |  5 -
 block.c |  2 +-
 block/crypto.c  |  6 +++---
 block/parallels.c   |  6 +++---
 block/qcow.c|  6 +++---
 block/qcow2.c   | 14 +++---
 block/qed.c |  6 +++---
 block/vdi.c |  6 +++---
 block/vhdx.c|  6 +++---
 block/vmdk.c| 18 +-
 block/vpc.c |  6 +++---
 12 files changed, 44 insertions(+), 40 deletions(-)

diff --git a/include/block/block-global-state.h 
b/include/block/block-global-state.h
index 2c312cc774..ec3ddb17a8 100644
--- a/include/block/block-global-state.h
+++ b/include/block/block-global-state.h
@@ -218,7 +218,8 @@ void bdrv_img_create(const char *filename, const char *fmt,
  bool quiet, Error **errp);
 
 void bdrv_ref(BlockDriverState *bs);
-void bdrv_unref(BlockDriverState *bs);
+void no_coroutine_fn bdrv_unref(BlockDriverState *bs);
+void coroutine_fn no_co_wrapper bdrv_co_unref(BlockDriverState *bs);
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
 BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
  BlockDriverState *child_bs,
diff --git a/include/sysemu/block-backend-global-state.h 
b/include/sysemu/block-backend-global-state.h
index 2b6d27db7c..fa83f9389c 100644
--- a/include/sysemu/block-backend-global-state.h
+++ b/include/sysemu/block-backend-global-state.h
@@ -42,7 +42,10 @@ blk_co_new_open(const char *filename, const char *reference, 
QDict *options,
 
 int blk_get_refcnt(BlockBackend *blk);
 void blk_ref(BlockBackend *blk);
-void blk_unref(BlockBackend *blk);
+
+void no_coroutine_fn blk_unref(BlockBackend *blk);
+void coroutine_fn no_co_wrapper blk_co_unref(BlockBackend *blk);
+
 void blk_remove_all_bs(void);
 BlockBackend *blk_by_name(const char *name);
 BlockBackend *blk_next(BlockBackend *blk);
diff --git a/block.c b/block.c
index 5ec1a3897e..20d5ee0959 100644
--- a/block.c
+++ b/block.c
@@ -680,7 +680,7 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver 
*drv,
 
 ret = 0;
 out:
-blk_unref(blk);
+blk_co_unref(blk);
 return ret;
 }
 
diff --git a/block/crypto.c b/block/crypto.c
index ca67289187..8fd3ad0054 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -355,7 +355,7 @@ block_crypto_co_create_generic(BlockDriverState *bs, 
int64_t size,
 ret = 0;
  cleanup:
 qcrypto_block_free(crypto);
-blk_unref(blk);
+blk_co_unref(blk);
 return ret;
 }
 
@@ -661,7 +661,7 @@ block_crypto_co_create_luks(BlockdevCreateOptions 
*create_options, Error **errp)
 
 ret = 0;
 fail:
-bdrv_unref(bs);
+bdrv_co_unref(bs);
 return ret;
 }
 
@@ -730,7 +730,7 @@ fail:
 bdrv_co_delete_file_noerr(bs);
 }
 
-bdrv_unref(bs);
+bdrv_co_unref(bs);
 qapi_free_QCryptoBlockCreateOptions(create_opts);
 qobject_unref(cryptoopts);
 return ret;
diff --git a/block/parallels.c b/block/parallels.c
index 013684801a..b49c35929e 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -613,8 +613,8 @@ static int coroutine_fn 
parallels_co_create(BlockdevCreateOptions* opts,
 
 ret = 0;
 out:
-blk_unref(blk);
-bdrv_unref(bs);
+blk_co_unref(blk);
+bdrv_co_unref(bs);
 return ret;
 
 exit:
@@ -691,7 +691,7 @@ parallels_co_create_opts(BlockDriver *drv, const char 
*filename,
 
 done:
 qobject_unref(qdict);
-bdrv_unref(bs);
+bdrv_co_unref(bs);
 qapi_free_BlockdevCreateOptions(create_options);
 return ret;
 }
diff --git a/block/qcow.c b/block/qcow.c
index 490e4f819e..a0c701f578 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -915,8 +915,8 @@ static int coroutine_fn 
qcow_co_create(BlockdevCreateOptions *opts,
 g_free(tmp);
 ret = 0;
 exit:
-blk_unref(qcow_blk);
-bdrv_unref(bs);
+blk_co_unref(qcow_blk);
+bdrv_co_unref(bs);
 qcrypto_block_free(crypto);
 return ret;
 }
@@ -1015,7 +1015,7 @@ qcow_co_create_opts(BlockDriver *drv, const char 
*filename,
 fail:
 g_free(backing_fmt);
 qobject_unref(qdict);
-bdrv_unref(bs);
+bdrv_co_unref(bs);
 qapi_free_BlockdevCreateOptions(create_options);
 return ret;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index 94cf59af8b..01742b3ebe 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3705,7 +3705,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, 
Error **errp)
 goto out;
 }
 
-blk_unref(blk);
+blk_co_unref(blk);
 blk = NULL;
 
 /*
@@ -3785,7 +3785,7 @@ 

[PATCH 7/9] qemu-file: Make total_transferred an uint64_t

2023-05-04 Thread Juan Quintela
Change all the functions that use it.  It was already passed as
uint64_t.

Signed-off-by: Juan Quintela 
---
 migration/block.c | 5 ++---
 migration/qemu-file.c | 8 
 migration/qemu-file.h | 4 ++--
 migration/savevm.c| 6 ++
 migration/vmstate.c   | 2 +-
 5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index 3499f75e37..a37678ce95 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -747,8 +747,7 @@ static int block_save_setup(QEMUFile *f, void *opaque)
 static int block_save_iterate(QEMUFile *f, void *opaque)
 {
 int ret;
-int64_t last_bytes = qemu_file_total_transferred(f);
-int64_t delta_bytes;
+uint64_t last_bytes = qemu_file_total_transferred(f);
 
 trace_migration_block_save("iterate", block_mig_state.submitted,
block_mig_state.transferred);
@@ -800,7 +799,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
 }
 
 qemu_put_be64(f, BLK_MIG_FLAG_EOS);
-delta_bytes = qemu_file_total_transferred(f) - last_bytes;
+uint64_t delta_bytes = qemu_file_total_transferred(f) - last_bytes;
 return (delta_bytes > 0);
 }
 
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 7b32ef45a9..e97d180f17 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -51,7 +51,7 @@ struct QEMUFile {
 uint64_t rate_limit_used;
 
 /* The sum of bytes transferred on the wire */
-int64_t total_transferred;
+uint64_t total_transferred;
 
 int buf_index;
 int buf_size; /* 0 when writing */
@@ -718,9 +718,9 @@ int coroutine_mixed_fn qemu_get_byte(QEMUFile *f)
 return result;
 }
 
-int64_t qemu_file_total_transferred_fast(QEMUFile *f)
+uint64_t qemu_file_total_transferred_fast(QEMUFile *f)
 {
-int64_t ret = f->total_transferred;
+uint64_t ret = f->total_transferred;
 int i;
 
 for (i = 0; i < f->iovcnt; i++) {
@@ -730,7 +730,7 @@ int64_t qemu_file_total_transferred_fast(QEMUFile *f)
 return ret;
 }
 
-int64_t qemu_file_total_transferred(QEMUFile *f)
+uint64_t qemu_file_total_transferred(QEMUFile *f)
 {
 qemu_fflush(f);
 return f->total_transferred;
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 55ef5a2ac7..d758e7f10b 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -83,7 +83,7 @@ int qemu_fclose(QEMUFile *f);
  *
  * Returns: the total bytes transferred
  */
-int64_t qemu_file_total_transferred(QEMUFile *f);
+uint64_t qemu_file_total_transferred(QEMUFile *f);
 
 /*
  * qemu_file_total_transferred_fast:
@@ -95,7 +95,7 @@ int64_t qemu_file_total_transferred(QEMUFile *f);
  *
  * Returns: the total bytes transferred and queued
  */
-int64_t qemu_file_total_transferred_fast(QEMUFile *f);
+uint64_t qemu_file_total_transferred_fast(QEMUFile *f);
 
 /*
  * put_buffer without copying the buffer.
diff --git a/migration/savevm.c b/migration/savevm.c
index a9d0a88e62..032044b1d5 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -927,11 +927,9 @@ static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
JSONWriter *vmdesc)
 {
-int64_t old_offset, size;
-
-old_offset = qemu_file_total_transferred_fast(f);
+uint64_t old_offset = qemu_file_total_transferred_fast(f);
 se->ops->save_state(f, se->opaque);
-size = qemu_file_total_transferred_fast(f) - old_offset;
+uint64_t size = qemu_file_total_transferred_fast(f) - old_offset;
 
 if (vmdesc) {
 json_writer_int64(vmdesc, "size", size);
diff --git a/migration/vmstate.c b/migration/vmstate.c
index 83ca4c7d3e..351f56104e 100644
--- a/migration/vmstate.c
+++ b/migration/vmstate.c
@@ -349,7 +349,7 @@ int vmstate_save_state_v(QEMUFile *f, const 
VMStateDescription *vmsd,
 void *first_elem = opaque + field->offset;
 int i, n_elems = vmstate_n_elems(opaque, field);
 int size = vmstate_size(opaque, field);
-int64_t old_offset, written_bytes;
+uint64_t old_offset, written_bytes;
 JSONWriter *vmdesc_loop = vmdesc;
 
 trace_vmstate_save_state_loop(vmsd->name, field->name, n_elems);
-- 
2.40.0




[PATCH 3/9] qemu-file: make qemu_file_[sg]et_rate_limit() use an uint64_t

2023-05-04 Thread Juan Quintela
It is really size_t.  Everything else uses uint64_t, so move this to
uint64_t as well.  A size can't be negative anyways.

Signed-off-by: Juan Quintela 
---
 migration/migration.c | 6 +++---
 migration/qemu-file.c | 8 
 migration/qemu-file.h | 4 ++--
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index 232e387109..ee75c6cfbd 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2119,7 +2119,7 @@ static int postcopy_start(MigrationState *ms)
  */
 /* 0 max-postcopy-bandwidth means unlimited */
 if (!bandwidth) {
-qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
+qemu_file_set_rate_limit(ms->to_dst_file, UINT64_MAX);
 } else {
 qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / 
XFER_LIMIT_RATIO);
 }
@@ -2301,7 +2301,7 @@ static void migration_completion(MigrationState *s)
 }
 if (ret >= 0) {
 s->block_inactive = !migrate_colo();
-qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+qemu_file_set_rate_limit(s->to_dst_file, UINT64_MAX);
 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
  s->block_inactive);
 }
@@ -3049,7 +3049,7 @@ static void *bg_migration_thread(void *opaque)
 rcu_register_thread();
 object_ref(OBJECT(s));
 
-qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
+qemu_file_set_rate_limit(s->to_dst_file, UINT64_MAX);
 
 setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 /*
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index ee04240a21..b322b363cf 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -43,7 +43,7 @@ struct QEMUFile {
  * Maximum amount of data in bytes to transfer during one
  * rate limiting time window
  */
-int64_t rate_limit_max;
+uint64_t rate_limit_max;
 /*
  * Total amount of data in bytes queued for transfer
  * during this rate limiting time window
@@ -748,18 +748,18 @@ int qemu_file_rate_limit(QEMUFile *f)
 if (qemu_file_get_error(f)) {
 return 1;
 }
-if (f->rate_limit_max > 0 && f->rate_limit_used > f->rate_limit_max) {
+if (f->rate_limit_used > f->rate_limit_max) {
 return 1;
 }
 return 0;
 }
 
-int64_t qemu_file_get_rate_limit(QEMUFile *f)
+uint64_t qemu_file_get_rate_limit(QEMUFile *f)
 {
 return f->rate_limit_max;
 }
 
-void qemu_file_set_rate_limit(QEMUFile *f, int64_t limit)
+void qemu_file_set_rate_limit(QEMUFile *f, uint64_t limit)
 {
 f->rate_limit_max = limit;
 }
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index d16cd50448..9e158c00f6 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -138,8 +138,8 @@ void qemu_file_reset_rate_limit(QEMUFile *f);
  * need to be applied to the rate limiting calcuations
  */
 void qemu_file_acct_rate_limit(QEMUFile *f, int64_t len);
-void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
-int64_t qemu_file_get_rate_limit(QEMUFile *f);
+void qemu_file_set_rate_limit(QEMUFile *f, uint64_t new_rate);
+uint64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
 int qemu_file_get_error_obj_any(QEMUFile *f1, QEMUFile *f2, Error **errp);
 void qemu_file_set_error_obj(QEMUFile *f, int ret, Error *err);
-- 
2.40.0




[PATCH 1/9] migration: max_postcopy_bandwidth is a size parameter

2023-05-04 Thread Juan Quintela
So make everything that uses it uint64_t, not int64_t.

Signed-off-by: Juan Quintela 
---
 migration/migration.c | 4 ++--
 migration/options.c   | 2 +-
 migration/options.h   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index feb5ab7493..232e387109 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2056,7 +2056,7 @@ static int postcopy_start(MigrationState *ms)
 QIOChannelBuffer *bioc;
 QEMUFile *fb;
 int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-int64_t bandwidth = migrate_max_postcopy_bandwidth();
+uint64_t bandwidth = migrate_max_postcopy_bandwidth();
 bool restart_block = false;
 int cur_state = MIGRATION_STATUS_ACTIVE;
 
@@ -3176,7 +3176,7 @@ fail:
 void migrate_fd_connect(MigrationState *s, Error *error_in)
 {
 Error *local_err = NULL;
-int64_t rate_limit;
+uint64_t rate_limit;
 bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
 
 /*
diff --git a/migration/options.c b/migration/options.c
index 7395787960..2e759cc306 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -717,7 +717,7 @@ uint64_t migrate_max_bandwidth(void)
 return s->parameters.max_bandwidth;
 }
 
-int64_t migrate_max_postcopy_bandwidth(void)
+uint64_t migrate_max_postcopy_bandwidth(void)
 {
 MigrationState *s = migrate_get_current();
 
diff --git a/migration/options.h b/migration/options.h
index 09841d6a63..5cca3326d6 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -85,7 +85,7 @@ int migrate_decompress_threads(void);
 uint64_t migrate_downtime_limit(void);
 uint8_t migrate_max_cpu_throttle(void);
 uint64_t migrate_max_bandwidth(void);
-int64_t migrate_max_postcopy_bandwidth(void);
+uint64_t migrate_max_postcopy_bandwidth(void);
 int migrate_multifd_channels(void);
 MultiFDCompression migrate_multifd_compression(void);
 int migrate_multifd_zlib_level(void);
-- 
2.40.0




[PATCH 0/9] QEMU file cleanups

2023-05-04 Thread Juan Quintela
Hi

While I am trying to put order in the atomic counters, I made in this series:

- convince and review code to see that everything is uint64_t.

- f->shutdown is not needed.  When we shut down the file we put an
  error there if there is none.  So remove it.
- Make more clear how we use rate_limit.

Please review.

It is based on my previous series to the list:
Subject: [PATCH 0/2] More migration stats
Based-on: Message-Id: <20230504103357.22130-1-quint...@redhat.com>

Juan Quintela (9):
  migration: max_postcopy_bandwidth is a size parameter
  migration: qemu_file_total_transferred() function is monotonic
  qemu-file: make qemu_file_[sg]et_rate_limit() use an uint64_t
  qemu-file: Make rate_limit_used an uint64_t
  qemu-file: No need to check for shutdown in qemu_file_rate_limit
  qemu-file: remove shutdown member
  qemu-file: Make total_transferred an uint64_t
  qemu-file: Make ram_control_save_page() use accessors for rate_limit
  qemu-file: Account for rate_limit usage on qemu_fflush()

 migration/block.c | 13 +++--
 migration/migration.c | 10 +-
 migration/options.c   |  2 +-
 migration/options.h   |  2 +-
 migration/qemu-file.c | 42 +-
 migration/qemu-file.h | 10 +-
 migration/savevm.c|  6 ++
 migration/vmstate.c   |  2 +-
 8 files changed, 35 insertions(+), 52 deletions(-)

-- 
2.40.0




[PATCH 4/9] qemu-file: Make rate_limit_used an uint64_t

2023-05-04 Thread Juan Quintela
Change all the functions that use it.  It was already passed as
uint64_t.

Signed-off-by: Juan Quintela 
---
 migration/qemu-file.c | 4 ++--
 migration/qemu-file.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index b322b363cf..5aa990b82a 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -48,7 +48,7 @@ struct QEMUFile {
  * Total amount of data in bytes queued for transfer
  * during this rate limiting time window
  */
-int64_t rate_limit_used;
+uint64_t rate_limit_used;
 
 /* The sum of bytes transferred on the wire */
 int64_t total_transferred;
@@ -769,7 +769,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f)
 f->rate_limit_used = 0;
 }
 
-void qemu_file_acct_rate_limit(QEMUFile *f, int64_t len)
+void qemu_file_acct_rate_limit(QEMUFile *f, uint64_t len)
 {
 f->rate_limit_used += len;
 }
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 9e158c00f6..55ef5a2ac7 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -137,7 +137,7 @@ void qemu_file_reset_rate_limit(QEMUFile *f);
  * out of band from the main file object I/O methods, and
  * need to be applied to the rate limiting calcuations
  */
-void qemu_file_acct_rate_limit(QEMUFile *f, int64_t len);
+void qemu_file_acct_rate_limit(QEMUFile *f, uint64_t len);
 void qemu_file_set_rate_limit(QEMUFile *f, uint64_t new_rate);
 uint64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
-- 
2.40.0




[PATCH 9/9] qemu-file: Account for rate_limit usage on qemu_fflush()

2023-05-04 Thread Juan Quintela
That is the moment we know we have transferred something.

Signed-off-by: Juan Quintela 
---
 migration/qemu-file.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index ddebfac847..309b4c56f4 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -300,7 +300,9 @@ void qemu_fflush(QEMUFile *f)
_error) < 0) {
 qemu_file_set_error_obj(f, -EIO, local_error);
 } else {
-f->total_transferred += iov_size(f->iov, f->iovcnt);
+uint64_t size = iov_size(f->iov, f->iovcnt);
+qemu_file_acct_rate_limit(f, size);
+f->total_transferred += size;
 }
 
 qemu_iovec_release_ram(f);
@@ -527,7 +529,6 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, 
size_t size,
 return;
 }
 
-f->rate_limit_used += size;
 add_to_iovec(f, buf, size, may_free);
 }
 
@@ -545,7 +546,6 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, 
size_t size)
 l = size;
 }
 memcpy(f->buf + f->buf_index, buf, l);
-f->rate_limit_used += l;
 add_buf_to_iovec(f, l);
 if (qemu_file_get_error(f)) {
 break;
@@ -562,7 +562,6 @@ void qemu_put_byte(QEMUFile *f, int v)
 }
 
 f->buf[f->buf_index] = v;
-f->rate_limit_used++;
 add_buf_to_iovec(f, 1);
 }
 
-- 
2.40.0




[PATCH 6/9] qemu-file: remove shutdown member

2023-05-04 Thread Juan Quintela
The first thing that we do after setting the shutdown value is to set
the error as -EIO if there is no previous error.

So this value is redundant.  Just remove it and use
qemu_file_get_error() in the places that it was tested.

Signed-off-by: Juan Quintela 
---
 migration/qemu-file.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 771aba7369..7b32ef45a9 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -63,8 +63,6 @@ struct QEMUFile {
 
 int last_error;
 Error *last_error_obj;
-/* has the file has been shutdown */
-bool shutdown;
 };
 
 /*
@@ -78,8 +76,6 @@ int qemu_file_shutdown(QEMUFile *f)
 {
 int ret = 0;
 
-f->shutdown = true;
-
 /*
  * We must set qemufile error before the real shutdown(), otherwise
  * there can be a race window where we thought IO all went though
@@ -294,7 +290,7 @@ void qemu_fflush(QEMUFile *f)
 return;
 }
 
-if (f->shutdown) {
+if (qemu_file_get_error(f)) {
 return;
 }
 if (f->iovcnt > 0) {
@@ -407,7 +403,7 @@ static ssize_t coroutine_mixed_fn qemu_fill_buffer(QEMUFile 
*f)
 f->buf_index = 0;
 f->buf_size = pending;
 
-if (f->shutdown) {
+if (qemu_file_get_error(f)) {
 return 0;
 }
 
@@ -496,7 +492,7 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, 
size_t size,
 } else {
 if (f->iovcnt >= MAX_IOV_SIZE) {
 /* Should only happen if a previous fflush failed */
-assert(f->shutdown || !qemu_file_is_writable(f));
+assert(qemu_file_get_error(f) || !qemu_file_is_writable(f));
 return 1;
 }
 if (may_free) {
-- 
2.40.0




[PATCH 8/9] qemu-file: Make ram_control_save_page() use accessors for rate_limit

2023-05-04 Thread Juan Quintela
Signed-off-by: Juan Quintela 
---
 migration/qemu-file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index e97d180f17..ddebfac847 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -362,7 +362,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t 
block_offset,
 int ret = f->hooks->save_page(f, block_offset,
   offset, size, bytes_sent);
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
-f->rate_limit_used += size;
+qemu_file_acct_rate_limit(f, size);
 }
 
 if (ret != RAM_SAVE_CONTROL_DELAYED &&
-- 
2.40.0




[PATCH 2/9] migration: qemu_file_total_transferred() function is monotonic

2023-05-04 Thread Juan Quintela
So delta_bytes can only be greater than or equal to zero, never negative.

Signed-off-by: Juan Quintela 
---
 migration/block.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/migration/block.c b/migration/block.c
index 6d532ac7a2..3499f75e37 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -801,13 +801,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
 
 qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 delta_bytes = qemu_file_total_transferred(f) - last_bytes;
-if (delta_bytes > 0) {
-return 1;
-} else if (delta_bytes < 0) {
-return -1;
-} else {
-return 0;
-}
+return (delta_bytes > 0);
 }
 
 /* Called with iothread lock taken.  */
-- 
2.40.0




[PATCH 5/9] qemu-file: No need to check for shutdown in qemu_file_rate_limit

2023-05-04 Thread Juan Quintela
After calling qemu_file_shutdown() we set the error as -EIO if there
is no other previous error, so there is no need to check it here.

Signed-off-by: Juan Quintela 
---
 migration/qemu-file.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 5aa990b82a..771aba7369 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -742,9 +742,6 @@ int64_t qemu_file_total_transferred(QEMUFile *f)
 
 int qemu_file_rate_limit(QEMUFile *f)
 {
-if (f->shutdown) {
-return 1;
-}
 if (qemu_file_get_error(f)) {
 return 1;
 }
-- 
2.40.0




  1   2   >