date:20220211

[PULL 2/3] iotests: Test blockdev-reopen with iothreads and throttling

2022-02-11 Thread Kevin Wolf

The 'throttle' block driver implements .bdrv_co_drain_end, so
blockdev-reopen will have to wait for it to complete in the polling
loop at the end of qmp_blockdev_reopen(). This makes AIO_WAIT_WHILE()
release the AioContext lock, which causes a crash if the lock hasn't
correctly been taken.

Signed-off-by: Kevin Wolf 
Message-Id: <20220203140534.36522-3-kw...@redhat.com>
Reviewed-by: Hanna Reitz 
Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/245 | 36 +---
 tests/qemu-iotests/245.out |  4 ++--
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245
index 24ac43f70e..8cbed7821b 100755
--- a/tests/qemu-iotests/245
+++ b/tests/qemu-iotests/245
@@ -1138,12 +1138,13 @@ class TestBlockdevReopen(iotests.QMPTestCase):
 self.assertEqual(self.get_node('hd1'), None)
 self.assert_qmp(self.get_node('hd2'), 'ro', True)
 
-def run_test_iothreads(self, iothread_a, iothread_b, errmsg = None):
-opts = hd_opts(0)
+def run_test_iothreads(self, iothread_a, iothread_b, errmsg = None,
+   opts_a = None, opts_b = None):
+opts = opts_a or hd_opts(0)
 result = self.vm.qmp('blockdev-add', conv_keys = False, **opts)
 self.assert_qmp(result, 'return', {})
 
-opts2 = hd_opts(2)
+opts2 = opts_b or hd_opts(2)
 result = self.vm.qmp('blockdev-add', conv_keys = False, **opts2)
 self.assert_qmp(result, 'return', {})
 
@@ -1194,6 +1195,35 @@ class TestBlockdevReopen(iotests.QMPTestCase):
 def test_iothreads_switch_overlay(self):
 self.run_test_iothreads('', 'iothread0')
 
+def test_iothreads_with_throttling(self):
+# Create a throttle-group object
+opts = { 'qom-type': 'throttle-group', 'id': 'group0',
+ 'limits': { 'iops-total': 1000 } }
+result = self.vm.qmp('object-add', conv_keys = False, **opts)
+self.assert_qmp(result, 'return', {})
+
+# Options with a throttle filter between format and protocol
+opts = [
+{
+'driver': iotests.imgfmt,
+'node-name': f'hd{idx}',
+'file' : {
+'node-name': f'hd{idx}-throttle',
+'driver': 'throttle',
+'throttle-group': 'group0',
+'file': {
+'driver': 'file',
+'node-name': f'hd{idx}-file',
+'filename': hd_path[idx],
+},
+},
+}
+for idx in (0, 2)
+]
+
+self.run_test_iothreads('iothread0', 'iothread0', None,
+opts[0], opts[1])
+
 if __name__ == '__main__':
 iotests.activate_logging()
 iotests.main(supported_fmts=["qcow2"],
diff --git a/tests/qemu-iotests/245.out b/tests/qemu-iotests/245.out
index 4eced19294..a4e04a3266 100644
--- a/tests/qemu-iotests/245.out
+++ b/tests/qemu-iotests/245.out
@@ -17,8 +17,8 @@ read 1/1 bytes at offset 262152
 read 1/1 bytes at offset 262160
 1 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
-...
+
 --
-Ran 25 tests
+Ran 26 tests
 
 OK
-- 
2.34.1

[PULL 3/3] hw/block/fdc-isa: Respect QOM properties when building AML

2022-02-11 Thread Kevin Wolf

From: Bernhard Beschow 

Other ISA devices such as serial-isa use the properties in their
build_aml functions. fdc-isa not using them is probably an oversight.

Signed-off-by: Bernhard Beschow 
Message-Id: <20220209191558.30393-1-shen...@gmail.com>
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Kevin Wolf 
---
 hw/block/fdc-isa.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/block/fdc-isa.c b/hw/block/fdc-isa.c
index 3bf64e0665..ab663dce93 100644
--- a/hw/block/fdc-isa.c
+++ b/hw/block/fdc-isa.c
@@ -216,6 +216,7 @@ int cmos_get_fd_drive_type(FloppyDriveType fd0)
 
 static void fdc_isa_build_aml(ISADevice *isadev, Aml *scope)
 {
+FDCtrlISABus *isa = ISA_FDC(isadev);
 Aml *dev;
 Aml *crs;
 int i;
@@ -227,11 +228,13 @@ static void fdc_isa_build_aml(ISADevice *isadev, Aml 
*scope)
 };
 
 crs = aml_resource_template();
-aml_append(crs, aml_io(AML_DECODE16, 0x03F2, 0x03F2, 0x00, 0x04));
-aml_append(crs, aml_io(AML_DECODE16, 0x03F7, 0x03F7, 0x00, 0x01));
-aml_append(crs, aml_irq_no_flags(6));
 aml_append(crs,
-aml_dma(AML_COMPATIBILITY, AML_NOTBUSMASTER, AML_TRANSFER8, 2));
+aml_io(AML_DECODE16, isa->iobase + 2, isa->iobase + 2, 0x00, 0x04));
+aml_append(crs,
+aml_io(AML_DECODE16, isa->iobase + 7, isa->iobase + 7, 0x00, 0x01));
+aml_append(crs, aml_irq_no_flags(isa->irq));
+aml_append(crs,
+aml_dma(AML_COMPATIBILITY, AML_NOTBUSMASTER, AML_TRANSFER8, isa->dma));
 
 dev = aml_device("FDC0");
 aml_append(dev, aml_name_decl("_HID", aml_eisaid("PNP0700")));
-- 
2.34.1

[PULL 1/3] block: Lock AioContext for drain_end in blockdev-reopen

2022-02-11 Thread Kevin Wolf

bdrv_subtree_drained_end() requires the caller to hold the AioContext
lock for the drained node. Not doing this for nodes outside of the main
AioContext leads to crashes when AIO_WAIT_WHILE() needs to wait and
tries to temporarily release the lock.

Fixes: 3908b7a8994fa5ef7a89aa58cd5a02fc58141592
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2046659
Reported-by: Qing Wang 
Signed-off-by: Kevin Wolf 
Message-Id: <20220203140534.36522-2-kw...@redhat.com>
Reviewed-by: Hanna Reitz 
Signed-off-by: Kevin Wolf 
---
 blockdev.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/blockdev.c b/blockdev.c
index 8197165bb5..42e098b458 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3530,6 +3530,7 @@ void qmp_blockdev_reopen(BlockdevOptionsList 
*reopen_list, Error **errp)
 {
 BlockReopenQueue *queue = NULL;
 GSList *drained = NULL;
+GSList *p;
 
 /* Add each one of the BDS that we want to reopen to the queue */
 for (; reopen_list != NULL; reopen_list = reopen_list->next) {
@@ -3579,7 +3580,15 @@ void qmp_blockdev_reopen(BlockdevOptionsList 
*reopen_list, Error **errp)
 
 fail:
 bdrv_reopen_queue_free(queue);
-g_slist_free_full(drained, (GDestroyNotify) bdrv_subtree_drained_end);
+for (p = drained; p; p = p->next) {
+BlockDriverState *bs = p->data;
+AioContext *ctx = bdrv_get_aio_context(bs);
+
+aio_context_acquire(ctx);
+bdrv_subtree_drained_end(bs);
+aio_context_release(ctx);
+}
+g_slist_free(drained);
 }
 
 void qmp_blockdev_del(const char *node_name, Error **errp)
-- 
2.34.1

[PULL 0/3] Block layer patches

2022-02-11 Thread Kevin Wolf

The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20220208' 
into staging (2022-02-08 11:40:08 +)

are available in the Git repository at:

  https://gitlab.com/kmwolf/qemu.git tags/for-upstream

for you to fetch changes up to fdb8541b2e4f6ff60f435fbb5a5e1df20c275a86:

  hw/block/fdc-isa: Respect QOM properties when building AML (2022-02-11 
17:37:26 +0100)


Block layer patches

- Fix crash in blockdev-reopen with iothreads
- fdc-isa: Respect QOM properties when building AML


Bernhard Beschow (1):
  hw/block/fdc-isa: Respect QOM properties when building AML

Kevin Wolf (2):
  block: Lock AioContext for drain_end in blockdev-reopen
  iotests: Test blockdev-reopen with iothreads and throttling

 blockdev.c | 11 ++-
 hw/block/fdc-isa.c | 11 +++
 tests/qemu-iotests/245 | 36 +---
 tests/qemu-iotests/245.out |  4 ++--
 4 files changed, 52 insertions(+), 10 deletions(-)

Re: [PATCH v2 1/8] tests/qemu-iotests/testrunner: Allow parallel test invocations

2022-02-11 Thread Kevin Wolf

Am 11.02.2022 um 17:14 hat Hanna Reitz geschrieben:
> On 11.02.22 17:00, Kevin Wolf wrote:
> > Am 11.02.2022 um 14:53 hat Thomas Huth geschrieben:
> > > On 11/02/2022 10.29, Kevin Wolf wrote:
> > > > Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:
> > > > > If multiple tests run in parallel, they must use unique file
> > > > > names for the test output.
> > > > > 
> > > > > Suggested-by: Hanna Reitz 
> > > > > Signed-off-by: Thomas Huth 
> > > > > ---
> > > > >tests/qemu-iotests/testrunner.py | 2 +-
> > > > >1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > 
> > > > > diff --git a/tests/qemu-iotests/testrunner.py 
> > > > > b/tests/qemu-iotests/testrunner.py
> > > > > index 0eace147b8..9d20f51bb1 100644
> > > > > --- a/tests/qemu-iotests/testrunner.py
> > > > > +++ b/tests/qemu-iotests/testrunner.py
> > > > > @@ -259,7 +259,7 @@ def do_run_test(self, test: str, mp: bool) -> 
> > > > > TestResult:
> > > > >"""
> > > > >f_test = Path(test)
> > > > > -f_bad = Path(f_test.name + '.out.bad')
> > > > > +f_bad = Path(f'{os.getpid()}-{f_test.name}.out.bad')
> > > > >f_notrun = Path(f_test.name + '.notrun')
> > > > >f_casenotrun = Path(f_test.name + '.casenotrun')
> > > > >f_reference = Path(self.find_reference(test))
> > > > Hmm... It does make sense, but nobody ever cleans those files up.
> > > > Currently, the next run of the test will just overwrite the existing
> > > > file or delete it when the test succeeds. So after running the test
> > > > suite, you have .out.bad files for every failed test, but not for those
> > > > that succeeded.
> > > > 
> > > > After this change, won't the test directory accumulate tons of .out.bad
> > > > files over time?
> > > True ... but we certainly want to keep the file for failed tests for 
> > > further
> > > analysis instead of immediately deleting them ... maybe it would be enough
> > > to encode the image format (qcow2, qed, vmdk, ...) into the output name,
> > > instead of using the PID, so that "make check SPEED=thorough" works as
> > > expected here?
> > It depends on what the supported use case for test suites running in
> > parallel is. If it's just for testing multiple formats at the same time,
> > then this would work, yes.
> > 
> > I could think of more test runs that you might want to do in parallel,
> > like different protocols, different image format options, maybe even
> > different host file system. I'm not sure if all (or any) of these are
> > relevant, though.
> > 
> > Supporting only things that "make check" uses might be a good
> > compromise.
> 
> Personally and originally, I wrote that diff to allow me to actually run the
> very same test many times in parallel.  If an error occurs only very rarely,
> then I like running like 24 loops of the same test with exactly the same
> configuration (just different TEST_DIRs, of course) in parallel.
> 
> The fact that the .out.bad files tend to accumulate is why I haven’t sent it
> upstream so far.  Personally, I like Vladimir’s idea to put them into
> TEST_DIR, but probably just because this works so well for my usual case
> where TEST_DIR is on tmpfs and I thus don’t have to clean it up.

I think it could actually work fine because if you don't override
TEST_DIR, it's the same every time, and then you get the old behaviour,
just with the .out.bad files moved into scratch/.

Kevin

Re: [PATCH 0/4] Make qemu-img dd more flexible

2022-02-11 Thread Hanna Reitz


On 11.02.22 17:31, Eric Blake wrote:

On Thu, Feb 10, 2022 at 02:31:19PM +0100, Fabian Ebner wrote:

Adds support for reading from stdin and writing to stdout (when raw
format is used), as well as overriding the size of the output and
input image/stream.

Additionally, the options -n for skipping output image creation and -l
for loading a snapshot are made available like for convert.

Without looking at the series itself, I want to refer back to earlier
times that someone proposed improving 'qemu-img dd':

https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg00636.html
https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg02618.html

As well as the observation that when we originally allowed 'qemu-img
dd' to be added, the end goal was that if 'qemu-img dd' can't operate
as a thin wrapper around 'qemu-img convert', then 'qemu-img convert'
needs to be made more powerful first.  Every time we diverge on what
the two uses can do, rather than keeping dd as a thin wrapper, we add
to our maintenance burden.

Sadly, there is a lot of technical debt in this area ('qemu-img dd
skip= count=' is STILL broken, more than 4 years after I first
proposed a potential patch), where no one has spent the necessary time
to improve the situation.


Note that by now (in contrast to 2018), we have FUSE disk exports, and I 
even have a script that uses them to let you run dd on any image:


https://gitlab.com/hreitz/qemu-scripts/-/blob/main/qemu-dd.py

Which is nice, because it gives you feature parity with dd, because it 
simply runs dd.


(The main problem with the script is that it lives in that personal repo 
of mine and so nobody but me knows about it.  Suggestions to improve 
that situation are more than welcome.)


Now, the qemu storage daemon does not support loading qcow2 snapshots 
(as far as I’m aware), which is proposed in patch 4 of this series.  But 
I think that just means that it would be nice if the QSD could support that.


Hanna

Re: [PATCH] hw/block/fdc-isa: Respect QOM properties when building AML

2022-02-11 Thread Kevin Wolf

Am 09.02.2022 um 20:15 hat Bernhard Beschow geschrieben:
> Other ISA devices such as serial-isa use the properties in their
> build_aml functions. fdc-isa not using them is probably an oversight.
> 
> Signed-off-by: Bernhard Beschow 

Thanks, applied to the block branch.

Kevin

Re: [PATCH 1/6] tests/qemu-iotests: Improve the check for GNU sed

2022-02-11 Thread Thomas Huth


On 11/02/2022 17.14, Eric Blake wrote:

On Tue, Feb 08, 2022 at 03:52:19PM +0100, Thomas Huth wrote:

The current code with $SED has been introduced almost three years
ago already...


   Can’t we just do `alias sed=gsed`?


Maybe ... but let's ask Philippe and Kevin first, who Signed-off
commit bde36af1ab4f476 that introduced the current way with $SED:
What's your opinion about this?


This commit was to have check-block working on the OpenBSD VM image.


Sure. The question was whether using an alias as suggested by Hanna would be
nicer instead of using $SED ?


Scripting with aliases becomes a nightmare to debug, since it is
relatively uncommon.  In particular, in bash, you have to explicitly
opt in to using aliases (contrary to POSIX sh where aliases are
available to scripts at startup).


shopt -s expand_aliases
... as I just learnt the hard way ;-)


Using $SED everywhere may require
more hunting, but it is more obvious when reading a test that "oh
yeah, I might be using extensions that the default 'sed' can't
support" than a script that blindly uses 'sed' and depends on it
aliasing to a more-capable sed at a distance.

The other question is how many GNU sed features are we actually
depending on?  Which tests break if we have BSD sed or busybox sed?
Can we rewrite those sed scripts to avoid GNU extensions?  But
auditing for s/sed/$SED/ seems easier than auditing for which
non-portable sed extensions we depend on.


The most obvious part are the filter functions in common.filter - we're 
using "-r" here that is not part of the POSIX sed as far as I can see.


Not sure whether anybody really wants to rewrite all sed statements for full 
portability, but maybe we could also introduce a wrapper for GNU sed like this:


if ! command -v gsed >/dev/null 2>&1; then
if sed --version | grep -v 'not GNU sed' | grep 'GNUx sed' \
   > /dev/null 2>&1;
then
gsed()
{
sed $*
}
else
gsed()
{
_notrun "GNU sed not available"
}
fi
fi

... then we could simply use "gsed" everywhere we depend on the GNU 
behavior, and the tests don't look as ugly as with the $SED ?


 Thomas

Re: [PATCH 0/4] Make qemu-img dd more flexible

2022-02-11 Thread Eric Blake

On Thu, Feb 10, 2022 at 02:31:19PM +0100, Fabian Ebner wrote:
> Adds support for reading from stdin and writing to stdout (when raw
> format is used), as well as overriding the size of the output and
> input image/stream.
> 
> Additionally, the options -n for skipping output image creation and -l
> for loading a snapshot are made available like for convert.

Without looking at the series itself, I want to refer back to earlier
times that someone proposed improving 'qemu-img dd':

https://lists.gnu.org/archive/html/qemu-devel/2021-02/msg00636.html
https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg02618.html

As well as the observation that when we originally allowed 'qemu-img
dd' to be added, the end goal was that if 'qemu-img dd' can't operate
as a thin wrapper around 'qemu-img convert', then 'qemu-img convert'
needs to be made more powerful first.  Every time we diverge on what
the two uses can do, rather than keeping dd as a thin wrapper, we add
to our maintenance burden.

Sadly, there is a lot of technical debt in this area ('qemu-img dd
skip= count=' is STILL broken, more than 4 years after I first
proposed a potential patch), where no one has spent the necessary time
to improve the situation.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [PATCH 1/6] tests/qemu-iotests: Improve the check for GNU sed

2022-02-11 Thread Eric Blake

On Tue, Feb 08, 2022 at 03:52:19PM +0100, Thomas Huth wrote:
> > > The current code with $SED has been introduced almost three years
> > > ago already...
> > > 
> > > >   Can’t we just do `alias sed=gsed`?
> > > 
> > > Maybe ... but let's ask Philippe and Kevin first, who Signed-off
> > > commit bde36af1ab4f476 that introduced the current way with $SED:
> > > What's your opinion about this?
> > 
> > This commit was to have check-block working on the OpenBSD VM image.
> 
> Sure. The question was whether using an alias as suggested by Hanna would be
> nicer instead of using $SED ?

Scripting with aliases becomes a nightmare to debug, since it is
relatively uncommon.  In particular, in bash, you have to explicitly
opt in to using aliases (contrary to POSIX sh where aliases are
available to scripts at startup).  Using $SED everywhere may require
more hunting, but it is more obvious when reading a test that "oh
yeah, I might be using extensions that the default 'sed' can't
support" than a script that blindly uses 'sed' and depends on it
aliasing to a more-capable sed at a distance.

The other question is how many GNU sed features are we actually
depending on?  Which tests break if we have BSD sed or busybox sed?
Can we rewrite those sed scripts to avoid GNU extensions?  But
auditing for s/sed/$SED/ seems easier than auditing for which
non-portable sed extensions we depend on.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [PATCH v2 1/8] tests/qemu-iotests/testrunner: Allow parallel test invocations

2022-02-11 Thread Hanna Reitz


On 11.02.22 17:00, Kevin Wolf wrote:

Am 11.02.2022 um 14:53 hat Thomas Huth geschrieben:

On 11/02/2022 10.29, Kevin Wolf wrote:

Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:

If multiple tests run in parallel, they must use unique file
names for the test output.

Suggested-by: Hanna Reitz 
Signed-off-by: Thomas Huth 
---
   tests/qemu-iotests/testrunner.py | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/testrunner.py b/tests/qemu-iotests/testrunner.py
index 0eace147b8..9d20f51bb1 100644
--- a/tests/qemu-iotests/testrunner.py
+++ b/tests/qemu-iotests/testrunner.py
@@ -259,7 +259,7 @@ def do_run_test(self, test: str, mp: bool) -> TestResult:
   """
   f_test = Path(test)
-f_bad = Path(f_test.name + '.out.bad')
+f_bad = Path(f'{os.getpid()}-{f_test.name}.out.bad')
   f_notrun = Path(f_test.name + '.notrun')
   f_casenotrun = Path(f_test.name + '.casenotrun')
   f_reference = Path(self.find_reference(test))

Hmm... It does make sense, but nobody ever cleans those files up.
Currently, the next run of the test will just overwrite the existing
file or delete it when the test succeeds. So after running the test
suite, you have .out.bad files for every failed test, but not for those
that succeeded.

After this change, won't the test directory accumulate tons of .out.bad
files over time?

True ... but we certainly want to keep the file for failed tests for further
analysis instead of immediately deleting them ... maybe it would be enough
to encode the image format (qcow2, qed, vmdk, ...) into the output name,
instead of using the PID, so that "make check SPEED=thorough" works as
expected here?

It depends on what the supported use case for test suites running in
parallel is. If it's just for testing multiple formats at the same time,
then this would work, yes.

I could think of more test runs that you might want to do in parallel,
like different protocols, different image format options, maybe even
different host file system. I'm not sure if all (or any) of these are
relevant, though.

Supporting only things that "make check" uses might be a good
compromise.


Personally and originally, I wrote that diff to allow me to actually run 
the very same test many times in parallel.  If an error occurs only very 
rarely, then I like running like 24 loops of the same test with exactly 
the same configuration (just different TEST_DIRs, of course) in parallel.


The fact that the .out.bad files tend to accumulate is why I haven’t 
sent it upstream so far.  Personally, I like Vladimir’s idea to put them 
into TEST_DIR, but probably just because this works so well for my usual 
case where TEST_DIR is on tmpfs and I thus don’t have to clean it up.


Hanna

Re: [PATCH v2 1/8] tests/qemu-iotests/testrunner: Allow parallel test invocations

2022-02-11 Thread Kevin Wolf

Am 11.02.2022 um 14:53 hat Thomas Huth geschrieben:
> On 11/02/2022 10.29, Kevin Wolf wrote:
> > Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:
> > > If multiple tests run in parallel, they must use unique file
> > > names for the test output.
> > > 
> > > Suggested-by: Hanna Reitz 
> > > Signed-off-by: Thomas Huth 
> > > ---
> > >   tests/qemu-iotests/testrunner.py | 2 +-
> > >   1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/tests/qemu-iotests/testrunner.py 
> > > b/tests/qemu-iotests/testrunner.py
> > > index 0eace147b8..9d20f51bb1 100644
> > > --- a/tests/qemu-iotests/testrunner.py
> > > +++ b/tests/qemu-iotests/testrunner.py
> > > @@ -259,7 +259,7 @@ def do_run_test(self, test: str, mp: bool) -> 
> > > TestResult:
> > >   """
> > >   f_test = Path(test)
> > > -f_bad = Path(f_test.name + '.out.bad')
> > > +f_bad = Path(f'{os.getpid()}-{f_test.name}.out.bad')
> > >   f_notrun = Path(f_test.name + '.notrun')
> > >   f_casenotrun = Path(f_test.name + '.casenotrun')
> > >   f_reference = Path(self.find_reference(test))
> > 
> > Hmm... It does make sense, but nobody ever cleans those files up.
> > Currently, the next run of the test will just overwrite the existing
> > file or delete it when the test succeeds. So after running the test
> > suite, you have .out.bad files for every failed test, but not for those
> > that succeeded.
> > 
> > After this change, won't the test directory accumulate tons of .out.bad
> > files over time?
> 
> True ... but we certainly want to keep the file for failed tests for further
> analysis instead of immediately deleting them ... maybe it would be enough
> to encode the image format (qcow2, qed, vmdk, ...) into the output name,
> instead of using the PID, so that "make check SPEED=thorough" works as
> expected here?

It depends on what the supported use case for test suites running in
parallel is. If it's just for testing multiple formats at the same time,
then this would work, yes.

I could think of more test runs that you might want to do in parallel,
like different protocols, different image format options, maybe even
different host file system. I'm not sure if all (or any) of these are
relevant, though.

Supporting only things that "make check" uses might be a good
compromise.

Kevin

Re: [PATCH 5/6] test-bdrv-drain.c: remove test_detach_by_parent_cb()

2022-02-11 Thread Kevin Wolf

Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
> This test uses a callback of an I/O function (blk_aio_preadv)
> to modify the graph, using bdrv_attach_child.
> This is simply not allowed anymore. I/O cannot change the graph.

The callback of an I/O function isn't I/O, though. It is code _after_
the I/O has completed. If this doesn't work any more, it feels like this
is a bug.

I think it becomes a bit more obvious when you translate the AIO into
the equivalent coroutine function:

void coroutine_fn foo(...)
{
GLOBAL_STATE_CODE();

blk_co_preadv(...);
detach_by_parent_aio_cb(...);
}

This is obviously correct code that must work. Calling an I/O function
from a GS function is allowed, and of course the GS function may
continue to do GS stuff after the I/O function returned.

(Actually, I'm not sure if it really works when blk is not in the main
AioContext, but your API split patches claim that it does, so let's
assume for the moment that this is already true :-))

> Before "block/io.c: make bdrv_do_drained_begin_quiesce static
> and introduce bdrv_drained_begin_no_poll", the test would simply
> be at risk of failure, because if bdrv_replace_child_noperm()
> (called to modify the graph) would call a drain,
> then one callback of .drained_begin() is bdrv_do_drained_begin_quiesce,
> that specifically asserts that we are not in a coroutine.
> 
> Now that we fixed the behavior, the drain will invoke a bh in the
> main loop, so we don't have such problem. However, this test is still
> illegal and fails because we forbid graph changes from I/O paths.
> 
> Once we add the required subtree_drains to protect
> bdrv_replace_child_noperm(), the key problem in this test is in:

Probably a question for a different patch, but why is a subtree drain
required instead of just a normal node drain? It feels like a bigger
hammer than what is needed for replacing a single child.

> acb = blk_aio_preadv(blk, 0, &qiov, 0, detach_by_parent_aio_cb, NULL);
> /* Drain and check the expected result */
> bdrv_subtree_drained_begin(parent_b);
> 
> because the detach_by_parent_aio_cb calls detach_indirect_bh(), that
> modifies the graph and is invoked during bdrv_subtree_drained_begin().
> The call stack is the following:
> 1. blk_aio_preadv() creates a coroutine, increments in_flight counter
> and enters the coroutine running blk_aio_read_entry()
> 2. blk_aio_read_entry() performs the read and then schedules a bh to
>complete (blk_aio_complete)
> 3. at this point, subtree_drained_begin() kicks in and waits for all
>in_flight requests, polling
> 4. polling allows the bh to be scheduled, so blk_aio_complete runs
> 5. blk_aio_complete *first* invokes the callback
>(detach_by_parent_aio_cb) and then decrements the in_flight counter
> 6. Here we have the problem: detach_by_parent_aio_cb modifies the graph,
>so both bdrv_unref_child() and bdrv_attach_child() will have
>subtree_drains inside. And this causes a deadlock, because the
>nested drain will wait for in_flight counter to go to zero, which
>is only happening once the drain itself finishes.

So the problem has nothing to do with detach_by_parent_aio_cb() being
an I/O function in the sense of locking rules (which it isn't), but with
calling a drain operation while having the in_flight counter increased.

In other words, an AIO callback like detach_by_parent_aio_cb() must
never call drain - and it doesn't before your changes to
bdrv_replace_child_noperm() break it. How confident are we that this
only breaks tests and not real code?

Part of the problem is probably that drain is serving two slightly
different purposes inside the block layer (just make sure that nothing
touches the node any more) and callers outside of the block layer (make
sure that everything has completed and no more callbacks will be
called). This bug sits exactly in the difference between those two
concepts - we're waiting for more than we would have to wait for, and it
causes a deadlock in this combination.

I guess it could be fixed if BlockBackend accounted for requests that
are already completed, but their callback hasn't yet. blk_drain() would
then have to wait for those requests, but blk_root_drained_poll()
wouldn't because these requests don't affect the root node any more.

> Different story is test_detach_by_driver_cb(): in this case,
> detach_by_parent_aio_cb() does not call detach_indirect_bh(),
> but it is instead called as a bh running in the main loop by
> detach_by_driver_cb_drained_begin(), the callback for
> .drained_begin().
> 
> This test was added in 231281ab42 and part of the series
> "Drain fixes and cleanups, part 3"
> https://lists.nongnu.org/archive/html/qemu-block/2018-05/msg01132.html
> but as explained above I believe that it is not valid anymore, and
> can be discarded.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 

I feel throwing tests away just because they don't pass any more is a
bit too simple. :-)

K

[PATCH v7 31/31] job.h: assertions in the callers of JobDriver function pointers

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 job.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/job.c b/job.c
index 54db80df66..075c6f3a20 100644
--- a/job.c
+++ b/job.c
@@ -381,6 +381,8 @@ void job_ref(Job *job)
 
 void job_unref(Job *job)
 {
+GLOBAL_STATE_CODE();
+
 if (--job->refcnt == 0) {
 assert(job->status == JOB_STATUS_NULL);
 assert(!timer_pending(&job->sleep_timer));
@@ -602,6 +604,7 @@ bool job_user_paused(Job *job)
 void job_user_resume(Job *job, Error **errp)
 {
 assert(job);
+GLOBAL_STATE_CODE();
 if (!job->user_paused || job->pause_count <= 0) {
 error_setg(errp, "Can't resume a job that was not paused");
 return;
@@ -672,6 +675,7 @@ static void job_update_rc(Job *job)
 static void job_commit(Job *job)
 {
 assert(!job->ret);
+GLOBAL_STATE_CODE();
 if (job->driver->commit) {
 job->driver->commit(job);
 }
@@ -680,6 +684,7 @@ static void job_commit(Job *job)
 static void job_abort(Job *job)
 {
 assert(job->ret);
+GLOBAL_STATE_CODE();
 if (job->driver->abort) {
 job->driver->abort(job);
 }
@@ -687,6 +692,7 @@ static void job_abort(Job *job)
 
 static void job_clean(Job *job)
 {
+GLOBAL_STATE_CODE();
 if (job->driver->clean) {
 job->driver->clean(job);
 }
@@ -726,6 +732,7 @@ static int job_finalize_single(Job *job)
 
 static void job_cancel_async(Job *job, bool force)
 {
+GLOBAL_STATE_CODE();
 if (job->driver->cancel) {
 force = job->driver->cancel(job, force);
 } else {
@@ -825,6 +832,7 @@ static void job_completed_txn_abort(Job *job)
 
 static int job_prepare(Job *job)
 {
+GLOBAL_STATE_CODE();
 if (job->ret == 0 && job->driver->prepare) {
 job->ret = job->driver->prepare(job);
 job_update_rc(job);
@@ -952,6 +960,7 @@ static void coroutine_fn job_co_entry(void *opaque)
 Job *job = opaque;
 
 assert(job && job->driver && job->driver->run);
+assert(job->aio_context == qemu_get_current_aio_context());
 job_pause_point(job);
 job->ret = job->driver->run(job, &job->err);
 job->deferred_to_main_loop = true;
@@ -1054,6 +1063,7 @@ void job_complete(Job *job, Error **errp)
 {
 /* Should not be reachable via external interface for internal jobs */
 assert(job->id);
+GLOBAL_STATE_CODE();
 if (job_apply_verb(job, JOB_VERB_COMPLETE, errp)) {
 return;
 }
-- 
2.31.1

[PATCH v7 22/31] include/block/snapshot: global state API + assertions

2022-02-11 Thread Emanuele Giuseppe Esposito

Snapshots run also under the BQL, so they all are
in the global state API. The aiocontext lock that they hold
is currently an overkill and in future could be removed.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/snapshot.c | 28 
 include/block/snapshot.h | 13 +++--
 migration/savevm.c   |  2 ++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/block/snapshot.c b/block/snapshot.c
index ccacda8bd5..d6f53c3065 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -57,6 +57,8 @@ int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo 
*sn_info,
 QEMUSnapshotInfo *sn_tab, *sn;
 int nb_sns, i, ret;
 
+GLOBAL_STATE_CODE();
+
 ret = -ENOENT;
 nb_sns = bdrv_snapshot_list(bs, &sn_tab);
 if (nb_sns < 0) {
@@ -105,6 +107,7 @@ bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs,
 bool ret = false;
 
 assert(id || name);
+GLOBAL_STATE_CODE();
 
 nb_sns = bdrv_snapshot_list(bs, &sn_tab);
 if (nb_sns < 0) {
@@ -200,6 +203,7 @@ static BlockDriverState 
*bdrv_snapshot_fallback(BlockDriverState *bs)
 int bdrv_can_snapshot(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
+GLOBAL_STATE_CODE();
 if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
 return 0;
 }
@@ -220,6 +224,9 @@ int bdrv_snapshot_create(BlockDriverState *bs,
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
+
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 return -ENOMEDIUM;
 }
@@ -240,6 +247,8 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
 BdrvChild **fallback_ptr;
 int ret, open_ret;
 
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 error_setg(errp, "Block driver is closed");
 return -ENOMEDIUM;
@@ -348,6 +357,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs,
 BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
 int ret;
 
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
 return -ENOMEDIUM;
@@ -380,6 +391,8 @@ int bdrv_snapshot_list(BlockDriverState *bs,
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
+
+GLOBAL_STATE_CODE();
 if (!drv) {
 return -ENOMEDIUM;
 }
@@ -419,6 +432,8 @@ int bdrv_snapshot_load_tmp(BlockDriverState *bs,
 {
 BlockDriver *drv = bs->drv;
 
+GLOBAL_STATE_CODE();
+
 if (!drv) {
 error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs));
 return -ENOMEDIUM;
@@ -447,6 +462,8 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState 
*bs,
 int ret;
 Error *local_err = NULL;
 
+GLOBAL_STATE_CODE();
+
 ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err);
 if (ret == -ENOENT || ret == -EINVAL) {
 error_free(local_err);
@@ -515,6 +532,8 @@ bool bdrv_all_can_snapshot(bool has_devices, strList 
*devices,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return false;
 }
@@ -549,6 +568,8 @@ int bdrv_all_delete_snapshot(const char *name,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
 }
@@ -588,6 +609,8 @@ int bdrv_all_goto_snapshot(const char *name,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
 }
@@ -622,6 +645,8 @@ int bdrv_all_has_snapshot(const char *name,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
 }
@@ -663,6 +688,7 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
 {
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
+GLOBAL_STATE_CODE();
 
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return -1;
@@ -703,6 +729,8 @@ BlockDriverState *bdrv_all_find_vmstate_bs(const char 
*vmstate_bs,
 g_autoptr(GList) bdrvs = NULL;
 GList *iterbdrvs;
 
+GLOBAL_STATE_CODE();
+
 if (bdrv_all_get_snapshot_devices(has_devices, devices, &bdrvs, errp) < 0) 
{
 return NULL;
 }
diff --git a/include/block/snapshot.h b/include/block/snapshot.h
index 940345692f..50ff924710 100644
--- a/include/block/snapshot.h
+++ b/include/block/snapshot.h
@@ -45,6 +45,13 @@ typedef struct QEMUSnapshotInfo {
 uint64_t icount; /* record/replay step */
 } QEMUSnapshotInfo;
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ *

[PATCH v7 27/31] block_int-common.h: split function pointers in BdrvChildClass

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/block_int-common.h | 81 ++--
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index f05ebb0da3..5a04c778e4 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -830,19 +830,16 @@ struct BdrvChildClass {
  */
 bool parent_is_bds;
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
 void (*inherit_options)(BdrvChildRole role, bool parent_is_format,
 int *child_flags, QDict *child_options,
 int parent_flags, QDict *parent_options);
-
 void (*change_media)(BdrvChild *child, bool load);
-void (*resize)(BdrvChild *child);
-
-/*
- * Returns a name that is supposedly more useful for human users than the
- * node name for identifying the node in question (in particular, a BB
- * name), or NULL if the parent can't provide a better name.
- */
-const char *(*get_name)(BdrvChild *child);
 
 /*
  * Returns a malloced string that describes the parent of the child for a
@@ -852,6 +849,47 @@ struct BdrvChildClass {
  */
 char *(*get_parent_desc)(BdrvChild *child);
 
+/*
+ * Notifies the parent that the child has been activated/inactivated (e.g.
+ * when migration is completing) and it can start/stop requesting
+ * permissions and doing I/O on it.
+ */
+void (*activate)(BdrvChild *child, Error **errp);
+int (*inactivate)(BdrvChild *child);
+
+void (*attach)(BdrvChild *child);
+void (*detach)(BdrvChild *child);
+
+/*
+ * Notifies the parent that the filename of its child has changed (e.g.
+ * because the direct child was removed from the backing chain), so that it
+ * can update its reference.
+ */
+int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
+   const char *filename, Error **errp);
+
+bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx,
+GSList **ignore, Error **errp);
+void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore);
+
+AioContext *(*get_parent_aio_context)(BdrvChild *child);
+
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
+void (*resize)(BdrvChild *child);
+
+/*
+ * Returns a name that is supposedly more useful for human users than the
+ * node name for identifying the node in question (in particular, a BB
+ * name), or NULL if the parent can't provide a better name.
+ */
+const char *(*get_name)(BdrvChild *child);
+
 /*
  * If this pair of functions is implemented, the parent doesn't issue new
  * requests after returning from .drained_begin() until .drained_end() is
@@ -876,31 +914,6 @@ struct BdrvChildClass {
  * activity on the child has stopped.
  */
 bool (*drained_poll)(BdrvChild *child);
-
-/*
- * Notifies the parent that the child has been activated/inactivated (e.g.
- * when migration is completing) and it can start/stop requesting
- * permissions and doing I/O on it.
- */
-void (*activate)(BdrvChild *child, Error **errp);
-int (*inactivate)(BdrvChild *child);
-
-void (*attach)(BdrvChild *child);
-void (*detach)(BdrvChild *child);
-
-/*
- * Notifies the parent that the filename of its child has changed (e.g.
- * because the direct child was removed from the backing chain), so that it
- * can update its reference.
- */
-int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
-   const char *filename, Error **errp);
-
-bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx,
-GSList **ignore, Error **errp);
-void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore);
-
-AioContext *(*get_parent_aio_context)(BdrvChild *child);
 };
 
 extern const BdrvChildClass child_of_bds;
-- 
2.31.1

[PATCH v7 30/31] job.h: split function pointers in JobDriver

2022-02-11 Thread Emanuele Giuseppe Esposito

The job API will be handled separately in another serie.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/job.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 6e67b6977f..c105b31076 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -169,6 +169,12 @@ typedef struct Job {
  * Callbacks and other information about a Job driver.
  */
 struct JobDriver {
+
+/*
+ * These fields are initialized when this object is created,
+ * and are never changed afterwards
+ */
+
 /** Derived Job struct size */
 size_t instance_size;
 
@@ -184,9 +190,18 @@ struct JobDriver {
  * aborted. If it returns zero, the job moves into the WAITING state. If it
  * is the last job to complete in its transaction, all jobs in the
  * transaction move from WAITING to PENDING.
+ *
+ * This callback must be run in the job's context.
  */
 int coroutine_fn (*run)(Job *job, Error **errp);
 
+/*
+ * Functions run without regard to the BQL that may run in any
+ * arbitrary thread. These functions do not need to be thread-safe
+ * because the caller ensures that they are invoked from one
+ * thread at time.
+ */
+
 /**
  * If the callback is not NULL, it will be invoked when the job transitions
  * into the paused state.  Paused jobs must not perform any asynchronous
@@ -201,6 +216,13 @@ struct JobDriver {
  */
 void coroutine_fn (*resume)(Job *job);
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /**
  * Called when the job is resumed by the user (i.e. user_paused becomes
  * false). .user_resume is called before .resume.
-- 
2.31.1

[PATCH v7 29/31] block-backend-common.h: split function pointers in BlockDevOps

2022-02-11 Thread Emanuele Giuseppe Esposito

Assertions in the callers of the function pointrs are already
added by previous patches.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Philippe Mathieu-Daudé 
---
 include/sysemu/block-backend-common.h | 28 ++-
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/sysemu/block-backend-common.h 
b/include/sysemu/block-backend-common.h
index 6963bbf45a..2391679c56 100644
--- a/include/sysemu/block-backend-common.h
+++ b/include/sysemu/block-backend-common.h
@@ -27,6 +27,14 @@
 
 /* Callbacks for block device models */
 typedef struct BlockDevOps {
+
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /*
  * Runs when virtual media changed (monitor commands eject, change)
  * Argument load is true on load and false on eject.
@@ -44,16 +52,26 @@ typedef struct BlockDevOps {
  * true, even if they do not support eject requests.
  */
 void (*eject_request_cb)(void *opaque, bool force);
-/*
- * Is the virtual tray open?
- * Device models implement this only when the device has a tray.
- */
-bool (*is_tray_open)(void *opaque);
+
 /*
  * Is the virtual medium locked into the device?
  * Device models implement this only when device has such a lock.
  */
 bool (*is_medium_locked)(void *opaque);
+
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
+/*
+ * Is the virtual tray open?
+ * Device models implement this only when the device has a tray.
+ */
+bool (*is_tray_open)(void *opaque);
+
 /*
  * Runs when the size changed (e.g. monitor command block_resize)
  */
-- 
2.31.1

[PATCH v7 26/31] block_int-common.h: assertions in the callers of BlockDriver function pointers

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c| 18 ++
 block/create.c |  2 ++
 2 files changed, 20 insertions(+)

diff --git a/block.c b/block.c
index 7cacb5a1a7..d1f5cd2b39 100644
--- a/block.c
+++ b/block.c
@@ -529,6 +529,7 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque)
 
 CreateCo *cco = opaque;
 assert(cco->drv);
+GLOBAL_STATE_CODE();
 
 ret = cco->drv->bdrv_co_create_opts(cco->drv,
 cco->filename, cco->opts, &local_err);
@@ -1096,6 +1097,7 @@ int refresh_total_sectors(BlockDriverState *bs, int64_t 
hint)
 static void bdrv_join_options(BlockDriverState *bs, QDict *options,
   QDict *old_options)
 {
+GLOBAL_STATE_CODE();
 if (bs->drv && bs->drv->bdrv_join_options) {
 bs->drv->bdrv_join_options(options, old_options);
 } else {
@@ -1605,6 +1607,7 @@ static int bdrv_open_driver(BlockDriverState *bs, 
BlockDriver *drv,
 {
 Error *local_err = NULL;
 int i, ret;
+GLOBAL_STATE_CODE();
 
 bdrv_assign_node_name(bs, node_name, &local_err);
 if (local_err) {
@@ -1996,6 +1999,8 @@ static int bdrv_fill_options(QDict **options, const char 
*filename,
 BlockDriver *drv = NULL;
 Error *local_err = NULL;
 
+GLOBAL_STATE_CODE();
+
 /*
  * Caution: while qdict_get_try_str() is fine, getting non-string
  * types would require more care.  When @options come from
@@ -2192,6 +2197,7 @@ static void bdrv_child_perm(BlockDriverState *bs, 
BlockDriverState *child_bs,
 uint64_t *nperm, uint64_t *nshared)
 {
 assert(bs->drv && bs->drv->bdrv_child_perm);
+GLOBAL_STATE_CODE();
 bs->drv->bdrv_child_perm(bs, c, role, reopen_queue,
  parent_perm, parent_shared,
  nperm, nshared);
@@ -2280,6 +2286,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
 {
 BlockDriverState *bs = opaque;
 uint64_t cumulative_perms, cumulative_shared_perms;
+GLOBAL_STATE_CODE();
 
 if (bs->drv->bdrv_set_perm) {
 bdrv_get_cumulative_perm(bs, &cumulative_perms,
@@ -2291,6 +2298,7 @@ static void bdrv_drv_set_perm_commit(void *opaque)
 static void bdrv_drv_set_perm_abort(void *opaque)
 {
 BlockDriverState *bs = opaque;
+GLOBAL_STATE_CODE();
 
 if (bs->drv->bdrv_abort_perm_update) {
 bs->drv->bdrv_abort_perm_update(bs);
@@ -2306,6 +2314,7 @@ static int bdrv_drv_set_perm(BlockDriverState *bs, 
uint64_t perm,
  uint64_t shared_perm, Transaction *tran,
  Error **errp)
 {
+GLOBAL_STATE_CODE();
 if (!bs->drv) {
 return 0;
 }
@@ -4372,6 +4381,7 @@ int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, 
Error **errp)
 
 assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 assert(bs_queue != NULL);
+GLOBAL_STATE_CODE();
 
 QTAILQ_FOREACH(bs_entry, bs_queue, entry) {
 ctx = bdrv_get_aio_context(bs_entry->state.bs);
@@ -4637,6 +4647,7 @@ static int bdrv_reopen_prepare(BDRVReopenState 
*reopen_state,
 
 assert(reopen_state != NULL);
 assert(reopen_state->bs->drv != NULL);
+GLOBAL_STATE_CODE();
 drv = reopen_state->bs->drv;
 
 /* This function and each driver's bdrv_reopen_prepare() remove
@@ -4847,6 +4858,7 @@ static void bdrv_reopen_commit(BDRVReopenState 
*reopen_state)
 bs = reopen_state->bs;
 drv = bs->drv;
 assert(drv != NULL);
+GLOBAL_STATE_CODE();
 
 /* If there are any driver level actions to take */
 if (drv->bdrv_reopen_commit) {
@@ -4888,6 +4900,7 @@ static void bdrv_reopen_abort(BDRVReopenState 
*reopen_state)
 assert(reopen_state != NULL);
 drv = reopen_state->bs->drv;
 assert(drv != NULL);
+GLOBAL_STATE_CODE();
 
 if (drv->bdrv_reopen_abort) {
 drv->bdrv_reopen_abort(reopen_state);
@@ -4901,6 +4914,7 @@ static void bdrv_close(BlockDriverState *bs)
 BdrvChild *child, *next;
 
 assert(!bs->refcnt);
+GLOBAL_STATE_CODE();
 
 bdrv_drained_begin(bs); /* complete I/O */
 bdrv_flush(bs);
@@ -6722,6 +6736,8 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs)
 int ret;
 uint64_t cumulative_perms, cumulative_shared_perms;
 
+GLOBAL_STATE_CODE();
+
 if (!bs->drv) {
 return -ENOMEDIUM;
 }
@@ -7236,6 +7252,7 @@ static void bdrv_detach_aio_context(BlockDriverState *bs)
 BdrvAioNotifier *baf, *baf_tmp;
 
 assert(!bs->walking_aio_notifiers);
+GLOBAL_STATE_CODE();
 bs->walking_aio_notifiers = true;
 QLIST_FOREACH_SAFE(baf, &bs->aio_notifiers, list, baf_tmp) {
 if (baf->deleted) {
@@ -7263,6 +7280,7 @@ static void bdrv_attach_aio_context(BlockDriverState *bs,
 AioContext *new_context)
 {
 BdrvAioNotifier *ban, *ban_tmp;
+GLOBAL_STATE_CODE();
 
 if (bs->quiesce_counter) {
 aio_disable_external(new_context);
diff --git a/block/cre

[PATCH v7 25/31] block_int-common.h: split function pointers in BlockDriver

2022-02-11 Thread Emanuele Giuseppe Esposito

Similar to the header split, also the function pointers in BlockDriver
can be split in I/O and global state.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/block_int-common.h | 445 ---
 1 file changed, 237 insertions(+), 208 deletions(-)

diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index b92e3630fd..f05ebb0da3 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -96,6 +96,11 @@ typedef struct BdrvTrackedRequest {
 
 
 struct BlockDriver {
+/*
+ * These fields are initialized when this object is created,
+ * and are never changed afterwards.
+ */
+
 const char *format_name;
 int instance_size;
 
@@ -122,6 +127,69 @@ struct BlockDriver {
  */
 bool is_format;
 
+/*
+ * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
+ * this field set to true, except ones that are defined only by their
+ * child's bs.
+ * An example of the last type will be the quorum block driver.
+ */
+bool bdrv_needs_filename;
+
+/*
+ * Set if a driver can support backing files. This also implies the
+ * following semantics:
+ *
+ *  - Return status 0 of .bdrv_co_block_status means that corresponding
+ *blocks are not allocated in this layer of backing-chain
+ *  - For such (unallocated) blocks, read will:
+ *- fill buffer with zeros if there is no backing file
+ *- read from the backing file otherwise, where the block layer
+ *  takes care of reading zeros beyond EOF if backing file is short
+ */
+bool supports_backing;
+
+bool has_variable_length;
+
+/*
+ * Drivers setting this field must be able to work with just a plain
+ * filename with ':' as a prefix, and no other options.
+ * Options may be extracted from the filename by implementing
+ * bdrv_parse_filename.
+ */
+const char *protocol_name;
+
+/* List of options for creating images, terminated by name == NULL */
+QemuOptsList *create_opts;
+
+/* List of options for image amend */
+QemuOptsList *amend_opts;
+
+/*
+ * If this driver supports reopening images this contains a
+ * NULL-terminated list of the runtime options that can be
+ * modified. If an option in this list is unspecified during
+ * reopen then it _must_ be reset to its default value or return
+ * an error.
+ */
+const char *const *mutable_opts;
+
+/*
+ * Pointer to a NULL-terminated array of names of strong options
+ * that can be specified for bdrv_open(). A strong option is one
+ * that changes the data of a BDS.
+ * If this pointer is NULL, the array is considered empty.
+ * "filename" and "driver" are always considered strong.
+ */
+const char *const *strong_runtime_opts;
+
+
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /*
  * This function is invoked under BQL before .bdrv_co_amend()
  * (which in contrast does not necessarily run under the BQL)
@@ -143,7 +211,6 @@ struct BlockDriver {
 bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
  BlockDriverState *to_replace);
 
-int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
 int (*bdrv_probe_device)(const char *filename);
 
 /*
@@ -152,28 +219,8 @@ struct BlockDriver {
  */
 void (*bdrv_parse_filename)(const char *filename, QDict *options,
 Error **errp);
-/*
- * Drivers not implementing bdrv_parse_filename nor bdrv_open should have
- * this field set to true, except ones that are defined only by their
- * child's bs.
- * An example of the last type will be the quorum block driver.
- */
-bool bdrv_needs_filename;
-
-/*
- * Set if a driver can support backing files. This also implies the
- * following semantics:
- *
- *  - Return status 0 of .bdrv_co_block_status means that corresponding
- *blocks are not allocated in this layer of backing-chain
- *  - For such (unallocated) blocks, read will:
- *- fill buffer with zeros if there is no backing file
- *- read from the backing file otherwise, where the block layer
- *  takes care of reading zeros beyond EOF if backing file is short
- */
-bool supports_backing;
 
-/* For handling image reopen for split or non-split files */
+/* For handling image reopen for split or non-split files. */
 int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
BlockReopenQueue *queue, Error **errp);
 void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
@@ -189,7 +236,6 @@ struct BlockDriver {
   Error **errp);
 void (*bdrv_clo

[PATCH v7 23/31] block/copy-before-write.h: global state API + assertions

2022-02-11 Thread Emanuele Giuseppe Esposito

copy-before-write functions always run under BQL.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/copy-before-write.c | 2 ++
 block/copy-before-write.h | 7 +++
 2 files changed, 9 insertions(+)

diff --git a/block/copy-before-write.c b/block/copy-before-write.c
index c30a5ff8de..80b7684dba 100644
--- a/block/copy-before-write.c
+++ b/block/copy-before-write.c
@@ -223,6 +223,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
 QDict *opts;
 
 assert(source->total_sectors == target->total_sectors);
+GLOBAL_STATE_CODE();
 
 opts = qdict_new();
 qdict_put_str(opts, "driver", "copy-before-write");
@@ -245,6 +246,7 @@ BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
 
 void bdrv_cbw_drop(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 bdrv_drop_filter(bs, &error_abort);
 bdrv_unref(bs);
 }
diff --git a/block/copy-before-write.h b/block/copy-before-write.h
index 51847e711a..6e72bb25e9 100644
--- a/block/copy-before-write.h
+++ b/block/copy-before-write.h
@@ -29,6 +29,13 @@
 #include "block/block_int.h"
 #include "block/block-copy.h"
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
   BlockDriverState *target,
   const char *filter_node_name,
-- 
2.31.1

[PATCH v7 24/31] block/coroutines: I/O and "I/O or GS" API

2022-02-11 Thread Emanuele Giuseppe Esposito

block coroutines functions run in different aiocontext, and are
not protected by the BQL. Therefore are I/O.

On the other side, generated_co_wrapper functions use BDRV_POLL_WHILE,
meaning the caller can either be the main loop or a specific iothread.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c   |  2 ++
 block/block-backend.c |  6 
 block/coroutines.h| 81 +++
 block/io.c|  3 ++
 block/nbd.c   |  1 +
 5 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/block.c b/block.c
index 29c88e8727..7cacb5a1a7 100644
--- a/block.c
+++ b/block.c
@@ -5453,6 +5453,7 @@ fail:
 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
BdrvCheckResult *res, BdrvCheckMode fix)
 {
+IO_CODE();
 if (bs->drv == NULL) {
 return -ENOMEDIUM;
 }
@@ -6662,6 +6663,7 @@ int bdrv_activate(BlockDriverState *bs, Error **errp)
 int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp)
 {
 Error *local_err = NULL;
+IO_CODE();
 
 assert(!(bs->open_flags & BDRV_O_INACTIVE));
 
diff --git a/block/block-backend.c b/block/block-backend.c
index d8a77a5832..d901e0e1d4 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1290,6 +1290,7 @@ blk_co_do_preadv(BlockBackend *blk, int64_t offset, 
int64_t bytes,
 {
 int ret;
 BlockDriverState *bs;
+IO_CODE();
 
 blk_wait_while_drained(blk);
 
@@ -1337,6 +1338,7 @@ blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, 
int64_t bytes,
 {
 int ret;
 BlockDriverState *bs;
+IO_CODE();
 
 blk_wait_while_drained(blk);
 
@@ -1656,6 +1658,8 @@ void blk_aio_cancel_async(BlockAIOCB *acb)
 int coroutine_fn
 blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
 {
+IO_CODE();
+
 blk_wait_while_drained(blk);
 
 if (!blk_is_available(blk)) {
@@ -1699,6 +1703,7 @@ int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes)
 {
 int ret;
+IO_CODE();
 
 blk_wait_while_drained(blk);
 
@@ -1757,6 +1762,7 @@ int blk_pdiscard(BlockBackend *blk, int64_t offset, 
int64_t bytes)
 int coroutine_fn blk_co_do_flush(BlockBackend *blk)
 {
 blk_wait_while_drained(blk);
+IO_CODE();
 
 if (!blk_is_available(blk)) {
 return -ENOMEDIUM;
diff --git a/block/coroutines.h b/block/coroutines.h
index c8c14a29c8..b293e943c8 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -30,17 +30,17 @@
 /* For blk_bs() in generated block/block-gen.c */
 #include "sysemu/block-backend.h"
 
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
 int coroutine_fn bdrv_co_check(BlockDriverState *bs,
BdrvCheckResult *res, BdrvCheckMode fix);
 int coroutine_fn bdrv_co_invalidate_cache(BlockDriverState *bs, Error **errp);
 
-int generated_co_wrapper
-bdrv_preadv(BdrvChild *child, int64_t offset, unsigned int bytes,
-QEMUIOVector *qiov, BdrvRequestFlags flags);
-int generated_co_wrapper
-bdrv_pwritev(BdrvChild *child, int64_t offset, unsigned int bytes,
- QEMUIOVector *qiov, BdrvRequestFlags flags);
-
 int coroutine_fn
 bdrv_co_common_block_status_above(BlockDriverState *bs,
   BlockDriverState *base,
@@ -52,6 +52,51 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,
   int64_t *map,
   BlockDriverState **file,
   int *depth);
+
+int coroutine_fn bdrv_co_readv_vmstate(BlockDriverState *bs,
+   QEMUIOVector *qiov, int64_t pos);
+int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
+QEMUIOVector *qiov, int64_t pos);
+
+int coroutine_fn
+nbd_co_do_establish_connection(BlockDriverState *bs, Error **errp);
+
+
+int coroutine_fn
+blk_co_do_preadv(BlockBackend *blk, int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, BdrvRequestFlags flags);
+
+
+int coroutine_fn
+blk_co_do_pwritev_part(BlockBackend *blk, int64_t offset, int64_t bytes,
+   QEMUIOVector *qiov, size_t qiov_offset,
+   BdrvRequestFlags flags);
+
+int coroutine_fn
+blk_co_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
+
+int coroutine_fn
+blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
+
+int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+
+
+/*
+ * "I/O or GS" API functions. These functions can run without
+ * the BQL, but only in one specific iothread/main loop.
+ *
+ * See include/block/block-io.h for more information about
+ * the "I/O or GS" API.
+ */
+
+int generated_co_wrapper
+bdrv_preadv(BdrvChild *child, int64_t offset, unsigned int bytes,
+QEMUIOVector *qiov, BdrvRequestFlags flags);
+
+int generated_co_wr

[PATCH v7 21/31] assertions for blockdev.h global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c |  3 +++
 blockdev.c| 16 
 2 files changed, 19 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index e5708702db..d8a77a5832 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -810,6 +810,7 @@ bool bdrv_is_root_node(BlockDriverState *bs)
  */
 DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk->legacy_dinfo;
 }
 
@@ -821,6 +822,7 @@ DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
 {
 assert(!blk->legacy_dinfo);
+GLOBAL_STATE_CODE();
 return blk->legacy_dinfo = dinfo;
 }
 
@@ -831,6 +833,7 @@ DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, 
DriveInfo *dinfo)
 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 {
 BlockBackend *blk = NULL;
+GLOBAL_STATE_CODE();
 
 while ((blk = blk_next(blk)) != NULL) {
 if (blk->legacy_dinfo == dinfo) {
diff --git a/blockdev.c b/blockdev.c
index 9e638f1982..ba20f5f1ab 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -113,6 +113,8 @@ void override_max_devs(BlockInterfaceType type, int 
max_devs)
 BlockBackend *blk;
 DriveInfo *dinfo;
 
+GLOBAL_STATE_CODE();
+
 if (max_devs <= 0) {
 return;
 }
@@ -142,6 +144,8 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 DriveInfo *dinfo = blk_legacy_dinfo(blk);
 BlockJob *job;
 
+GLOBAL_STATE_CODE();
+
 if (!dinfo) {
 return;
 }
@@ -163,6 +167,7 @@ void blockdev_mark_auto_del(BlockBackend *blk)
 void blockdev_auto_del(BlockBackend *blk)
 {
 DriveInfo *dinfo = blk_legacy_dinfo(blk);
+GLOBAL_STATE_CODE();
 
 if (dinfo && dinfo->auto_del) {
 monitor_remove_blk(blk);
@@ -187,6 +192,8 @@ QemuOpts *drive_add(BlockInterfaceType type, int index, 
const char *file,
 {
 QemuOpts *opts;
 
+GLOBAL_STATE_CODE();
+
 opts = qemu_opts_parse_noisily(qemu_find_opts("drive"), optstr, false);
 if (!opts) {
 return NULL;
@@ -207,6 +214,8 @@ DriveInfo *drive_get(BlockInterfaceType type, int bus, int 
unit)
 BlockBackend *blk;
 DriveInfo *dinfo;
 
+GLOBAL_STATE_CODE();
+
 for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 dinfo = blk_legacy_dinfo(blk);
 if (dinfo && dinfo->type == type
@@ -229,6 +238,8 @@ void drive_check_orphaned(void)
 Location loc;
 bool orphans = false;
 
+GLOBAL_STATE_CODE();
+
 for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 dinfo = blk_legacy_dinfo(blk);
 /*
@@ -262,6 +273,7 @@ void drive_check_orphaned(void)
 
 DriveInfo *drive_get_by_index(BlockInterfaceType type, int index)
 {
+GLOBAL_STATE_CODE();
 return drive_get(type,
  drive_index_to_bus_id(type, index),
  drive_index_to_unit_id(type, index));
@@ -273,6 +285,8 @@ int drive_get_max_bus(BlockInterfaceType type)
 BlockBackend *blk;
 DriveInfo *dinfo;
 
+GLOBAL_STATE_CODE();
+
 max_bus = -1;
 for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 dinfo = blk_legacy_dinfo(blk);
@@ -759,6 +773,8 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType 
block_default_type,
 const char *filename;
 int i;
 
+GLOBAL_STATE_CODE();
+
 /* Change legacy command line options into QMP ones */
 static const struct {
 const char *from;
-- 
2.31.1

[PATCH v7 13/31] IO_CODE and IO_OR_GS_CODE for block_int I/O API

2022-02-11 Thread Emanuele Giuseppe Esposito

Mark all I/O functions with IO_CODE, and all "I/O OR GS" with
IO_OR_GS_CODE.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c  | 14 +-
 block/block-backend.c|  2 ++
 block/dirty-bitmap.c |  3 +++
 block/io.c   | 13 +
 include/block/block_int-io.h |  6 ++
 5 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 1fee5e41e8..496d2989d7 100644
--- a/block.c
+++ b/block.c
@@ -999,6 +999,7 @@ BlockDriver *bdrv_probe_all(const uint8_t *buf, int 
buf_size,
 {
 int score_max = 0, score;
 BlockDriver *drv = NULL, *d;
+IO_CODE();
 
 QLIST_FOREACH(d, &bdrv_drivers, list) {
 if (d->bdrv_probe) {
@@ -1051,6 +1052,7 @@ static int find_image_format(BlockBackend *file, const 
char *filename,
 int refresh_total_sectors(BlockDriverState *bs, int64_t hint)
 {
 BlockDriver *drv = bs->drv;
+IO_CODE();
 
 if (!drv) {
 return -ENOMEDIUM;
@@ -6195,6 +6197,7 @@ const char *bdrv_get_parent_name(const BlockDriverState 
*bs)
 {
 BdrvChild *c;
 const char *name;
+IO_CODE();
 
 /* If multiple parents have a name, just pick the first one. */
 QLIST_FOREACH(c, &bs->parents, next_parent) {
@@ -7931,6 +7934,8 @@ int bdrv_make_empty(BdrvChild *c, Error **errp)
  */
 BdrvChild *bdrv_cow_child(BlockDriverState *bs)
 {
+IO_CODE();
+
 if (!bs || !bs->drv) {
 return NULL;
 }
@@ -7954,6 +7959,7 @@ BdrvChild *bdrv_cow_child(BlockDriverState *bs)
 BdrvChild *bdrv_filter_child(BlockDriverState *bs)
 {
 BdrvChild *c;
+IO_CODE();
 
 if (!bs || !bs->drv) {
 return NULL;
@@ -7985,6 +7991,7 @@ BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
 {
 BdrvChild *cow_child = bdrv_cow_child(bs);
 BdrvChild *filter_child = bdrv_filter_child(bs);
+IO_CODE();
 
 /* Filter nodes cannot have COW backing files */
 assert(!(cow_child && filter_child));
@@ -8005,6 +8012,7 @@ BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs)
 BdrvChild *bdrv_primary_child(BlockDriverState *bs)
 {
 BdrvChild *c, *found = NULL;
+IO_CODE();
 
 QLIST_FOREACH(c, &bs->children, next) {
 if (c->role & BDRV_CHILD_PRIMARY) {
@@ -8067,6 +8075,7 @@ BlockDriverState 
*bdrv_skip_implicit_filters(BlockDriverState *bs)
  */
 BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_do_skip_filters(bs, false);
 }
 
@@ -8076,6 +8085,7 @@ BlockDriverState *bdrv_skip_filters(BlockDriverState *bs)
  */
 BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_skip_filters(bdrv_cow_bs(bdrv_skip_filters(bs)));
 }
 
@@ -8111,8 +8121,8 @@ static bool 
bdrv_bsc_range_overlaps_locked(BlockDriverState *bs,
  */
 bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum)
 {
+IO_CODE();
 RCU_READ_LOCK_GUARD();
-
 return bdrv_bsc_range_overlaps_locked(bs, offset, 1, pnum);
 }
 
@@ -8122,6 +8132,7 @@ bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t 
offset, int64_t *pnum)
 void bdrv_bsc_invalidate_range(BlockDriverState *bs,
int64_t offset, int64_t bytes)
 {
+IO_CODE();
 RCU_READ_LOCK_GUARD();
 
 if (bdrv_bsc_range_overlaps_locked(bs, offset, bytes, NULL)) {
@@ -8136,6 +8147,7 @@ void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, 
int64_t bytes)
 {
 BdrvBlockStatusCache *new_bsc = g_new(BdrvBlockStatusCache, 1);
 BdrvBlockStatusCache *old_bsc;
+IO_CODE();
 
 *new_bsc = (BdrvBlockStatusCache) {
 .valid = true,
diff --git a/block/block-backend.c b/block/block-backend.c
index 292481069f..d3dc13809f 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1115,6 +1115,7 @@ bool blk_dev_has_removable_media(BlockBackend *blk)
  */
 bool blk_dev_has_tray(BlockBackend *blk)
 {
+IO_CODE();
 return blk->dev_ops && blk->dev_ops->is_tray_open;
 }
 
@@ -1135,6 +1136,7 @@ void blk_dev_eject_request(BlockBackend *blk, bool force)
  */
 bool blk_dev_is_tray_open(BlockBackend *blk)
 {
+IO_CODE();
 if (blk_dev_has_tray(blk)) {
 return blk->dev_ops->is_tray_open(blk->dev_opaque);
 }
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index e2a1648deb..0334b85805 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -657,6 +657,7 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
 
 void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
 {
+IO_CODE();
 assert(!bdrv_dirty_bitmap_readonly(bitmap));
 bdrv_dirty_bitmaps_lock(bitmap->bs);
 if (!out) {
@@ -739,6 +740,7 @@ void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap 
*bitmap)
 void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes)
 {
 BdrvDirtyBitmap *bitmap;
+IO_CODE();
 
 if (QLIST_EMPTY(&bs->dirty_bitmaps)) {
 return;
@@ -930,6 +932,7 @@ bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,

[PATCH v7 28/31] block_int-common.h: assertions in the callers of BdrvChildClass function pointers

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index d1f5cd2b39..4297431812 100644
--- a/block.c
+++ b/block.c
@@ -1497,7 +1497,7 @@ const BdrvChildClass child_of_bds = {
 
 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
 {
-IO_CODE();
+GLOBAL_STATE_CODE();
 return c->klass->get_parent_aio_context(c);
 }
 
@@ -2128,6 +2128,7 @@ bool bdrv_is_writable(BlockDriverState *bs)
 
 static char *bdrv_child_user_desc(BdrvChild *c)
 {
+GLOBAL_STATE_CODE();
 return c->klass->get_parent_desc(c);
 }
 
@@ -2844,6 +2845,7 @@ static void bdrv_replace_child_noperm(BdrvChild **childp,
 
 assert(!child->frozen);
 assert(old_bs != new_bs);
+GLOBAL_STATE_CODE();
 
 if (old_bs && new_bs) {
 assert(bdrv_get_aio_context(old_bs) == bdrv_get_aio_context(new_bs));
@@ -2940,6 +2942,7 @@ static void bdrv_attach_child_common_abort(void *opaque)
 BdrvChild *child = *s->child;
 BlockDriverState *bs = child->bs;
 
+GLOBAL_STATE_CODE();
 /*
  * Pass free_empty_child=false, because we still need the child
  * for the AioContext operations on the parent below; those
@@ -3308,6 +3311,7 @@ void bdrv_unref_child(BlockDriverState *parent, BdrvChild 
*child)
 static void bdrv_parent_cb_change_media(BlockDriverState *bs, bool load)
 {
 BdrvChild *c;
+GLOBAL_STATE_CODE();
 QLIST_FOREACH(c, &bs->parents, next_parent) {
 if (c->klass->change_media) {
 c->klass->change_media(c, load);
@@ -3807,6 +3811,7 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 
 assert(!child_class || !flags);
 assert(!child_class == !parent);
+GLOBAL_STATE_CODE();
 
 if (reference) {
 bool options_non_empty = options ? qdict_size(options) : false;
@@ -4193,6 +4198,7 @@ static BlockReopenQueue 
*bdrv_reopen_queue_child(BlockReopenQueue *bs_queue,
  * important to avoid graph changes between the recursive queuing here and
  * bdrv_reopen_multiple(). */
 assert(bs->quiesce_counter > 0);
+GLOBAL_STATE_CODE();
 
 if (bs_queue == NULL) {
 bs_queue = g_new0(BlockReopenQueue, 1);
@@ -7327,6 +7333,7 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
 BdrvChild *child, *parent;
 
 g_assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+GLOBAL_STATE_CODE();
 
 if (old_context == new_context) {
 return;
@@ -7399,6 +7406,7 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
 static bool bdrv_parent_can_set_aio_context(BdrvChild *c, AioContext *ctx,
 GSList **ignore, Error **errp)
 {
+GLOBAL_STATE_CODE();
 if (g_slist_find(*ignore, c)) {
 return true;
 }
-- 
2.31.1

[PATCH v7 05/31] IO_CODE and IO_OR_GS_CODE for block I/O API

2022-02-11 Thread Emanuele Giuseppe Esposito

Mark all I/O functions with IO_CODE, and all "I/O OR GS" with
IO_OR_GS_CODE.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c  | 35 +++-
 block/dirty-bitmap.c |  1 +
 block/io.c   | 43 ++--
 include/block/block-io.h |  1 +
 4 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 038fe9af24..aa04a38457 100644
--- a/block.c
+++ b/block.c
@@ -137,6 +137,7 @@ size_t bdrv_opt_mem_align(BlockDriverState *bs)
 /* page size or 4k (hdd sector size) should be on the safe side */
 return MAX(4096, qemu_real_host_page_size);
 }
+IO_CODE();
 
 return bs->bl.opt_mem_alignment;
 }
@@ -147,6 +148,7 @@ size_t bdrv_min_mem_align(BlockDriverState *bs)
 /* page size or 4k (hdd sector size) should be on the safe side */
 return MAX(4096, qemu_real_host_page_size);
 }
+IO_CODE();
 
 return bs->bl.min_mem_alignment;
 }
@@ -272,6 +274,7 @@ void bdrv_parse_filename_strip_prefix(const char *filename, 
const char *prefix,
  * image is inactivated. */
 bool bdrv_is_read_only(BlockDriverState *bs)
 {
+IO_CODE();
 return !(bs->open_flags & BDRV_O_RDWR);
 }
 
@@ -390,6 +393,7 @@ static char *bdrv_make_absolute_filename(BlockDriverState 
*relative_to,
 
 char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp)
 {
+IO_CODE();
 return bdrv_make_absolute_filename(bs, bs->backing_file, errp);
 }
 
@@ -759,6 +763,7 @@ int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, 
Error **errp)
 Error *local_err = NULL;
 int ret;
 
+IO_CODE();
 assert(bs != NULL);
 
 if (!bs->drv) {
@@ -784,6 +789,7 @@ void coroutine_fn 
bdrv_co_delete_file_noerr(BlockDriverState *bs)
 {
 Error *local_err = NULL;
 int ret;
+IO_CODE();
 
 if (!bs) {
 return;
@@ -1444,6 +1450,7 @@ static int bdrv_child_cb_update_filename(BdrvChild *c, 
BlockDriverState *base,
 AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c)
 {
 BlockDriverState *bs = c->opaque;
+IO_CODE();
 
 return bdrv_get_aio_context(bs);
 }
@@ -1466,6 +1473,7 @@ const BdrvChildClass child_of_bds = {
 
 AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c)
 {
+IO_CODE();
 return c->klass->get_parent_aio_context(c);
 }
 
@@ -2079,6 +2087,7 @@ static bool 
bdrv_is_writable_after_reopen(BlockDriverState *bs,
  */
 bool bdrv_is_writable(BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_is_writable_after_reopen(bs, NULL);
 }
 
@@ -5706,6 +5715,8 @@ static int64_t 
bdrv_sum_allocated_file_size(BlockDriverState *bs)
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
+IO_CODE();
+
 if (!drv) {
 return -ENOMEDIUM;
 }
@@ -5755,6 +5766,7 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState *bs)
 BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
BlockDriverState *in_bs, Error **errp)
 {
+IO_CODE();
 if (!drv->bdrv_measure) {
 error_setg(errp, "Block driver '%s' does not support size measurement",
drv->format_name);
@@ -5770,6 +5782,7 @@ BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts 
*opts,
 int64_t bdrv_nb_sectors(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
+IO_CODE();
 
 if (!drv)
 return -ENOMEDIUM;
@@ -5790,6 +5803,7 @@ int64_t bdrv_nb_sectors(BlockDriverState *bs)
 int64_t bdrv_getlength(BlockDriverState *bs)
 {
 int64_t ret = bdrv_nb_sectors(bs);
+IO_CODE();
 
 if (ret < 0) {
 return ret;
@@ -5804,12 +5818,14 @@ int64_t bdrv_getlength(BlockDriverState *bs)
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
 {
 int64_t nb_sectors = bdrv_nb_sectors(bs);
+IO_CODE();
 
 *nb_sectors_ptr = nb_sectors < 0 ? 0 : nb_sectors;
 }
 
 bool bdrv_is_sg(BlockDriverState *bs)
 {
+IO_CODE();
 return bs->sg;
 }
 
@@ -5819,6 +5835,7 @@ bool bdrv_is_sg(BlockDriverState *bs)
 bool bdrv_supports_compressed_writes(BlockDriverState *bs)
 {
 BlockDriverState *filtered;
+IO_CODE();
 
 if (!bs->drv || !block_driver_can_compress(bs->drv)) {
 return false;
@@ -5838,6 +5855,7 @@ bool bdrv_supports_compressed_writes(BlockDriverState *bs)
 
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
+IO_CODE();
 return bs->drv ? bs->drv->format_name : NULL;
 }
 
@@ -6146,6 +6164,7 @@ BlockDriverState *bdrv_next_all_states(BlockDriverState 
*bs)
 
 const char *bdrv_get_node_name(const BlockDriverState *bs)
 {
+IO_CODE();
 return bs->node_name;
 }
 
@@ -6170,6 +6189,7 @@ const char *bdrv_get_parent_name(const BlockDriverState 
*bs)
 /* TODO check what callers really want: bs->node_name or blk_name() */
 const char *bdrv_get_device_name(const BlockDriverState *bs)
 {
+IO_CODE();
 return bdrv_get_parent_name(bs) ?: "";
 }
 
@@ -6179,6 +6199,7 @@ const c

[PATCH v7 19/31] assertions for blockjob.h global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/blockjob.c b/blockjob.c
index d79a52d204..4868453d74 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -62,6 +62,7 @@ static bool is_block_job(Job *job)
 BlockJob *block_job_next(BlockJob *bjob)
 {
 Job *job = bjob ? &bjob->job : NULL;
+GLOBAL_STATE_CODE();
 
 do {
 job = job_next(job);
@@ -73,6 +74,7 @@ BlockJob *block_job_next(BlockJob *bjob)
 BlockJob *block_job_get(const char *id)
 {
 Job *job = job_get(id);
+GLOBAL_STATE_CODE();
 
 if (job && is_block_job(job)) {
 return container_of(job, BlockJob, job);
@@ -184,6 +186,7 @@ static const BdrvChildClass child_job = {
 
 void block_job_remove_all_bdrv(BlockJob *job)
 {
+GLOBAL_STATE_CODE();
 /*
  * bdrv_root_unref_child() may reach child_job_[can_]set_aio_ctx(),
  * which will also traverse job->nodes, so consume the list one by
@@ -206,6 +209,7 @@ void block_job_remove_all_bdrv(BlockJob *job)
 bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs)
 {
 GSList *el;
+GLOBAL_STATE_CODE();
 
 for (el = job->nodes; el; el = el->next) {
 BdrvChild *c = el->data;
@@ -222,6 +226,7 @@ int block_job_add_bdrv(BlockJob *job, const char *name, 
BlockDriverState *bs,
 {
 BdrvChild *c;
 bool need_context_ops;
+GLOBAL_STATE_CODE();
 
 bdrv_ref(bs);
 
@@ -271,6 +276,8 @@ bool block_job_set_speed(BlockJob *job, int64_t speed, 
Error **errp)
 const BlockJobDriver *drv = block_job_driver(job);
 int64_t old_speed = job->speed;
 
+GLOBAL_STATE_CODE();
+
 if (job_apply_verb(&job->job, JOB_VERB_SET_SPEED, errp) < 0) {
 return false;
 }
@@ -309,6 +316,8 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp)
 BlockJobInfo *info;
 uint64_t progress_current, progress_total;
 
+GLOBAL_STATE_CODE();
+
 if (block_job_is_internal(job)) {
 error_setg(errp, "Cannot query QEMU internal jobs");
 return NULL;
@@ -491,6 +500,7 @@ fail:
 
 void block_job_iostatus_reset(BlockJob *job)
 {
+GLOBAL_STATE_CODE();
 if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
 return;
 }
@@ -548,5 +558,6 @@ BlockErrorAction block_job_error_action(BlockJob *job, 
BlockdevOnError on_err,
 
 AioContext *block_job_get_aio_context(BlockJob *job)
 {
+GLOBAL_STATE_CODE();
 return job->job.aio_context;
 }
-- 
2.31.1

[PATCH v7 10/31] block.c: assertions to the block layer permissions API

2022-02-11 Thread Emanuele Giuseppe Esposito

Now that we "covered" the three main cases where the
permission API was being used under BQL (fuse,
amend and invalidate_cache), we can safely assert for
the permission functions implemented in block.c

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/block.c b/block.c
index aa04a38457..2540ce3534 100644
--- a/block.c
+++ b/block.c
@@ -2109,6 +2109,7 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
 
 assert(a->bs);
 assert(a->bs == b->bs);
+GLOBAL_STATE_CODE();
 
 if ((b->perm & a->shared_perm) == b->perm) {
 return true;
@@ -2132,6 +2133,7 @@ static bool bdrv_a_allow_b(BdrvChild *a, BdrvChild *b, 
Error **errp)
 static bool bdrv_parent_perms_conflict(BlockDriverState *bs, Error **errp)
 {
 BdrvChild *a, *b;
+GLOBAL_STATE_CODE();
 
 /*
  * During the loop we'll look at each pair twice. That's correct because
@@ -2213,6 +2215,8 @@ static void bdrv_child_set_perm_abort(void *opaque)
 {
 BdrvChildSetPermState *s = opaque;
 
+GLOBAL_STATE_CODE();
+
 s->child->perm = s->old_perm;
 s->child->shared_perm = s->old_shared_perm;
 }
@@ -2226,6 +2230,7 @@ static void bdrv_child_set_perm(BdrvChild *c, uint64_t 
perm,
 uint64_t shared, Transaction *tran)
 {
 BdrvChildSetPermState *s = g_new(BdrvChildSetPermState, 1);
+GLOBAL_STATE_CODE();
 
 *s = (BdrvChildSetPermState) {
 .child = c,
@@ -2405,6 +2410,7 @@ static int bdrv_node_refresh_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 BdrvChild *c;
 int ret;
 uint64_t cumulative_perms, cumulative_shared_perms;
+GLOBAL_STATE_CODE();
 
 bdrv_get_cumulative_perm(bs, &cumulative_perms, &cumulative_shared_perms);
 
@@ -2473,6 +2479,7 @@ static int bdrv_list_refresh_perms(GSList *list, 
BlockReopenQueue *q,
 {
 int ret;
 BlockDriverState *bs;
+GLOBAL_STATE_CODE();
 
 for ( ; list; list = list->next) {
 bs = list->data;
@@ -2540,6 +2547,7 @@ static int bdrv_refresh_perms(BlockDriverState *bs, Error 
**errp)
 int ret;
 Transaction *tran = tran_new();
 g_autoptr(GSList) list = bdrv_topological_dfs(NULL, NULL, bs);
+GLOBAL_STATE_CODE();
 
 ret = bdrv_list_refresh_perms(list, NULL, tran, errp);
 tran_finalize(tran, ret);
@@ -2602,6 +2610,7 @@ static void bdrv_filter_default_perms(BlockDriverState 
*bs, BdrvChild *c,
   uint64_t perm, uint64_t shared,
   uint64_t *nperm, uint64_t *nshared)
 {
+GLOBAL_STATE_CODE();
 *nperm = perm & DEFAULT_PERM_PASSTHROUGH;
 *nshared = (shared & DEFAULT_PERM_PASSTHROUGH) | DEFAULT_PERM_UNCHANGED;
 }
@@ -2613,6 +2622,7 @@ static void bdrv_default_perms_for_cow(BlockDriverState 
*bs, BdrvChild *c,
uint64_t *nperm, uint64_t *nshared)
 {
 assert(role & BDRV_CHILD_COW);
+GLOBAL_STATE_CODE();
 
 /*
  * We want consistent read from backing files if the parent needs it.
@@ -2649,6 +2659,7 @@ static void 
bdrv_default_perms_for_storage(BlockDriverState *bs, BdrvChild *c,
 {
 int flags;
 
+GLOBAL_STATE_CODE();
 assert(role & (BDRV_CHILD_METADATA | BDRV_CHILD_DATA));
 
 flags = bdrv_reopen_get_flags(reopen_queue, bs);
@@ -6026,6 +6037,7 @@ static void xdbg_graph_add_edge(XDbgBlockGraphConstructor 
*gr, void *parent,
 {
 BlockPermission qapi_perm;
 XDbgBlockGraphEdge *edge;
+GLOBAL_STATE_CODE();
 
 edge = g_new0(XDbgBlockGraphEdge, 1);
 
-- 
2.31.1

[PATCH v7 03/31] include/block/block: split header into I/O and global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

block.h currently contains a mix of functions:
some of them run under the BQL and modify the block layer graph,
others are instead thread-safe and perform I/O in iothreads.
Some others can only be called by either the main loop or the
iothread running the AioContext (and not other iothreads),
and using them in another thread would cause deadlocks, and therefore
it is not ideal to define them as I/O.

It is not easy to understand which function is part of which
group (I/O vs GS vs "I/O or GS"), and this patch aims to clarify it.

The "GS" functions need the BQL, and often use
aio_context_acquire/release and/or drain to be sure they
can modify the graph safely.
The I/O function are instead thread safe, and can run in
any AioContext.
"I/O or GS" functions run instead in the main loop or in
a single iothread, and use BDRV_POLL_WHILE().

By splitting the header in two files, block-io.h
and block-global-state.h we have a clearer view on what
needs what kind of protection. block-common.h
contains common structures shared by both headers.

block.h is left there for legacy and to avoid changing
all includes in all c files that use the block APIs.

Assertions are added in the next patch.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c|   3 +
 block/meson.build  |   7 +-
 include/block/block-common.h   | 418 ++
 include/block/block-global-state.h | 256 +
 include/block/block-io.h   | 364 
 include/block/block.h  | 879 +
 6 files changed, 1069 insertions(+), 858 deletions(-)
 create mode 100644 include/block/block-common.h
 create mode 100644 include/block/block-global-state.h
 create mode 100644 include/block/block-io.h

diff --git a/block.c b/block.c
index df353d55e8..7483dfaddc 100644
--- a/block.c
+++ b/block.c
@@ -67,12 +67,15 @@
 
 #define NOT_DONE 0x7fff /* used while emulated sync operation in progress 
*/
 
+/* Protected by BQL */
 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
 
+/* Protected by BQL */
 static QTAILQ_HEAD(, BlockDriverState) all_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(all_bdrv_states);
 
+/* Protected by BQL */
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
 QLIST_HEAD_INITIALIZER(bdrv_drivers);
 
diff --git a/block/meson.build b/block/meson.build
index deb73ca389..ee636e2754 100644
--- a/block/meson.build
+++ b/block/meson.build
@@ -113,8 +113,11 @@ block_ss.add(module_block_h)
 wrapper_py = find_program('../scripts/block-coroutine-wrapper.py')
 block_gen_c = custom_target('block-gen.c',
 output: 'block-gen.c',
-input: files('../include/block/block.h',
- 'coroutines.h'),
+input: files(
+  '../include/block/block-io.h',
+  '../include/block/block-global-state.h',
+  'coroutines.h'
+  ),
 command: [wrapper_py, '@OUTPUT@', '@INPUT@'])
 block_ss.add(block_gen_c)
 
diff --git a/include/block/block-common.h b/include/block/block-common.h
new file mode 100644
index 00..0c5dc4a86a
--- /dev/null
+++ b/include/block/block-common.h
@@ -0,0 +1,418 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_COMMON_H
+#define BLOCK_COMMON_H
+
+#include "block/aio.h"
+#include "block/aio-wait.h"
+#include "qemu/iov.h"
+#include "qemu/coroutine.h"
+#include "block/accounting.h"
+#include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
+#include "qemu/hbitmap.h"
+#include "qemu/transactions.h"
+
+/*
+ * generated_co_wrapper
+ *
+ * Function specifier, which does nothing but mar

[PATCH v7 16/31] GS and IO CODE macros for blockjob_int.h

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockjob.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/blockjob.c b/blockjob.c
index 10815a89fe..d79a52d204 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -84,6 +84,7 @@ BlockJob *block_job_get(const char *id)
 void block_job_free(Job *job)
 {
 BlockJob *bjob = container_of(job, BlockJob, job);
+GLOBAL_STATE_CODE();
 
 block_job_remove_all_bdrv(bjob);
 ratelimit_destroy(&bjob->limit);
@@ -299,6 +300,7 @@ bool block_job_set_speed(BlockJob *job, int64_t speed, 
Error **errp)
 
 int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n)
 {
+IO_CODE();
 return ratelimit_calculate_delay(&job->limit, n);
 }
 
@@ -434,6 +436,7 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
 {
 BlockJob *job;
 int ret;
+GLOBAL_STATE_CODE();
 
 if (job_id == NULL && !(flags & JOB_INTERNAL)) {
 job_id = bdrv_get_device_name(bs);
@@ -498,6 +501,7 @@ void block_job_iostatus_reset(BlockJob *job)
 void block_job_user_resume(Job *job)
 {
 BlockJob *bjob = container_of(job, BlockJob, job);
+GLOBAL_STATE_CODE();
 block_job_iostatus_reset(bjob);
 }
 
@@ -505,6 +509,7 @@ BlockErrorAction block_job_error_action(BlockJob *job, 
BlockdevOnError on_err,
 int is_read, int error)
 {
 BlockErrorAction action;
+IO_CODE();
 
 switch (on_err) {
 case BLOCKDEV_ON_ERROR_ENOSPC:
-- 
2.31.1

[PATCH v7 17/31] block.c: add assertions to static functions

2022-02-11 Thread Emanuele Giuseppe Esposito

Following the assertion derived from the API split,
propagate the assertion also in the static functions.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c   | 47 ++-
 block/block-backend.c |  3 +++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 8546612488..29c88e8727 100644
--- a/block.c
+++ b/block.c
@@ -438,6 +438,7 @@ BlockDriverState *bdrv_new(void)
 static BlockDriver *bdrv_do_find_format(const char *format_name)
 {
 BlockDriver *drv1;
+GLOBAL_STATE_CODE();
 
 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 if (!strcmp(drv1->format_name, format_name)) {
@@ -596,6 +597,8 @@ static int64_t create_file_fallback_truncate(BlockBackend 
*blk,
 int64_t size;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 ret = blk_truncate(blk, minimum_size, false, PREALLOC_MODE_OFF, 0,
&local_err);
 if (ret < 0 && ret != -ENOTSUP) {
@@ -634,6 +637,8 @@ static int 
create_file_fallback_zero_first_sector(BlockBackend *blk,
 int64_t bytes_to_clear;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 bytes_to_clear = MIN(current_size, BDRV_SECTOR_SIZE);
 if (bytes_to_clear) {
 ret = blk_pwrite_zeroes(blk, 0, bytes_to_clear, BDRV_REQ_MAY_UNMAP);
@@ -896,6 +901,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 {
 int score_max = 0, score;
 BlockDriver *drv = NULL, *d;
+GLOBAL_STATE_CODE();
 
 QLIST_FOREACH(d, &bdrv_drivers, list) {
 if (d->bdrv_probe_device) {
@@ -913,6 +919,7 @@ static BlockDriver *find_hdev_driver(const char *filename)
 static BlockDriver *bdrv_do_find_protocol(const char *protocol)
 {
 BlockDriver *drv1;
+GLOBAL_STATE_CODE();
 
 QLIST_FOREACH(drv1, &bdrv_drivers, list) {
 if (drv1->protocol_name && !strcmp(drv1->protocol_name, protocol)) {
@@ -1021,6 +1028,8 @@ static int find_image_format(BlockBackend *file, const 
char *filename,
 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
 int ret = 0;
 
+GLOBAL_STATE_CODE();
+
 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
 if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) 
{
 *pdrv = &bdrv_raw;
@@ -1103,6 +1112,7 @@ static BlockdevDetectZeroesOptions 
bdrv_parse_detect_zeroes(QemuOpts *opts,
 BlockdevDetectZeroesOptions detect_zeroes =
 qapi_enum_parse(&BlockdevDetectZeroesOptions_lookup, value,
 BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, &local_err);
+GLOBAL_STATE_CODE();
 g_free(value);
 if (local_err) {
 error_propagate(errp, local_err);
@@ -1218,6 +1228,7 @@ static void bdrv_child_cb_drained_end(BdrvChild *child,
 static int bdrv_child_cb_inactivate(BdrvChild *child)
 {
 BlockDriverState *bs = child->opaque;
+GLOBAL_STATE_CODE();
 assert(bs->open_flags & BDRV_O_INACTIVE);
 return 0;
 }
@@ -1244,6 +1255,7 @@ static void bdrv_child_cb_set_aio_ctx(BdrvChild *child, 
AioContext *ctx,
 static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
int parent_flags, QDict *parent_options)
 {
+GLOBAL_STATE_CODE();
 *child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
 
 /* For temporary files, unconditional cache=unsafe is fine */
@@ -1264,6 +1276,7 @@ static void bdrv_backing_attach(BdrvChild *c)
 BlockDriverState *parent = c->opaque;
 BlockDriverState *backing_hd = c->bs;
 
+GLOBAL_STATE_CODE();
 assert(!parent->backing_blocker);
 error_setg(&parent->backing_blocker,
"node is used as backing hd of '%s'",
@@ -1302,6 +1315,7 @@ static void bdrv_backing_detach(BdrvChild *c)
 {
 BlockDriverState *parent = c->opaque;
 
+GLOBAL_STATE_CODE();
 assert(parent->backing_blocker);
 bdrv_op_unblock_all(c->bs, parent->backing_blocker);
 error_free(parent->backing_blocker);
@@ -1314,6 +1328,7 @@ static int bdrv_backing_update_filename(BdrvChild *c, 
BlockDriverState *base,
 BlockDriverState *parent = c->opaque;
 bool read_only = bdrv_is_read_only(parent);
 int ret;
+GLOBAL_STATE_CODE();
 
 if (read_only) {
 ret = bdrv_reopen_set_read_only(parent, false, errp);
@@ -1345,6 +1360,7 @@ static void bdrv_inherited_options(BdrvChildRole role, 
bool parent_is_format,
int parent_flags, QDict *parent_options)
 {
 int flags = parent_flags;
+GLOBAL_STATE_CODE();
 
 /*
  * First, decide whether to set, clear, or leave BDRV_O_PROTOCOL.
@@ -1486,6 +1502,7 @@ AioContext *bdrv_child_get_parent_aio_context(BdrvChild 
*c)
 static int bdrv_open_flags(BlockDriverState *bs, int flags)
 {
 int open_flags = flags;
+GLOBAL_STATE_CODE();
 
 /*
  * Clear flags that are internal to the block layer before opening the
@@ -1498,6 +1515,8 @@ static int bdrv_open_flags(BlockDriverState *bs, int 
flags)
 
 static void update_flags_fro

[PATCH v7 09/31] IO_CODE and IO_OR_GS_CODE for block-backend I/O API

2022-02-11 Thread Emanuele Giuseppe Esposito

Mark all I/O functions with IO_CODE, and all "I/O OR GS" with
IO_OR_GS_CODE.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c | 57 +++
 include/sysemu/block-backend-io.h |  2 ++
 2 files changed, 59 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index ebfc3fe954..03b1e36a3c 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -733,6 +733,7 @@ void monitor_remove_blk(BlockBackend *blk)
  */
 const char *blk_name(const BlockBackend *blk)
 {
+IO_CODE();
 return blk->name ?: "";
 }
 
@@ -759,6 +760,7 @@ BlockBackend *blk_by_name(const char *name)
  */
 BlockDriverState *blk_bs(BlockBackend *blk)
 {
+IO_CODE();
 return blk->root ? blk->root->bs : NULL;
 }
 
@@ -1009,6 +1011,7 @@ DeviceState *blk_get_attached_dev(BlockBackend *blk)
 char *blk_get_attached_dev_id(BlockBackend *blk)
 {
 DeviceState *dev = blk->dev;
+IO_CODE();
 
 if (!dev) {
 return g_strdup("");
@@ -1210,16 +1213,19 @@ void blk_iostatus_set_err(BlockBackend *blk, int error)
 
 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
 {
+IO_CODE();
 blk->allow_write_beyond_eof = allow;
 }
 
 void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
 {
+IO_CODE();
 blk->allow_aio_context_change = allow;
 }
 
 void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
 {
+IO_CODE();
 blk->disable_request_queuing = disable;
 }
 
@@ -1303,6 +1309,7 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t 
offset,
BdrvRequestFlags flags)
 {
 int ret;
+IO_OR_GS_CODE();
 
 blk_inc_in_flight(blk);
 ret = blk_co_do_preadv(blk, offset, bytes, qiov, flags);
@@ -1354,6 +1361,7 @@ int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, 
int64_t offset,
  BdrvRequestFlags flags)
 {
 int ret;
+IO_OR_GS_CODE();
 
 blk_inc_in_flight(blk);
 ret = blk_co_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
@@ -1366,6 +1374,7 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, 
int64_t offset,
 int64_t bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags)
 {
+IO_OR_GS_CODE();
 return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
 }
 
@@ -1394,6 +1403,7 @@ typedef struct BlkRwCo {
 int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
   int64_t bytes, BdrvRequestFlags flags)
 {
+IO_OR_GS_CODE();
 return blk_pwritev_part(blk, offset, bytes, NULL, 0,
 flags | BDRV_REQ_ZERO_WRITE);
 }
@@ -1406,11 +1416,13 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags 
flags)
 
 void blk_inc_in_flight(BlockBackend *blk)
 {
+IO_CODE();
 qatomic_inc(&blk->in_flight);
 }
 
 void blk_dec_in_flight(BlockBackend *blk)
 {
+IO_CODE();
 qatomic_dec(&blk->in_flight);
 aio_wait_kick();
 }
@@ -1429,6 +1441,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
   void *opaque, int ret)
 {
 struct BlockBackendAIOCB *acb;
+IO_CODE();
 
 blk_inc_in_flight(blk);
 acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
@@ -1536,6 +1549,7 @@ BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, 
int64_t offset,
   int64_t bytes, BdrvRequestFlags flags,
   BlockCompletionFunc *cb, void *opaque)
 {
+IO_CODE();
 return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_write_entry,
 flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
 }
@@ -1544,6 +1558,7 @@ int blk_pread(BlockBackend *blk, int64_t offset, void 
*buf, int bytes)
 {
 int ret;
 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+IO_OR_GS_CODE();
 
 blk_inc_in_flight(blk);
 ret = blk_do_preadv(blk, offset, bytes, &qiov, 0);
@@ -1557,6 +1572,7 @@ int blk_pwrite(BlockBackend *blk, int64_t offset, const 
void *buf, int bytes,
 {
 int ret;
 QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+IO_OR_GS_CODE();
 
 ret = blk_pwritev_part(blk, offset, bytes, &qiov, 0, flags);
 
@@ -1565,6 +1581,7 @@ int blk_pwrite(BlockBackend *blk, int64_t offset, const 
void *buf, int bytes,
 
 int64_t blk_getlength(BlockBackend *blk)
 {
+IO_CODE();
 if (!blk_is_available(blk)) {
 return -ENOMEDIUM;
 }
@@ -1574,6 +1591,7 @@ int64_t blk_getlength(BlockBackend *blk)
 
 void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
 {
+IO_CODE();
 if (!blk_bs(blk)) {
 *nb_sectors_ptr = 0;
 } else {
@@ -1583,6 +1601,7 @@ void blk_get_geometry(BlockBackend *blk, uint64_t 
*nb_sectors_ptr)
 
 int64_t blk_nb_sectors(BlockBackend *blk)
 {
+IO_CODE();
 if (!blk_is_available(blk)) {
 return -ENOMEDIUM;
 }
@@ -1594,6 +1613,7 @@ BlockAIOCB *blk_aio_preadv(Bl

[PATCH v7 07/31] include/sysemu/block-backend: split header into I/O and global state (GS) API

2022-02-11 Thread Emanuele Giuseppe Esposito

Similarly to the previous patches, split block-backend.h
in block-backend-io.h and block-backend-global-state.h

In addition, remove "block/block.h" include as it seems
it is not necessary anymore, together with "qemu/iov.h"

block-backend-common.h contains the structures shared between
the two headers, and the functions that can't be categorized as
I/O or global state.

Assertions are added in the next patch.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c   |   9 +-
 include/sysemu/block-backend-common.h   |  84 ++
 include/sysemu/block-backend-global-state.h | 117 +
 include/sysemu/block-backend-io.h   | 159 
 include/sysemu/block-backend.h  | 269 +---
 5 files changed, 369 insertions(+), 269 deletions(-)
 create mode 100644 include/sysemu/block-backend-common.h
 create mode 100644 include/sysemu/block-backend-global-state.h
 create mode 100644 include/sysemu/block-backend-io.h

diff --git a/block/block-backend.c b/block/block-backend.c
index 98bfcd5cf2..462e18facf 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -79,6 +79,7 @@ struct BlockBackend {
 bool allow_aio_context_change;
 bool allow_write_beyond_eof;
 
+/* Protected by BQL */
 NotifierList remove_bs_notifiers, insert_bs_notifiers;
 QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
 
@@ -111,12 +112,14 @@ static const AIOCBInfo block_backend_aiocb_info = {
 static void drive_info_del(DriveInfo *dinfo);
 static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
 
-/* All BlockBackends */
+/* All BlockBackends. Protected by BQL. */
 static QTAILQ_HEAD(, BlockBackend) block_backends =
 QTAILQ_HEAD_INITIALIZER(block_backends);
 
-/* All BlockBackends referenced by the monitor and which are iterated through 
by
- * blk_next() */
+/*
+ * All BlockBackends referenced by the monitor and which are iterated through 
by
+ * blk_next(). Protected by BQL.
+ */
 static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
 QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
 
diff --git a/include/sysemu/block-backend-common.h 
b/include/sysemu/block-backend-common.h
new file mode 100644
index 00..6963bbf45a
--- /dev/null
+++ b/include/sysemu/block-backend-common.h
@@ -0,0 +1,84 @@
+/*
+ * QEMU Block backends
+ *
+ * Copyright (C) 2014-2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Markus Armbruster ,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later.  See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef BLOCK_BACKEND_COMMON_H
+#define BLOCK_BACKEND_COMMON_H
+
+#include "qemu/iov.h"
+#include "block/throttle-groups.h"
+
+/*
+ * TODO Have to include block/block.h for a bunch of block layer
+ * types.  Unfortunately, this pulls in the whole BlockDriverState
+ * API, which we don't want used by many BlockBackend users.  Some of
+ * the types belong here, and the rest should be split into a common
+ * header and one for the BlockDriverState API.
+ */
+#include "block/block.h"
+
+/* Callbacks for block device models */
+typedef struct BlockDevOps {
+/*
+ * Runs when virtual media changed (monitor commands eject, change)
+ * Argument load is true on load and false on eject.
+ * Beware: doesn't run when a host device's physical media
+ * changes.  Sure would be useful if it did.
+ * Device models with removable media must implement this callback.
+ */
+void (*change_media_cb)(void *opaque, bool load, Error **errp);
+/*
+ * Runs when an eject request is issued from the monitor, the tray
+ * is closed, and the medium is locked.
+ * Device models that do not implement is_medium_locked will not need
+ * this callback.  Device models that can lock the medium or tray might
+ * want to implement the callback and unlock the tray when "force" is
+ * true, even if they do not support eject requests.
+ */
+void (*eject_request_cb)(void *opaque, bool force);
+/*
+ * Is the virtual tray open?
+ * Device models implement this only when the device has a tray.
+ */
+bool (*is_tray_open)(void *opaque);
+/*
+ * Is the virtual medium locked into the device?
+ * Device models implement this only when device has such a lock.
+ */
+bool (*is_medium_locked)(void *opaque);
+/*
+ * Runs when the size changed (e.g. monitor command block_resize)
+ */
+void (*resize_cb)(void *opaque);
+/*
+ * Runs when the backend receives a drain request.
+ */
+void (*drained_begin)(void *opaque);
+/*
+ * Runs when the backend's last drain request ends.
+ */
+void (*drained_end)(void *opaque);
+/*
+ * Is the device still busy?
+ */
+bool (*drained_poll)(void *opaque);
+} BlockDevOps;
+
+/*
+ * This struct is embedded in (the private) BlockBackend struct and contains
+ * fields that must be public. This is in particular for QLIST_ENTRY()

[PATCH v7 18/31] include/block/blockjob.h: global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

blockjob functions run always under the BQL lock.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/block/blockjob.h | 29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 87fbb3985f..6525e16fd5 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -74,6 +74,13 @@ typedef struct BlockJob {
 GSList *nodes;
 } BlockJob;
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /**
  * block_job_next:
  * @job: A block job, or %NULL.
@@ -155,6 +162,21 @@ BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
  */
 void block_job_iostatus_reset(BlockJob *job);
 
+/*
+ * block_job_get_aio_context:
+ *
+ * Returns aio context associated with a block job.
+ */
+AioContext *block_job_get_aio_context(BlockJob *job);
+
+
+/*
+ * Common functions that are neither I/O nor Global State.
+ *
+ * See include/block/block-common.h for more information about
+ * the Common API.
+ */
+
 /**
  * block_job_is_internal:
  * @job: The job to determine if it is user-visible or not.
@@ -170,11 +192,4 @@ bool block_job_is_internal(BlockJob *job);
  */
 const BlockJobDriver *block_job_driver(BlockJob *job);
 
-/*
- * block_job_get_aio_context:
- *
- * Returns aio context associated with a block job.
- */
-AioContext *block_job_get_aio_context(BlockJob *job);
-
 #endif
-- 
2.31.1

[PATCH v7 20/31] include/sysemu/blockdev.h: global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

blockdev functions run always under the BQL lock.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/sysemu/blockdev.h | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h
index f9fb54d437..3211b16513 100644
--- a/include/sysemu/blockdev.h
+++ b/include/sysemu/blockdev.h
@@ -13,9 +13,6 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 
-void blockdev_mark_auto_del(BlockBackend *blk);
-void blockdev_auto_del(BlockBackend *blk);
-
 typedef enum {
 IF_DEFAULT = -1,/* for use with drive_add() only */
 /*
@@ -38,6 +35,16 @@ struct DriveInfo {
 QTAILQ_ENTRY(DriveInfo) next;
 };
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
+void blockdev_mark_auto_del(BlockBackend *blk);
+void blockdev_auto_del(BlockBackend *blk);
+
 DriveInfo *blk_legacy_dinfo(BlockBackend *blk);
 DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo);
 BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo);
-- 
2.31.1

[PATCH v7 06/31] block/export/fuse.c: allow writable exports to take RESIZE permission

2022-02-11 Thread Emanuele Giuseppe Esposito

Allow writable exports to get BLK_PERM_RESIZE permission
from creation, in fuse_export_create().
In this way, there is no need to give the permission in
fuse_do_truncate(), which might be run in an iothread.

Permissions should be set only in the main thread, so
in any case if an iothread tries to set RESIZE, it will
be blocked.

Also assert in fuse_do_truncate that if we give the
RESIZE permission we can then restore the original ones.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Hanna Reitz 
---
 block/export/fuse.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/block/export/fuse.c b/block/export/fuse.c
index fdda8e3c81..5029e70f84 100644
--- a/block/export/fuse.c
+++ b/block/export/fuse.c
@@ -86,8 +86,8 @@ static int fuse_export_create(BlockExport *blk_exp,
 
 assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
 
-/* For growable exports, take the RESIZE permission */
-if (args->growable) {
+/* For growable and writable exports, take the RESIZE permission */
+if (args->growable || blk_exp_args->writable) {
 uint64_t blk_perm, blk_shared_perm;
 
 blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
@@ -392,14 +392,23 @@ static int fuse_do_truncate(const FuseExport *exp, 
int64_t size,
 {
 uint64_t blk_perm, blk_shared_perm;
 BdrvRequestFlags truncate_flags = 0;
-int ret;
+bool add_resize_perm;
+int ret, ret_check;
+
+/* Growable and writable exports have a permanent RESIZE permission */
+add_resize_perm = !exp->growable && !exp->writable;
 
 if (req_zero_write) {
 truncate_flags |= BDRV_REQ_ZERO_WRITE;
 }
 
-/* Growable exports have a permanent RESIZE permission */
-if (!exp->growable) {
+if (add_resize_perm) {
+
+if (!qemu_in_main_thread()) {
+/* Changing permissions like below only works in the main thread */
+return -EPERM;
+}
+
 blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
 
 ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
@@ -412,9 +421,11 @@ static int fuse_do_truncate(const FuseExport *exp, int64_t 
size,
 ret = blk_truncate(exp->common.blk, size, true, prealloc,
truncate_flags, NULL);
 
-if (!exp->growable) {
+if (add_resize_perm) {
 /* Must succeed, because we are only giving up the RESIZE permission */
-blk_set_perm(exp->common.blk, blk_perm, blk_shared_perm, &error_abort);
+ret_check = blk_set_perm(exp->common.blk, blk_perm,
+ blk_shared_perm, &error_abort);
+assert(ret_check == 0);
 }
 
 return ret;
-- 
2.31.1

[PATCH v7 14/31] block: introduce assert_bdrv_graph_writable

2022-02-11 Thread Emanuele Giuseppe Esposito

We want to be sure that the functions that write the child and
parent list of a bs are under BQL and drain.

BQL prevents from concurrent writings from the GS API, while
drains protect from I/O.

TODO: drains are missing in some functions using this assert.
Therefore a proper assertion will fail. Because adding drains
requires additional discussions, they will be added in future
series.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c|  4 
 include/block/block_int-global-state.h | 17 +
 2 files changed, 21 insertions(+)

diff --git a/block.c b/block.c
index 496d2989d7..8546612488 100644
--- a/block.c
+++ b/block.c
@@ -1420,6 +1420,7 @@ static void bdrv_child_cb_attach(BdrvChild *child)
 {
 BlockDriverState *bs = child->opaque;
 
+assert_bdrv_graph_writable(bs);
 QLIST_INSERT_HEAD(&bs->children, child, next);
 
 if (child->role & BDRV_CHILD_COW) {
@@ -1439,6 +1440,7 @@ static void bdrv_child_cb_detach(BdrvChild *child)
 
 bdrv_unapply_subtree_drain(child, bs);
 
+assert_bdrv_graph_writable(bs);
 QLIST_REMOVE(child, next);
 }
 
@@ -2829,6 +2831,7 @@ static void bdrv_replace_child_noperm(BdrvChild **childp,
 if (child->klass->detach) {
 child->klass->detach(child);
 }
+assert_bdrv_graph_writable(old_bs);
 QLIST_REMOVE(child, next_parent);
 }
 
@@ -2838,6 +2841,7 @@ static void bdrv_replace_child_noperm(BdrvChild **childp,
 }
 
 if (new_bs) {
+assert_bdrv_graph_writable(new_bs);
 QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
 
 /*
diff --git a/include/block/block_int-global-state.h 
b/include/block/block_int-global-state.h
index 5078d6a6ea..0f21b0570b 100644
--- a/include/block/block_int-global-state.h
+++ b/include/block/block_int-global-state.h
@@ -309,4 +309,21 @@ void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
  */
 void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
 
+/**
+ * Make sure that the function is running under both drain and BQL.
+ * The latter protects from concurrent writings
+ * from the GS API, while the former prevents concurrent reads
+ * from I/O.
+ */
+static inline void assert_bdrv_graph_writable(BlockDriverState *bs)
+{
+/*
+ * TODO: this function is incomplete. Because the users of this
+ * assert lack the necessary drains, check only for BQL.
+ * Once the necessary drains are added,
+ * assert also for qatomic_read(&bs->quiesce_counter) > 0
+ */
+assert(qemu_in_main_thread());
+}
+
 #endif /* BLOCK_INT_GLOBAL_STATE */
-- 
2.31.1

[PATCH v7 11/31] include/block/block_int: split header into I/O and global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

Similarly to the previous patch, split block_int.h
in block_int-io.h and block_int-global-state.h

block_int-common.h contains the structures shared between
the two headers, and the functions that can't be categorized as
I/O or global state.

Assertions are added in the next patch.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 blockdev.c |5 +
 include/block/block_int-common.h   | 1180 +++
 include/block/block_int-global-state.h |  312 +
 include/block/block_int-io.h   |  179 +++
 include/block/block_int.h  | 1489 +---
 5 files changed, 1679 insertions(+), 1486 deletions(-)
 create mode 100644 include/block/block_int-common.h
 create mode 100644 include/block/block_int-global-state.h
 create mode 100644 include/block/block_int-io.h

diff --git a/blockdev.c b/blockdev.c
index c65a559a06..99a651a64f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -63,6 +63,7 @@
 #include "qemu/main-loop.h"
 #include "qemu/throttle-options.h"
 
+/* Protected by BQL */
 QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);
 
@@ -1175,6 +1176,8 @@ typedef struct BlkActionState BlkActionState;
  *
  * Only prepare() may fail. In a single transaction, only one of commit() or
  * abort() will be called. clean() will always be called if it is present.
+ *
+ * Always run under BQL.
  */
 typedef struct BlkActionOps {
 size_t instance_size;
@@ -2284,6 +2287,8 @@ static TransactionProperties *get_transaction_properties(
 /*
  * 'Atomic' group operations.  The operations are performed as a set, and if
  * any fail then we roll back all operations in the group.
+ *
+ * Always run under BQL.
  */
 void qmp_transaction(TransactionActionList *dev_list,
  bool has_props,
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
new file mode 100644
index 00..b92e3630fd
--- /dev/null
+++ b/include/block/block_int-common.h
@@ -0,0 +1,1180 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_INT_COMMON_H
+#define BLOCK_INT_COMMON_H
+
+#include "block/accounting.h"
+#include "block/block.h"
+#include "block/aio-wait.h"
+#include "qemu/queue.h"
+#include "qemu/coroutine.h"
+#include "qemu/stats64.h"
+#include "qemu/timer.h"
+#include "qemu/hbitmap.h"
+#include "block/snapshot.h"
+#include "qemu/throttle.h"
+#include "qemu/rcu.h"
+
+#define BLOCK_FLAG_LAZY_REFCOUNTS   8
+
+#define BLOCK_OPT_SIZE  "size"
+#define BLOCK_OPT_ENCRYPT   "encryption"
+#define BLOCK_OPT_ENCRYPT_FORMAT"encrypt.format"
+#define BLOCK_OPT_COMPAT6   "compat6"
+#define BLOCK_OPT_HWVERSION "hwversion"
+#define BLOCK_OPT_BACKING_FILE  "backing_file"
+#define BLOCK_OPT_BACKING_FMT   "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE  "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE"table_size"
+#define BLOCK_OPT_PREALLOC  "preallocation"
+#define BLOCK_OPT_SUBFMT"subformat"
+#define BLOCK_OPT_COMPAT_LEVEL  "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS"lazy_refcounts"
+#define BLOCK_OPT_ADAPTER_TYPE  "adapter_type"
+#define BLOCK_OPT_REDUNDANCY"redundancy"
+#define BLOCK_OPT_NOCOW "nocow"
+#define BLOCK_OPT_EXTENT_SIZE_HINT  "extent_size_hint"
+#define BLOCK_OPT_OBJECT_SIZE   "object_size"
+#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"
+#define BLOCK_OPT_DATA_FILE "data_file"
+#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw"
+#define BLOCK_OPT_COMPRESSION_TYPE  "compression_type"
+#define BLOCK_OPT_EXTL2 "extended_l2"
+
+#define BLOCK_PROBE_BUF_SIZE512
+
+enum BdrvTrackedRequestType {
+BDRV_TRACKED_READ,
+BDRV_TRACKED_WRITE,
+BDRV_TRACKED_DISCARD,
+BDRV_TRACKED_TRUNCATE,
+};
+
+/*
+ * That is not quit

[PATCH v7 02/31] main loop: macros to mark GS and I/O functions

2022-02-11 Thread Emanuele Giuseppe Esposito

Righ now, IO_CODE and IO_OR_GS_CODE are nop, as there isn't
really a way to check that a function is only called in I/O.
On the other side, we can use qemu_in_main_thread to check if
we are in the main loop.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/main-loop.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index bc42b5939d..77adc51627 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -269,6 +269,15 @@ bool qemu_mutex_iothread_locked(void);
  */
 bool qemu_in_main_thread(void);
 
+/* Mark and check that the function is part of the global state API. */
+#define GLOBAL_STATE_CODE() assert(qemu_in_main_thread())
+
+/* Mark and check that the function is part of the I/O API. */
+#define IO_CODE() /* nop */
+
+/* Mark and check that the function is part of the "I/O OR GS" API. */
+#define IO_OR_GS_CODE() /* nop */
+
 /**
  * qemu_mutex_lock_iothread: Lock the main loop mutex.
  *
-- 
2.31.1

[PATCH v7 04/31] assertions for block global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

All the global state (GS) API functions will check that
qemu_in_main_thread() returns true. If not, it means
that the safety of BQL cannot be guaranteed, and
they need to be moved to I/O.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c| 125 -
 block/commit.c |   2 +
 block/io.c |  11 +
 blockdev.c |   1 +
 4 files changed, 137 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 7483dfaddc..038fe9af24 100644
--- a/block.c
+++ b/block.c
@@ -278,6 +278,8 @@ bool bdrv_is_read_only(BlockDriverState *bs)
 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
bool ignore_allow_rdw, Error **errp)
 {
+GLOBAL_STATE_CODE();
+
 /* Do not set read_only if copy_on_read is enabled */
 if (bs->copy_on_read && read_only) {
 error_setg(errp, "Can't set node '%s' to r/o with copy-on-read 
enabled",
@@ -311,6 +313,7 @@ int bdrv_apply_auto_read_only(BlockDriverState *bs, const 
char *errmsg,
   Error **errp)
 {
 int ret = 0;
+GLOBAL_STATE_CODE();
 
 if (!(bs->open_flags & BDRV_O_RDWR)) {
 return 0;
@@ -393,6 +396,7 @@ char *bdrv_get_full_backing_filename(BlockDriverState *bs, 
Error **errp)
 void bdrv_register(BlockDriver *bdrv)
 {
 assert(bdrv->format_name);
+GLOBAL_STATE_CODE();
 QLIST_INSERT_HEAD(&bdrv_drivers, bdrv, list);
 }
 
@@ -401,6 +405,8 @@ BlockDriverState *bdrv_new(void)
 BlockDriverState *bs;
 int i;
 
+GLOBAL_STATE_CODE();
+
 bs = g_new0(BlockDriverState, 1);
 QLIST_INIT(&bs->dirty_bitmaps);
 for (i = 0; i < BLOCK_OP_TYPE_MAX; i++) {
@@ -443,6 +449,8 @@ BlockDriver *bdrv_find_format(const char *format_name)
 BlockDriver *drv1;
 int i;
 
+GLOBAL_STATE_CODE();
+
 drv1 = bdrv_do_find_format(format_name);
 if (drv1) {
 return drv1;
@@ -492,6 +500,7 @@ static int bdrv_format_is_whitelisted(const char 
*format_name, bool read_only)
 
 int bdrv_is_whitelisted(BlockDriver *drv, bool read_only)
 {
+GLOBAL_STATE_CODE();
 return bdrv_format_is_whitelisted(drv->format_name, read_only);
 }
 
@@ -527,6 +536,8 @@ int bdrv_create(BlockDriver *drv, const char* filename,
 {
 int ret;
 
+GLOBAL_STATE_CODE();
+
 Coroutine *co;
 CreateCo cco = {
 .drv = drv,
@@ -702,6 +713,8 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, 
Error **errp)
 QDict *qdict;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 drv = bdrv_find_protocol(filename, true, errp);
 if (drv == NULL) {
 return -ENOENT;
@@ -799,6 +812,7 @@ int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes 
*bsz)
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *filtered = bdrv_filter_bs(bs);
+GLOBAL_STATE_CODE();
 
 if (drv && drv->bdrv_probe_blocksizes) {
 return drv->bdrv_probe_blocksizes(bs, bsz);
@@ -819,6 +833,7 @@ int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry 
*geo)
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *filtered = bdrv_filter_bs(bs);
+GLOBAL_STATE_CODE();
 
 if (drv && drv->bdrv_probe_geometry) {
 return drv->bdrv_probe_geometry(bs, geo);
@@ -910,6 +925,7 @@ BlockDriver *bdrv_find_protocol(const char *filename,
 const char *p;
 int i;
 
+GLOBAL_STATE_CODE();
 /* TODO Drivers without bdrv_file_open must be specified explicitly */
 
 /*
@@ -1634,6 +1650,8 @@ BlockDriverState *bdrv_new_open_driver_opts(BlockDriver 
*drv,
 BlockDriverState *bs;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 bs = bdrv_new();
 bs->open_flags = flags;
 bs->options = options ?: qdict_new();
@@ -1659,6 +1677,7 @@ BlockDriverState *bdrv_new_open_driver_opts(BlockDriver 
*drv,
 BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
int flags, Error **errp)
 {
+GLOBAL_STATE_CODE();
 return bdrv_new_open_driver_opts(drv, node_name, NULL, flags, errp);
 }
 
@@ -3094,6 +3113,8 @@ BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
 BdrvChild *child = NULL;
 Transaction *tran = tran_new();
 
+GLOBAL_STATE_CODE();
+
 ret = bdrv_attach_child_noperm(parent_bs, child_bs, child_name, 
child_class,
child_role, &child, tran, errp);
 if (ret < 0) {
@@ -3120,6 +3141,8 @@ void bdrv_root_unref_child(BdrvChild *child)
 {
 BlockDriverState *child_bs;
 
+GLOBAL_STATE_CODE();
+
 child_bs = child->bs;
 bdrv_detach_child(&child);
 bdrv_unref(child_bs);
@@ -3194,6 +3217,7 @@ static void bdrv_unset_inherits_from(BlockDriverState 
*root, BdrvChild *child,
 /* Callers must ensure that child->frozen is false. */
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
 {
+GLOBAL_STATE_CODE();
 if (child == NULL) {
 return;
 }
@@ -3344,6 +3368,7 @@ int bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_

[PATCH v7 08/31] block/block-backend.c: assertions for block-backend

2022-02-11 Thread Emanuele Giuseppe Esposito

All the global state (GS) API functions will check that
qemu_in_main_thread() returns true. If not, it means
that the safety of BQL cannot be guaranteed, and
they need to be moved to I/O.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block/block-backend.c  | 79 ++
 softmmu/qdev-monitor.c |  2 ++
 2 files changed, 81 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 462e18facf..ebfc3fe954 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -239,6 +239,7 @@ static void blk_root_activate(BdrvChild *child, Error 
**errp)
 
 void blk_set_force_allow_inactivate(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 blk->force_allow_inactivate = true;
 }
 
@@ -357,6 +358,8 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, 
uint64_t shared_perm)
 {
 BlockBackend *blk;
 
+GLOBAL_STATE_CODE();
+
 blk = g_new0(BlockBackend, 1);
 blk->refcnt = 1;
 blk->ctx = ctx;
@@ -394,6 +397,8 @@ BlockBackend *blk_new_with_bs(BlockDriverState *bs, 
uint64_t perm,
 {
 BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);
 
+GLOBAL_STATE_CODE();
+
 if (blk_insert_bs(blk, bs, errp) < 0) {
 blk_unref(blk);
 return NULL;
@@ -422,6 +427,8 @@ BlockBackend *blk_new_open(const char *filename, const char 
*reference,
 uint64_t perm = 0;
 uint64_t shared = BLK_PERM_ALL;
 
+GLOBAL_STATE_CODE();
+
 /*
  * blk_new_open() is mainly used in .bdrv_create implementations and the
  * tools where sharing isn't a major concern because the BDS stays private
@@ -499,6 +506,7 @@ static void drive_info_del(DriveInfo *dinfo)
 
 int blk_get_refcnt(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk ? blk->refcnt : 0;
 }
 
@@ -509,6 +517,7 @@ int blk_get_refcnt(BlockBackend *blk)
 void blk_ref(BlockBackend *blk)
 {
 assert(blk->refcnt > 0);
+GLOBAL_STATE_CODE();
 blk->refcnt++;
 }
 
@@ -519,6 +528,7 @@ void blk_ref(BlockBackend *blk)
  */
 void blk_unref(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 if (blk) {
 assert(blk->refcnt > 0);
 if (blk->refcnt > 1) {
@@ -539,6 +549,7 @@ void blk_unref(BlockBackend *blk)
  */
 BlockBackend *blk_all_next(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk ? QTAILQ_NEXT(blk, link)
: QTAILQ_FIRST(&block_backends);
 }
@@ -547,6 +558,8 @@ void blk_remove_all_bs(void)
 {
 BlockBackend *blk = NULL;
 
+GLOBAL_STATE_CODE();
+
 while ((blk = blk_all_next(blk)) != NULL) {
 AioContext *ctx = blk_get_aio_context(blk);
 
@@ -570,6 +583,7 @@ void blk_remove_all_bs(void)
  */
 BlockBackend *blk_next(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return blk ? QTAILQ_NEXT(blk, monitor_link)
: QTAILQ_FIRST(&monitor_block_backends);
 }
@@ -636,6 +650,7 @@ static void bdrv_next_reset(BdrvNextIterator *it)
 
 BlockDriverState *bdrv_first(BdrvNextIterator *it)
 {
+GLOBAL_STATE_CODE();
 bdrv_next_reset(it);
 return bdrv_next(it);
 }
@@ -673,6 +688,7 @@ bool monitor_add_blk(BlockBackend *blk, const char *name, 
Error **errp)
 {
 assert(!blk->name);
 assert(name && name[0]);
+GLOBAL_STATE_CODE();
 
 if (!id_wellformed(name)) {
 error_setg(errp, "Invalid device name");
@@ -700,6 +716,8 @@ bool monitor_add_blk(BlockBackend *blk, const char *name, 
Error **errp)
  */
 void monitor_remove_blk(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
+
 if (!blk->name) {
 return;
 }
@@ -726,6 +744,7 @@ BlockBackend *blk_by_name(const char *name)
 {
 BlockBackend *blk = NULL;
 
+GLOBAL_STATE_CODE();
 assert(name);
 while ((blk = blk_next(blk)) != NULL) {
 if (!strcmp(name, blk->name)) {
@@ -760,6 +779,7 @@ static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
  */
 bool bdrv_has_blk(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 return bdrv_first_blk(bs) != NULL;
 }
 
@@ -770,6 +790,7 @@ bool bdrv_is_root_node(BlockDriverState *bs)
 {
 BdrvChild *c;
 
+GLOBAL_STATE_CODE();
 QLIST_FOREACH(c, &bs->parents, next_parent) {
 if (c->klass != &child_root) {
 return false;
@@ -819,6 +840,7 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
  */
 BlockBackendPublic *blk_get_public(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return &blk->public;
 }
 
@@ -827,6 +849,7 @@ BlockBackendPublic *blk_get_public(BlockBackend *blk)
  */
 BlockBackend *blk_by_public(BlockBackendPublic *public)
 {
+GLOBAL_STATE_CODE();
 return container_of(public, BlockBackend, public);
 }
 
@@ -838,6 +861,8 @@ void blk_remove_bs(BlockBackend *blk)
 ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 BdrvChild *root;
 
+GLOBAL_STATE_CODE();
+
 notifier_list_notify(&blk->remove_bs_notifiers, blk);
 if (tgm->throttle_state) {
 BlockDriverState *bs = blk_bs(blk);
@@ -872,6 +897,7 @@ void blk_remove_bs(BlockBackend *blk)
 int

[PATCH v7 15/31] include/block/blockjob_int.h: split header into I/O and GS API

2022-02-11 Thread Emanuele Giuseppe Esposito

Since the I/O functions are not many, keep a single file.
Also split the function pointers in BlockJobDriver.

Signed-off-by: Emanuele Giuseppe Esposito 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/blockjob_int.h | 28 
 1 file changed, 28 insertions(+)

diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
index 6633d83da2..6bd9ae2b20 100644
--- a/include/block/blockjob_int.h
+++ b/include/block/blockjob_int.h
@@ -38,6 +38,13 @@ struct BlockJobDriver {
 /** Generic JobDriver callbacks and settings */
 JobDriver job_driver;
 
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
 /*
  * Returns whether the job has pending requests for the child or will
  * submit new requests before the next pause point. This callback is polled
@@ -46,6 +53,13 @@ struct BlockJobDriver {
  */
 bool (*drained_poll)(BlockJob *job);
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /*
  * If the callback is not NULL, it will be invoked before the job is
  * resumed in a new AioContext.  This is the place to move any resources
@@ -56,6 +70,13 @@ struct BlockJobDriver {
 void (*set_speed)(BlockJob *job, int64_t speed);
 };
 
+/*
+ * Global state (GS) API. These functions run under the BQL.
+ *
+ * See include/block/block-global-state.h for more information about
+ * the GS API.
+ */
+
 /**
  * block_job_create:
  * @job_id: The id of the newly-created job, or %NULL to have one
@@ -98,6 +119,13 @@ void block_job_free(Job *job);
  */
 void block_job_user_resume(Job *job);
 
+/*
+ * I/O API functions. These functions are thread-safe.
+ *
+ * See include/block/block-io.h for more information about
+ * the I/O API.
+ */
+
 /**
  * block_job_ratelimit_get_delay:
  *
-- 
2.31.1

[PATCH v7 12/31] assertions for block_int global state API

2022-02-11 Thread Emanuele Giuseppe Esposito

Signed-off-by: Emanuele Giuseppe Esposito 
---
 block.c | 15 +++
 block/backup.c  |  1 +
 block/block-backend.c   |  3 +++
 block/commit.c  |  2 ++
 block/dirty-bitmap.c|  1 +
 block/io.c  |  1 +
 block/mirror.c  |  4 
 block/monitor/bitmap-qmp-cmds.c |  6 ++
 block/stream.c  |  2 ++
 blockdev.c  |  7 +++
 10 files changed, 42 insertions(+)

diff --git a/block.c b/block.c
index 2540ce3534..1fee5e41e8 100644
--- a/block.c
+++ b/block.c
@@ -665,6 +665,8 @@ int coroutine_fn bdrv_co_create_opts_simple(BlockDriver 
*drv,
 Error *local_err = NULL;
 int ret;
 
+GLOBAL_STATE_CODE();
+
 size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
 buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
 prealloc = qapi_enum_parse(&PreallocMode_lookup, buf,
@@ -2504,6 +2506,8 @@ void bdrv_get_cumulative_perm(BlockDriverState *bs, 
uint64_t *perm,
 uint64_t cumulative_perms = 0;
 uint64_t cumulative_shared_perms = BLK_PERM_ALL;
 
+GLOBAL_STATE_CODE();
+
 QLIST_FOREACH(c, &bs->parents, next_parent) {
 cumulative_perms |= c->perm;
 cumulative_shared_perms &= c->shared_perm;
@@ -2562,6 +2566,8 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
 Transaction *tran = tran_new();
 int ret;
 
+GLOBAL_STATE_CODE();
+
 bdrv_child_set_perm(c, perm, shared, tran);
 
 ret = bdrv_refresh_perms(c->bs, &local_err);
@@ -2592,6 +2598,8 @@ int bdrv_child_refresh_perms(BlockDriverState *bs, 
BdrvChild *c, Error **errp)
 uint64_t parent_perms, parent_shared;
 uint64_t perms, shared;
 
+GLOBAL_STATE_CODE();
+
 bdrv_get_cumulative_perm(bs, &parent_perms, &parent_shared);
 bdrv_child_perm(bs, c->bs, c, c->role, NULL,
 parent_perms, parent_shared, &perms, &shared);
@@ -2736,6 +2744,7 @@ void bdrv_default_perms(BlockDriverState *bs, BdrvChild 
*c,
 uint64_t perm, uint64_t shared,
 uint64_t *nperm, uint64_t *nshared)
 {
+GLOBAL_STATE_CODE();
 if (role & BDRV_CHILD_FILTERED) {
 assert(!(role & (BDRV_CHILD_DATA | BDRV_CHILD_METADATA |
  BDRV_CHILD_COW)));
@@ -3093,6 +3102,8 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState 
*child_bs,
 BdrvChild *child = NULL;
 Transaction *tran = tran_new();
 
+GLOBAL_STATE_CODE();
+
 ret = bdrv_attach_child_common(child_bs, child_name, child_class,
child_role, perm, shared_perm, opaque,
&child, tran, errp);
@@ -7484,6 +7495,8 @@ bool bdrv_recurse_can_replace(BlockDriverState *bs,
 {
 BlockDriverState *filtered;
 
+GLOBAL_STATE_CODE();
+
 if (!bs || !bs->drv) {
 return false;
 }
@@ -7655,6 +7668,7 @@ static bool append_strong_runtime_options(QDict *d, 
BlockDriverState *bs)
  * would result in exactly bs->backing. */
 static bool bdrv_backing_overridden(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 if (bs->backing) {
 return strcmp(bs->auto_backing_file,
   bs->backing->bs->filename);
@@ -8043,6 +8057,7 @@ static BlockDriverState 
*bdrv_do_skip_filters(BlockDriverState *bs,
  */
 BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
 {
+GLOBAL_STATE_CODE();
 return bdrv_do_skip_filters(bs, true);
 }
 
diff --git a/block/backup.c b/block/backup.c
index 21d5983779..5cfd0b999c 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -372,6 +372,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 
 assert(bs);
 assert(target);
+GLOBAL_STATE_CODE();
 
 /* QMP interface protects us from these cases */
 assert(sync_mode != MIRROR_SYNC_MODE_INCREMENTAL);
diff --git a/block/block-backend.c b/block/block-backend.c
index 03b1e36a3c..292481069f 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1106,6 +1106,7 @@ static void blk_root_change_media(BdrvChild *child, bool 
load)
  */
 bool blk_dev_has_removable_media(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
 }
 
@@ -1123,6 +1124,7 @@ bool blk_dev_has_tray(BlockBackend *blk)
  */
 void blk_dev_eject_request(BlockBackend *blk, bool force)
 {
+GLOBAL_STATE_CODE();
 if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
 blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
 }
@@ -1145,6 +1147,7 @@ bool blk_dev_is_tray_open(BlockBackend *blk)
  */
 bool blk_dev_is_medium_locked(BlockBackend *blk)
 {
+GLOBAL_STATE_CODE();
 if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
 return blk->dev_ops->is_medium_locked(blk->dev_opaque);
 }
diff --git a/block/commit.c b/block/commit.c
index 2ce6637ca6..c76899f640 100644
--- a/block/commit.c
+++ b/block/commi

[PATCH v7 01/31] main-loop.h: introduce qemu_in_main_thread()

2022-02-11 Thread Emanuele Giuseppe Esposito

When invoked from the main loop, this function is the same
as qemu_mutex_iothread_locked, and returns true if the BQL is held.
When invoked from iothreads or tests, it returns true only
if the current AioContext is the Main Loop.

This essentially just extends qemu_mutex_iothread_locked to work
also in unit tests or other users like storage-daemon, that run
in the Main Loop but end up using the implementation in
stubs/iothread-lock.c.

Using qemu_mutex_iothread_locked in unit tests defaults to false
because they use the implementation in stubs/iothread-lock,
making all assertions added in next patches fail despite the
AioContext is still the main loop.

See the comment in the function header for more information.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/qemu/main-loop.h | 24 
 softmmu/cpus.c   |  5 +
 stubs/iothread-lock.c|  5 +
 3 files changed, 34 insertions(+)

diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 8dbc6fcb89..bc42b5939d 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -242,9 +242,33 @@ AioContext *iohandler_get_aio_context(void);
  * must always be taken outside other locks.  This function helps
  * functions take different paths depending on whether the current
  * thread is running within the main loop mutex.
+ *
+ * This function should never be used in the block layer, because
+ * unit tests, block layer tools and qemu-storage-daemon do not
+ * have a BQL.
+ * Please instead refer to qemu_in_main_thread().
  */
 bool qemu_mutex_iothread_locked(void);
 
+/**
+ * qemu_in_main_thread: return whether it's possible to safely access
+ * the global state of the block layer.
+ *
+ * Global state of the block layer is not accessible from I/O threads
+ * or worker threads; only from threads that "own" the default
+ * AioContext that qemu_get_aio_context() returns.  For tests, block
+ * layer tools and qemu-storage-daemon there is a designated thread that
+ * runs the event loop for qemu_get_aio_context(), and that is the
+ * main thread.
+ *
+ * For emulators, however, any thread that holds the BQL can act
+ * as the block layer main thread; this will be any of the actual
+ * main thread, the vCPU threads or the RCU thread.
+ *
+ * For clarity, do not use this function outside the block layer.
+ */
+bool qemu_in_main_thread(void);
+
 /**
  * qemu_mutex_lock_iothread: Lock the main loop mutex.
  *
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index 23bca46b07..fd4e139289 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -485,6 +485,11 @@ bool qemu_mutex_iothread_locked(void)
 return iothread_locked;
 }
 
+bool qemu_in_main_thread(void)
+{
+return qemu_mutex_iothread_locked();
+}
+
 /*
  * The BQL is taken from so many places that it is worth profiling the
  * callers directly, instead of funneling them all through a single function.
diff --git a/stubs/iothread-lock.c b/stubs/iothread-lock.c
index 5b45b7fc8b..ff7386e42c 100644
--- a/stubs/iothread-lock.c
+++ b/stubs/iothread-lock.c
@@ -6,6 +6,11 @@ bool qemu_mutex_iothread_locked(void)
 return false;
 }
 
+bool qemu_in_main_thread(void)
+{
+return qemu_get_current_aio_context() == qemu_get_aio_context();
+}
+
 void qemu_mutex_lock_iothread_impl(const char *file, int line)
 {
 }
-- 
2.31.1

[PATCH v7 00/31] block layer: split block APIs in global state and I/O

2022-02-11 Thread Emanuele Giuseppe Esposito

Currently, block layer APIs like block.h contain a mix of
functions that are either running in the main loop and under the
BQL, or are thread-safe functions and run in iothreads performing I/O.
The functions running under BQL also take care of modifying the
block graph, by using drain and/or aio_context_acquire/release.
This makes it very confusing to understand where each function
runs, and what assumptions it provided with regards to thread
safety.

We call the functions running under BQL "global state (GS) API", and
distinguish them from the thread-safe "I/O API".

The aim of this series is to split the relevant block headers in
global state and I/O sub-headers. The division will be done in
this way:
header.h will be split in header-global-state.h, header-io.h and
header-common.h. The latter will just contain the data structures
needed by header-global-state and header-io, and common helpers
that are neither in GS nor in I/O. header.h will remain for
legacy and to avoid changing all includes in all QEMU c files,
but will only include the two new headers. No function shall be
added in header.c .
Once we split all relevant headers, it will be much easier to see what
uses the AioContext lock and remove it, which is the overall main
goal of this and other series that I posted/will post.

In addition to splitting the relevant headers shown in this series,
it is also very helpful splitting the function pointers in some
block structures, to understand what runs under AioContext lock and
what doesn't. This is what patches 21-27 do.

Each function in the GS API will have an assertion, checking
that it is always running under BQL.
I/O functions are instead thread safe (or so should be), meaning
that they *can* run under BQL, but also in an iothread in another
AioContext. Therefore they do not provide any assertion, and
need to be audited manually to verify the correctness.

Adding assetions has helped finding 2 bugs already, as shown in
my series "Migration: fix missing iothread locking".

Tested this series by running unit tests, qemu-iotests and qtests
(x86_64).
Some functions in the GS API are used everywhere but not
properly tested. Therefore their assertion is never actually run in
the tests, so despite my very careful auditing, it is not impossible
to exclude that some will trigger while actually using QEMU.

Patch 1 introduces qemu_in_main_thread(), the function used in
all assertions. This had to be introduced otherwise all unit tests
would fail, since they run in the main loop but use the code in
stubs/iothread.c
Patches 2-27 (with the exception of patch 9-10, that are an additional
assert) are all structured in the same way: first we split the header
and in the next (or same, if small) patch we add assertions.
Patch 28-31 take care instead of the block layer permission API,
fixing some bugs where they are used in I/O functions.

This serie depends on my previous serie "block layer: permission API
refactoring in preparation to the API split"

Based-on: <20220209105452.1694545-1-eespo...@redhat.com>

Signed-off-by: Emanuele Giuseppe Esposito 
---
v7:
* crypto permissions and bdrv-activate patches sent in another serie
* (*bdrv_probe) and (*get_name) are I/O
* add missing license header in block-common.h
* in block-common.h:
  bdrv_parse_cache_mode
  bdrv_parse_discard_flags
  bdrv_perm_names
  bdrv_qapi_perm_to_blk_perm
  bdrv_init_with_whitelist
  bdrv_uses_whitelist
  bdrv_is_whitelisted
* in block-io.h:
  bdrv_get_full_backing_filename
  bdrv_make_zero
  bdrv_aio_cancel
* Introduce new "Global OR I/O" category for the functions using
BDRV_POLL_WHILE
  Functions in this category:
BDRV_POLL_WHILE
bdrv_drain
bdrv_co_drain
bdrv_truncate
bdrv_check
bdrv_invalidate_cache
bdrv_flush
bdrv_pdiscard
bdrv_readv_vmstate
bdrv_writev_vmstate
bdrv_parent_drained_begin_single
bdrv_parent_drained_end_single
bdrv_drain_poll
bdrv_drained_begin
bdrv_do_drained_begin_quiesce
bdrv_subtree_drained_begin
bdrv_drained_end
bdrv_drained_end_no_poll
bdrv_subtree_drained_end
* better comment descriptions of common, GS, I/O and "I/O or GS" categories
* remove job_pre_run, we don't really need it.
* insert assertion GLOBAL_STATE_CODE, IO_CODE and IO_OR_GS_CODE macros
* replace all assert(qemu_in_main_thread()) with GLOBAL_STATE_CODE
* use IO_CODE and IO_OR_GS_CODE assertions, in additional patches

v6:
* Additional assertions in "block.c: add assertions to static functions"
* bdrv_co_invalidate_cache: create a new GS function bdrv_activate, and move
  all GS logic of bdrv_co_invalidate_cache there, so that the
  coroutine only runs I/O code. Move the resulting 3 patches before
  "block/coroutines: I/O API"
* crypto (patch 30): introduce bdrv_amend_pre_run and bdrv_clean, along with
  job_pre_run and job_clean to be sure of setting the permissions in GS
* remove TODO in blk_{get/set}_perm, and handle the RESIZE permission in patch 6
* in I/O:
blk_ioctl

Re: [PATCH v2 1/8] tests/qemu-iotests/testrunner: Allow parallel test invocations

2022-02-11 Thread Thomas Huth


On 11/02/2022 10.29, Kevin Wolf wrote:

Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:

If multiple tests run in parallel, they must use unique file
names for the test output.

Suggested-by: Hanna Reitz 
Signed-off-by: Thomas Huth 
---
  tests/qemu-iotests/testrunner.py | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/testrunner.py b/tests/qemu-iotests/testrunner.py
index 0eace147b8..9d20f51bb1 100644
--- a/tests/qemu-iotests/testrunner.py
+++ b/tests/qemu-iotests/testrunner.py
@@ -259,7 +259,7 @@ def do_run_test(self, test: str, mp: bool) -> TestResult:
  """
  
  f_test = Path(test)

-f_bad = Path(f_test.name + '.out.bad')
+f_bad = Path(f'{os.getpid()}-{f_test.name}.out.bad')
  f_notrun = Path(f_test.name + '.notrun')
  f_casenotrun = Path(f_test.name + '.casenotrun')
  f_reference = Path(self.find_reference(test))


Hmm... It does make sense, but nobody ever cleans those files up.
Currently, the next run of the test will just overwrite the existing
file or delete it when the test succeeds. So after running the test
suite, you have .out.bad files for every failed test, but not for those
that succeeded.

After this change, won't the test directory accumulate tons of .out.bad
files over time?


True ... but we certainly want to keep the file for failed tests for further 
analysis instead of immediately deleting them ... maybe it would be enough 
to encode the image format (qcow2, qed, vmdk, ...) into the output name, 
instead of using the PID, so that "make check SPEED=thorough" works as 
expected here?


 Thomas

[PULL v2 2/7] block/nbd: Delete open timer when done

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

We start the open timer to cancel the connection attempt after a while.
Once nbd_do_establish_connection() has returned, the attempt is over,
and we no longer need the timer.

Delete it before returning from nbd_open(), so that it does not persist
for longer.  It has no use after nbd_open(), and just like the reconnect
delay timer, it might well be dangerous if it were to fire afterwards.

Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 16cd7fef77..5ff8a57314 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -1885,11 +1885,19 @@ static int nbd_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto fail;
 }
 
+/*
+ * The connect attempt is done, so we no longer need this timer.
+ * Delete it, because we do not want it to be around when this node
+ * is drained or closed.
+ */
+open_timer_del(s);
+
 nbd_client_connection_enable_retry(s->conn);
 
 return 0;
 
 fail:
+open_timer_del(s);
 nbd_clear_bdrvstate(bs);
 return ret;
 }
-- 
2.31.1

[PULL v2 7/7] iotests/281: Let NBD connection yield in iothread

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

Put an NBD block device into an I/O thread, and then read data from it,
hoping that the NBD connection will yield during that read.  When it
does, the coroutine must be reentered in the block device's I/O thread,
which will only happen if the NBD block driver attaches the connection's
QIOChannel to the new AioContext.  It did not do that after 4ddb5d2fde
("block/nbd: drop connection_co") and prior to "block/nbd: Move s->ioc
on AioContext change", which would cause an assertion failure.

To improve our chances of yielding, the NBD server is throttled to
reading 64 kB/s, and the NBD client reads 128 kB, so it should yield at
some point.

Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/281 | 28 +---
 tests/qemu-iotests/281.out |  4 ++--
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tests/qemu-iotests/281 b/tests/qemu-iotests/281
index 4fb3cd30dd..5e1339bd75 100755
--- a/tests/qemu-iotests/281
+++ b/tests/qemu-iotests/281
@@ -253,8 +253,9 @@ class TestYieldingAndTimers(iotests.QMPTestCase):
 self.create_nbd_export()
 
 # Simple VM with an NBD block device connected to the NBD export
-# provided by the QSD
+# provided by the QSD, and an (initially unused) iothread
 self.vm = iotests.VM()
+self.vm.add_object('iothread,id=iothr')
 self.vm.add_blockdev('nbd,node-name=nbd,server.type=unix,' +
  f'server.path={self.sock},export=exp,' +
  'reconnect-delay=1,open-timeout=1')
@@ -299,19 +300,40 @@ class TestYieldingAndTimers(iotests.QMPTestCase):
 # thus not see the error, and so the test will pass.)
 time.sleep(2)
 
+def test_yield_in_iothread(self):
+# Move the NBD node to the I/O thread; the NBD block driver should
+# attach the connection's QIOChannel to that thread's AioContext, too
+result = self.vm.qmp('x-blockdev-set-iothread',
+ node_name='nbd', iothread='iothr')
+self.assert_qmp(result, 'return', {})
+
+# Do some I/O that will be throttled by the QSD, so that the network
+# connection hopefully will yield here.  When it is resumed, it must
+# then be resumed in the I/O thread's AioContext.
+result = self.vm.qmp('human-monitor-command',
+ command_line='qemu-io nbd "read 0 128K"')
+self.assert_qmp(result, 'return', '')
+
 def create_nbd_export(self):
 assert self.qsd is None
 
-# Simple NBD export of a null-co BDS
+# Export a throttled null-co BDS: Reads are throttled (max 64 kB/s),
+# writes are not.
 self.qsd = QemuStorageDaemon(
+'--object',
+'throttle-group,id=thrgr,x-bps-read=65536,x-bps-read-max=65536',
+
 '--blockdev',
 'null-co,node-name=null,read-zeroes=true',
 
+'--blockdev',
+'throttle,node-name=thr,file=null,throttle-group=thrgr',
+
 '--nbd-server',
 f'addr.type=unix,addr.path={self.sock}',
 
 '--export',
-'nbd,id=exp,node-name=null,name=exp,writable=true'
+'nbd,id=exp,node-name=thr,name=exp,writable=true'
 )
 
 def stop_nbd_export(self):
diff --git a/tests/qemu-iotests/281.out b/tests/qemu-iotests/281.out
index 914e3737bd..3f8a935a08 100644
--- a/tests/qemu-iotests/281.out
+++ b/tests/qemu-iotests/281.out
@@ -1,5 +1,5 @@
-.
+..
 --
-Ran 5 tests
+Ran 6 tests
 
 OK
-- 
2.31.1

[PULL v2 6/7] block/nbd: Move s->ioc on AioContext change

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

s->ioc must always be attached to the NBD node's AioContext.  If that
context changes, s->ioc must be attached to the new context.

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=2033626
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 45 +
 1 file changed, 45 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index dc6c3f3bbc..5853d85d60 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -2055,6 +2055,42 @@ static void nbd_cancel_in_flight(BlockDriverState *bs)
 nbd_co_establish_connection_cancel(s->conn);
 }
 
+static void nbd_attach_aio_context(BlockDriverState *bs,
+   AioContext *new_context)
+{
+BDRVNBDState *s = bs->opaque;
+
+/* The open_timer is used only during nbd_open() */
+assert(!s->open_timer);
+
+/*
+ * The reconnect_delay_timer is scheduled in I/O paths when the
+ * connection is lost, to cancel the reconnection attempt after a
+ * given time.  Once this attempt is done (successfully or not),
+ * nbd_reconnect_attempt() ensures the timer is deleted before the
+ * respective I/O request is resumed.
+ * Since the AioContext can only be changed when a node is drained,
+ * the reconnect_delay_timer cannot be active here.
+ */
+assert(!s->reconnect_delay_timer);
+
+if (s->ioc) {
+qio_channel_attach_aio_context(s->ioc, new_context);
+}
+}
+
+static void nbd_detach_aio_context(BlockDriverState *bs)
+{
+BDRVNBDState *s = bs->opaque;
+
+assert(!s->open_timer);
+assert(!s->reconnect_delay_timer);
+
+if (s->ioc) {
+qio_channel_detach_aio_context(s->ioc);
+}
+}
+
 static BlockDriver bdrv_nbd = {
 .format_name= "nbd",
 .protocol_name  = "nbd",
@@ -2078,6 +2114,9 @@ static BlockDriver bdrv_nbd = {
 .bdrv_dirname   = nbd_dirname,
 .strong_runtime_opts= nbd_strong_runtime_opts,
 .bdrv_cancel_in_flight  = nbd_cancel_in_flight,
+
+.bdrv_attach_aio_context= nbd_attach_aio_context,
+.bdrv_detach_aio_context= nbd_detach_aio_context,
 };
 
 static BlockDriver bdrv_nbd_tcp = {
@@ -2103,6 +2142,9 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_dirname   = nbd_dirname,
 .strong_runtime_opts= nbd_strong_runtime_opts,
 .bdrv_cancel_in_flight  = nbd_cancel_in_flight,
+
+.bdrv_attach_aio_context= nbd_attach_aio_context,
+.bdrv_detach_aio_context= nbd_detach_aio_context,
 };
 
 static BlockDriver bdrv_nbd_unix = {
@@ -2128,6 +2170,9 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_dirname   = nbd_dirname,
 .strong_runtime_opts= nbd_strong_runtime_opts,
 .bdrv_cancel_in_flight  = nbd_cancel_in_flight,
+
+.bdrv_attach_aio_context= nbd_attach_aio_context,
+.bdrv_detach_aio_context= nbd_detach_aio_context,
 };
 
 static void bdrv_nbd_init(void)
-- 
2.31.1

[PULL v2 1/7] block/nbd: Delete reconnect delay timer when done

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

We start the reconnect delay timer to cancel the reconnection attempt
after a while.  Once nbd_co_do_establish_connection() has returned, this
attempt is over, and we no longer need the timer.

Delete it before returning from nbd_reconnect_attempt(), so that it does
not persist beyond the I/O request that was paused for reconnecting; we
do not want it to fire in a drained section, because all sort of things
can happen in such a section (e.g. the AioContext might be changed, and
we do not want the timer to fire in the wrong context; or the BDS might
even be deleted, and so the timer CB would access already-freed data).

Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 63dbfa807d..16cd7fef77 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -381,6 +381,13 @@ static coroutine_fn void 
nbd_reconnect_attempt(BDRVNBDState *s)
 }
 
 nbd_co_do_establish_connection(s->bs, NULL);
+
+/*
+ * The reconnect attempt is done (maybe successfully, maybe not), so
+ * we no longer need this timer.  Delete it so it will not outlive
+ * this I/O request (so draining removes all timers).
+ */
+reconnect_delay_timer_del(s);
 }
 
 static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
-- 
2.31.1

[PULL v2 0/7] nbd: handle AioContext change correctly

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20220208' 
into staging (2022-02-08 11:40:08 +)

are available in the Git repository at:

  https://src.openvz.org/scm/~vsementsov/qemu.git tags/pull-nbd-2022-02-09-v2

for you to fetch changes up to 8cfbe929e8c26050f0a4580a1606a370a947d4ce:

  iotests/281: Let NBD connection yield in iothread (2022-02-11 14:06:18 +0100)


nbd: handle AioContext change correctly

v2: add my s-o-b marks to each commit



Hanna Reitz (7):
  block/nbd: Delete reconnect delay timer when done
  block/nbd: Delete open timer when done
  block/nbd: Assert there are no timers when closed
  iotests.py: Add QemuStorageDaemon class
  iotests/281: Test lingering timers
  block/nbd: Move s->ioc on AioContext change
  iotests/281: Let NBD connection yield in iothread

 block/nbd.c   |  64 +
 tests/qemu-iotests/281| 101 +-
 tests/qemu-iotests/281.out|   4 +-
 tests/qemu-iotests/iotests.py |  40 ++
 4 files changed, 205 insertions(+), 4 deletions(-)

-- 
2.31.1

Re: [PULL 0/7] nbd: handle AioContext change correctly

2022-02-11 Thread Vladimir Sementsov-Ogievskiy


11.02.2022 15:52, Peter Maydell wrote:

On Wed, 9 Feb 2022 at 14:03, Vladimir Sementsov-Ogievskiy
 wrote:


The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:

   Merge remote-tracking branch 
'remotes/pmaydell/tags/pull-target-arm-20220208' into staging (2022-02-08 
11:40:08 +)

are available in the Git repository at:

   https://src.openvz.org/scm/~vsementsov/qemu.git tags/pull-nbd-2022-02-09

for you to fetch changes up to 1bd4523c2ded28b7805b971b9d3d746beabd0a94:

   iotests/281: Let NBD connection yield in iothread (2022-02-09 14:15:29 +0100)


nbd: handle AioContext change correctly



Hi; this pullreq is OK content-wise, but the commits are missing
your Signed-off-by: line as the submaintainer/submitter of the
pull request. Could you add them and resend, please?



Oops, sorry. I thought I'm already an experienced maintainer and don't need to re-read 
"Submitting a Pull Request" instruction every time. I was wrong:)


--
Best regards,
Vladimir

Re: [PULL 0/7] nbd: handle AioContext change correctly

2022-02-11 Thread Peter Maydell

On Wed, 9 Feb 2022 at 14:03, Vladimir Sementsov-Ogievskiy
 wrote:
>
> The following changes since commit 0a301624c2f4ced3331ffd5bce85b4274fe132af:
>
>   Merge remote-tracking branch 
> 'remotes/pmaydell/tags/pull-target-arm-20220208' into staging (2022-02-08 
> 11:40:08 +)
>
> are available in the Git repository at:
>
>   https://src.openvz.org/scm/~vsementsov/qemu.git tags/pull-nbd-2022-02-09
>
> for you to fetch changes up to 1bd4523c2ded28b7805b971b9d3d746beabd0a94:
>
>   iotests/281: Let NBD connection yield in iothread (2022-02-09 14:15:29 
> +0100)
>
> 
> nbd: handle AioContext change correctly
>

Hi; this pullreq is OK content-wise, but the commits are missing
your Signed-off-by: line as the submaintainer/submitter of the
pull request. Could you add them and resend, please?

thanks
-- PMM

Re: [PATCH 3/6] block.c: bdrv_replace_child_noperm: first call ->attach(), and then add child

2022-02-11 Thread Kevin Wolf

Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
> Doing the opposite can make adding the child node to a non-drained node,
> as apply_subtree_drain is only done in ->attach() and thus make
> assert_bdrv_graph_writable fail.
> 
> This can happen for example during a transaction rollback (test 245,
> test_io_with_graph_changes):
> 1. a node is removed from the graph, thus it is undrained
> 2. then something happens, and we need to roll back the transactions
>through tran_abort()
> 3. at this point, the current code would first attach the undrained node
>to the graph via QLIST_INSERT_HEAD, and then call ->attach() that
>will take care of restoring the drain with apply_subtree_drain(),
>leaving the node undrained between the two operations.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  block.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/block.c b/block.c
> index ec346a7e2e..08a6e3a4ef 100644
> --- a/block.c
> +++ b/block.c
> @@ -2872,8 +2872,6 @@ static void bdrv_replace_child_noperm(BdrvChild 
> **childp,
>  }
>  
>  if (new_bs) {
> -assert_bdrv_graph_writable(new_bs);
> -QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
>  
>  /*
>   * Detaching the old node may have led to the new node's
> @@ -2890,6 +2888,10 @@ static void bdrv_replace_child_noperm(BdrvChild 
> **childp,
>  if (child->klass->attach) {
>  child->klass->attach(child);
>  }
> +
> +assert_bdrv_graph_writable(new_bs);
> +QLIST_INSERT_HEAD(&new_bs->parents, child, next_parent);
> +
>  }

Extra empty line. Looks good otherwise.

Does this also mean that the order in bdrv_child_cb_attach/detach() is
wrong? Or maybe adding a new node to bs->children is okay even when the
child node isn't drained.

Kevin

[PULL v2 5/7] iotests/281: Test lingering timers

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

Prior to "block/nbd: Delete reconnect delay timer when done" and
"block/nbd: Delete open timer when done", both of those timers would
remain scheduled even after successfully (re-)connecting to the server,
and they would not even be deleted when the BDS is deleted.

This test constructs exactly this situation:
(1) Configure an @open-timeout, so the open timer is armed, and
(2) Configure a @reconnect-delay and trigger a reconnect situation
(which succeeds immediately), so the reconnect delay timer is armed.
Then we immediately delete the BDS, and sleep for longer than the
@open-timeout and @reconnect-delay.  Prior to said patches, this caused
one (or both) of the timer CBs to access already-freed data.

Accessing freed data may or may not crash, so this test can produce
false successes, but I do not know how to show the problem in a better
or more reliable way.  If you run this test on "block/nbd: Assert there
are no timers when closed" and without the fix patches mentioned above,
you should reliably see an assertion failure.
(But all other tests that use the reconnect delay timer (264 and 277)
will fail in that configuration, too; as will nbd-reconnect-on-open,
which uses the open timer.)

Remove this test from the quick group because of the two second sleep
this patch introduces.

(I decided to put this test case into 281, because the main bug this
series addresses is in the interaction of the NBD block driver and I/O
threads, which is precisely the scope of 281.  The test case for that
other bug will also be put into the test class added here.

Also, excuse the test class's name, I couldn't come up with anything
better.  The "yield" part will make sense two patches from now.)

Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/281 | 79 +-
 tests/qemu-iotests/281.out |  4 +-
 2 files changed, 79 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/281 b/tests/qemu-iotests/281
index 318e333939..4fb3cd30dd 100755
--- a/tests/qemu-iotests/281
+++ b/tests/qemu-iotests/281
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# group: rw quick
+# group: rw
 #
 # Test cases for blockdev + IOThread interactions
 #
@@ -20,8 +20,9 @@
 #
 
 import os
+import time
 import iotests
-from iotests import qemu_img
+from iotests import qemu_img, QemuStorageDaemon
 
 image_len = 64 * 1024 * 1024
 
@@ -243,6 +244,80 @@ class TestBlockdevBackupAbort(iotests.QMPTestCase):
 # Hangs on failure, we expect this error.
 self.assert_qmp(result, 'error/class', 'GenericError')
 
+# Test for RHBZ#2033626
+class TestYieldingAndTimers(iotests.QMPTestCase):
+sock = os.path.join(iotests.sock_dir, 'nbd.sock')
+qsd = None
+
+def setUp(self):
+self.create_nbd_export()
+
+# Simple VM with an NBD block device connected to the NBD export
+# provided by the QSD
+self.vm = iotests.VM()
+self.vm.add_blockdev('nbd,node-name=nbd,server.type=unix,' +
+ f'server.path={self.sock},export=exp,' +
+ 'reconnect-delay=1,open-timeout=1')
+
+self.vm.launch()
+
+def tearDown(self):
+self.stop_nbd_export()
+self.vm.shutdown()
+
+def test_timers_with_blockdev_del(self):
+# The NBD BDS will have had an active open timer, because setUp() gave
+# a positive value for @open-timeout.  It should be gone once the BDS
+# has been opened.
+# (But there used to be a bug where it remained active, which will
+# become important below.)
+
+# Stop and restart the NBD server, and do some I/O on the client to
+# trigger a reconnect and start the reconnect delay timer
+self.stop_nbd_export()
+self.create_nbd_export()
+
+result = self.vm.qmp('human-monitor-command',
+ command_line='qemu-io nbd "write 0 512"')
+self.assert_qmp(result, 'return', '')
+
+# Reconnect is done, so the reconnect delay timer should be gone.
+# (This is similar to how the open timer should be gone after open,
+# and similarly there used to be a bug where it was not gone.)
+
+# Delete the BDS to see whether both timers are gone.  If they are not,
+# they will remain active, fire later, and then access freed data.
+# (Or, with "block/nbd: Assert there are no timers when closed"
+# applied, the assertions added in that patch will fail.)
+result = self.vm.qmp('blockdev-del', node_name='nbd')
+self.assert_qmp(result, 'return', {})
+
+# Give the timers some time to fire (both have a timeout of 1 s).
+# (Sleeping in an iotest may ring some alarm bells, but note that if
+# the timing is off here, the test will just always pass.  If we kill
+# the VM too early, then we just kill the timers before th

[PULL v2 4/7] iotests.py: Add QemuStorageDaemon class

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

This is a rather simple class that allows creating a QSD instance
running in the background and stopping it when no longer needed.

The __del__ handler is a safety net for when something goes so wrong in
a test that e.g. the tearDown() method is not called (e.g. setUp()
launches the QSD, but then launching a VM fails).  We do not want the
QSD to continue running after the test has failed, so __del__() will
take care to kill it.

Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/iotests.py | 40 +++
 1 file changed, 40 insertions(+)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 8cdb381f2a..6ba65eb1ff 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -73,6 +73,8 @@
 qemu_prog = os.environ.get('QEMU_PROG', 'qemu')
 qemu_opts = os.environ.get('QEMU_OPTIONS', '').strip().split(' ')
 
+qsd_prog = os.environ.get('QSD_PROG', 'qemu-storage-daemon')
+
 gdb_qemu_env = os.environ.get('GDB_OPTIONS')
 qemu_gdb = []
 if gdb_qemu_env:
@@ -345,6 +347,44 @@ def cmd(self, cmd):
 return self._read_output()
 
 
+class QemuStorageDaemon:
+def __init__(self, *args: str, instance_id: str = 'a'):
+assert '--pidfile' not in args
+self.pidfile = os.path.join(test_dir, f'qsd-{instance_id}-pid')
+all_args = [qsd_prog] + list(args) + ['--pidfile', self.pidfile]
+
+# Cannot use with here, we want the subprocess to stay around
+# pylint: disable=consider-using-with
+self._p = subprocess.Popen(all_args)
+while not os.path.exists(self.pidfile):
+if self._p.poll() is not None:
+cmd = ' '.join(all_args)
+raise RuntimeError(
+'qemu-storage-daemon terminated with exit code ' +
+f'{self._p.returncode}: {cmd}')
+
+time.sleep(0.01)
+
+with open(self.pidfile, encoding='utf-8') as f:
+self._pid = int(f.read().strip())
+
+assert self._pid == self._p.pid
+
+def stop(self, kill_signal=15):
+self._p.send_signal(kill_signal)
+self._p.wait()
+self._p = None
+
+try:
+os.remove(self.pidfile)
+except OSError:
+pass
+
+def __del__(self):
+if self._p is not None:
+self.stop(kill_signal=9)
+
+
 def qemu_nbd(*args):
 '''Run qemu-nbd in daemon mode and return the parent's exit code'''
 return subprocess.call(qemu_nbd_args + ['--fork'] + list(args))
-- 
2.31.1

[PULL v2 3/7] block/nbd: Assert there are no timers when closed

2022-02-11 Thread Vladimir Sementsov-Ogievskiy

From: Hanna Reitz 

Our two timers must not remain armed beyond nbd_clear_bdrvstate(), or
they will access freed data when they fire.

This patch is separate from the patches that actually fix the issue
(HEAD^^ and HEAD^) so that you can run the associated regression iotest
(281) on a configuration that reproducibly exposes the bug.

Reviewed-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Hanna Reitz 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 5ff8a57314..dc6c3f3bbc 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -110,6 +110,10 @@ static void nbd_clear_bdrvstate(BlockDriverState *bs)
 
 yank_unregister_instance(BLOCKDEV_YANK_INSTANCE(bs->node_name));
 
+/* Must not leave timers behind that would access freed data */
+assert(!s->reconnect_delay_timer);
+assert(!s->open_timer);
+
 object_unref(OBJECT(s->tlscreds));
 qapi_free_SocketAddress(s->saddr);
 s->saddr = NULL;
-- 
2.31.1

Re: [PATCH v12 5/8] qmp: decode feature & status bits in virtio-status

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> Display feature names instead of bitmaps for host, guest, and
> backend for VirtIODevices.
>
> Display status names instead of bitmaps for VirtIODevices.
>
> Display feature names instead of bitmaps for backend, protocol,
> acked, and features (hdev->features) for vhost devices.
>
> Decode features according to device ID. Decode statuses
> according to configuration status bitmap (config_status_map).
> Decode vhost user protocol features according to vhost user
> protocol bitmap (vhost_user_protocol_map).
>
> Transport features are on the first line. Undecoded bits (if
> any) are stored in a separate field.
>
> Signed-off-by: Jonah Palmer 

[...]

> diff --git a/qapi/virtio.json b/qapi/virtio.json
> index ba61d83..474a8bd 100644
> --- a/qapi/virtio.json
> +++ b/qapi/virtio.json
> @@ -106,10 +106,10 @@
>  'n-tmp-sections': 'int',
>  'nvqs': 'uint32',
>  'vq-index': 'int',
> -'features': 'uint64',
> -'acked-features': 'uint64',
> -'backend-features': 'uint64',
> -'protocol-features': 'uint64',
> +'features': 'VirtioDeviceFeatures',
> +'acked-features': 'VirtioDeviceFeatures',
> +'backend-features': 'VirtioDeviceFeatures',
> +'protocol-features': 'VhostDeviceProtocols',
>  'max-queues': 'uint64',
>  'backend-cap': 'uint64',
>  'log-enabled': 'bool',
> @@ -176,11 +176,11 @@
>  'device-id': 'uint16',
>  'vhost-started': 'bool',
>  'device-endian': 'str',
> -'guest-features': 'uint64',
> -'host-features': 'uint64',
> -'backend-features': 'uint64',
> +'guest-features': 'VirtioDeviceFeatures',
> +'host-features': 'VirtioDeviceFeatures',
> +'backend-features': 'VirtioDeviceFeatures',
>  'num-vqs': 'int',
> -'status': 'uint8',
> +'status': 'VirtioDeviceStatus',
>  'isr': 'uint8',
>  'queue-sel': 'uint16',
>  'vm-running': 'bool',
> @@ -222,14 +222,28 @@
>  #"name": "virtio-crypto",
>  #"started": true,
>  #"device-id": 20,
> -#"backend-features": 0,
> +#"backend-features": {
> +#   "transports": [],
> +#   "dev-features": []
> +#},
>  #"start-on-kick": false,
>  #"isr": 1,
>  #"broken": false,
> -#"status": 15,
> +#"status": {
> +#   "statuses": ["ACKNOWLEDGE", "DRIVER", "FEATURES_OK",
> +#"DRIVER_OK"]
> +#},
>  #"num-vqs": 2,
> -#"guest-features": 5100273664,
> -#"host-features": 6325010432,
> +#"guest-features": {
> +#   "transports": ["EVENT_IDX", "INDIRECT_DESC", "VERSION_1"],
> +#   "dev-features": []
> +#},
> +#"host-features": {
> +#   "transports": ["PROTOCOL_FEATURES", "EVENT_IDX",
> +#  "INDIRECT_DESC", "VERSION_1", "ANY_LAYOUT",
> +#  "NOTIFY_ON_EMPTY"],
> +#   "dev-features": []
> +#},
>  #"use-guest-notifier-mask": true,
>  #"vm-running": true,
>  #"queue-sel": 1,
> @@ -257,22 +271,65 @@
>  #   "max-queues": 1,
>  #   "backend-cap": 2,
>  #   "log-size": 0,
> -#   "backend-features": 0,
> +#   "backend-features": {
> +#  "transports": [],
> +#  "dev-features": []
> +#   },
>  #   "nvqs": 2,
> -#   "protocol-features": 0,
> +#   "protocol-features": {
> +#  "protocols": []
> +#   },
>  #   "vq-index": 0,
>  #   "log-enabled": false,
> -#   "acked-features": 5100306432,
> -#   "features": 13908344832
> +#   "acked-features": {
> +#  "transports": ["EVENT_IDX", "INDIRECT_DESC", "VERSION_1",
> +# "ANY_LAYOUT", "NOTIFY_ON_EMPTY"],
> +#  "dev-features": ["MRG_RXBUF"]
> +#   },
> +#   "features": {
> +#  "transports": ["EVENT_IDX", "INDIRECT_DESC",
> +# "IOMMU_PLATFORM", "VERSION_1", 
> "ANY_LAYOUT",
> +# "NOTIFY_ON_EMPTY"],
> +#  "dev-features": ["LOG_ALL", "MRG_RXBUF"]
> +#   }
> +#},
> +#"backend-features": {
> +#   "transports": ["PROTOCOL_FEATURES", "EVENT_IDX", 
> "INDIRECT_DESC",
> +#  "VERSION_1", "ANY_LAYOUT", "NOTIFY_ON_EMPTY"],
> +#   "dev-features": ["GSO", "CTRL_MAC_ADDR", "GUEST_ANNOUNCE",

Re: [PATCH 2/6] block.c: bdrv_replace_child_noperm: first remove the child, and then call ->detach()

2022-02-11 Thread Kevin Wolf

Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
> Doing the opposite can make ->detach() (more precisely
> bdrv_unapply_subtree_drain() in bdrv_child_cb_detach) undo the subtree_drain
> just performed to protect the removal of the child from the graph,
> thus making the fully-enabled assert_bdrv_graph_writable fail.
> 
> Note that assert_bdrv_graph_writable is not yet fully enabled.
> 
> Signed-off-by: Emanuele Giuseppe Esposito 
> ---
>  block.c | 10 ++
>  1 file changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 4551eba2aa..ec346a7e2e 100644
> --- a/block.c
> +++ b/block.c
> @@ -2854,14 +2854,16 @@ static void bdrv_replace_child_noperm(BdrvChild 
> **childp,
>  }
>  
>  if (old_bs) {
> -/* Detach first so that the recursive drain sections coming from 
> @child
> +assert_bdrv_graph_writable(old_bs);
> +QLIST_REMOVE(child, next_parent);
> +/*
> + * Detach first so that the recursive drain sections coming from 
> @child
>   * are already gone and we only end the drain sections that came from
> - * elsewhere. */
> + * elsewhere.
> + */

This comment is confusing, but that's not your fault.

It was originally added in commit d736f119dae and referred to calling
.detach() before calling .drained_end(), which was the very next thing
it would do. Commit debc2927671 moved the .drained_end() code to the end
of the whole operation, but left this comment (and a similar one for
.attach() and .drained_begin()) around even though it doesn't explain
the new code very well any more.

>  if (child->klass->detach) {
>  child->klass->detach(child);
>  }
> -assert_bdrv_graph_writable(old_bs);
> -QLIST_REMOVE(child, next_parent);
>  }
>  
>  child->bs = new_bs;

After digging into what the comment really meant, I think it doesn't
refer to the order of QLIST_REMOVE() and .detach(). The change looks
safe to me.

I would just suggest updating the comment to explain the order you're
fixing here instead of the now irrelevant one.

Kevin

Re: [PATCH v12 7/8] qmp: add QMP command x-query-virtio-queue-element

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> This new command shows the information of a VirtQueue element.
>
> [Note: Up until v10 of this patch series, virtio.json had many (15+)
>  enums defined (e.g. decoded device features, statuses, etc.). In v10
>  most of these enums were removed and replaced with string literals.
>  By doing this we get (1) simpler schema, (2) smaller generated code,
>  and (3) less maintenance burden for when new things are added (e.g.
>  devices, device features, etc.).]
>
> Signed-off-by: Jonah Palmer 

[...]

> diff --git a/qapi/virtio.json b/qapi/virtio.json
> index 44cc05c..bb93d6d 100644
> --- a/qapi/virtio.json
> +++ b/qapi/virtio.json
> @@ -656,3 +656,186 @@
>'data': { 'path': 'str', 'queue': 'uint16' },
>'returns': 'VirtVhostQueueStatus',
>'features': [ 'unstable' ] }
> +
> +##
> +# @VirtioRingDesc:
> +#
> +# Information regarding the vring descriptor area
> +#
> +# @addr: Guest physical address of the descriptor area
> +#
> +# @len: Length of the descriptor area
> +#
> +# @flags: List of descriptor flags
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioRingDesc',
> +  'data': { 'addr': 'uint64',
> +'len': 'uint32',
> +'flags': [ 'str' ] } }
> +
> +##
> +# @VirtioRingAvail:
> +#
> +# Information regarding the avail vring (a.k.a. driver area)
> +#
> +# @flags: VRingAvail flags
> +#
> +# @idx: VRingAvail index
> +#
> +# @ring: VRingAvail ring[] entry at provided index
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioRingAvail',
> +  'data': { 'flags': 'uint16',
> +'idx': 'uint16',
> +'ring': 'uint16' } }
> +
> +##
> +# @VirtioRingUsed:
> +#
> +# Information regarding the used vring (a.k.a. device area)
> +#
> +# @flags: VRingUsed flags
> +#
> +# @idx: VRingUsed index
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioRingUsed',
> +  'data': { 'flags': 'uint16',
> +'idx': 'uint16' } }
> +
> +##
> +# @VirtioQueueElement:
> +#
> +# Information regarding a VirtQueue's VirtQueueElement including
> +# descriptor, driver, and device areas
> +#
> +# @name: Name of the VirtIODevice that uses this VirtQueue
> +#
> +# @index: Index of the element in the queue
> +#
> +# @descs: List of descriptors (VirtioRingDesc)
> +#
> +# @avail: VRingAvail info
> +#
> +# @used: VRingUsed info
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioQueueElement',
> +  'data': { 'name': 'str',
> +'index': 'uint32',
> +'descs': [ 'VirtioRingDesc' ],
> +'avail': 'VirtioRingAvail',
> +'used': 'VirtioRingUsed' } }
> +
> +##
> +# @x-query-virtio-queue-element:
> +#
> +# Return the information about a VirtQueue's VirtQueueElement
> +# (default: head of the queue)

I'd put this line ...

> +#
> +# @path: VirtIODevice canonical QOM path
> +#
> +# @queue: VirtQueue index to examine
> +#
> +# @index: Index of the element in the queue

... here.

> +#
> +# Features:
> +# @unstable: This command is meant for debugging.
> +#
> +# Returns: VirtioQueueElement information
> +#
> +# Since: 7.0
> +#
> +# Examples:
> +#
> +# 1. Introspect on virtio-net's VirtQueue 0 at index 5
> +#
> +# -> { "execute": "x-query-virtio-queue-element",
> +#  "arguments": { "path": 
> "/machine/peripheral-anon/device[1]/virtio-backend",
> +# "queue": 0,
> +# "index": 5 }
> +#}
> +# <- { "return": {
> +#"index": 5,
> +#"name": "virtio-net",
> +#"descs": [
> +#   { "flags": ["write"], "len": 1536, "addr": 5257305600 }
> +#],
> +#"avail": {
> +#   "idx": 256,
> +#   "flags": 0,
> +#   "ring": 5
> +#},
> +#"used": {
> +#   "idx": 13,
> +#   "flags": 0
> +#},
> +#}
> +#
> +# 2. Introspect on virtio-crypto's VirtQueue 1 at head
> +#
> +# -> { "execute": "x-query-virtio-queue-element",
> +#  "arguments": { "path": "/machine/peripheral/crypto0/virtio-backend",
> +# "queue": 1 }
> +#}
> +# <- { "return": {
> +#"index": 0,
> +#"name": "virtio-crypto",
> +#"descs": [
> +#   { "flags": [], "len": 0, "addr": 8080268923184214134 }
> +#],
> +#"avail": {
> +#   "idx": 280,
> +#   "flags": 0,
> +#   "ring": 0
> +#},
> +#"used": {
> +#   "idx": 280,
> +#   "flags": 0
> +#}
> +#}
> +#
> +# 3. Introspect on virtio-scsi's VirtQueue 2 at head
> +#
> +# -> { "execute": "x-query-virtio-queue-element",
> +#  "arguments": { "path": 
> "/machine/peripheral-anon/device[2]/virtio-backend",
> +# "queue": 2 }
> +#}
> +# <- { "return": {
> +#"index": 19,
> +#"name": "virtio-scsi",
> +#"descs": [
> +#   { "flags": ["used", "indirect",

Re: [PATCH v12 6/8] qmp: add QMP commands for virtio/vhost queue-status

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> These new commands show the internal status of a VirtIODevice's
> VirtQueue and a vhost device's vhost_virtqueue (if active).
>
> Signed-off-by: Jonah Palmer 

QAPI schema
Acked-by: Markus Armbruster

Re: [PATCH v11 7/8] qmp: add QMP command x-query-virtio-queue-element

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> This new command shows the information of a VirtQueue element.
>
> [Note: Up until v10 of this patch series, virtio.json had many (15+)
>  enums defined (e.g. decoded device features, statuses, etc.). In v10
>  most of these enums were removed and replaced with string literals.
>  By doing this we get (1) simpler schema, (2) smaller generated code,
>  and (3) less maintenance burden for when new things are added (e.g.
>  devices, device features, etc.).]
>
> Signed-off-by: Jonah Palmer 

[...]

> diff --git a/qapi/virtio.json b/qapi/virtio.json
> index 44cc05c..bb93d6d 100644
> --- a/qapi/virtio.json
> +++ b/qapi/virtio.json
> @@ -656,3 +656,186 @@
>'data': { 'path': 'str', 'queue': 'uint16' },
>'returns': 'VirtVhostQueueStatus',
>'features': [ 'unstable' ] }
> +
> +##
> +# @VirtioRingDesc:
> +#
> +# Information regarding the vring descriptor area
> +#
> +# @addr: Guest physical address of the descriptor area
> +#
> +# @len: Length of the descriptor area
> +#
> +# @flags: List of descriptor flags
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioRingDesc',
> +  'data': { 'addr': 'uint64',
> +'len': 'uint32',
> +'flags': [ 'str' ] } }
> +
> +##
> +# @VirtioRingAvail:
> +#
> +# Information regarding the avail vring (a.k.a. driver area)
> +#
> +# @flags: VRingAvail flags
> +#
> +# @idx: VRingAvail index
> +#
> +# @ring: VRingAvail ring[] entry at provided index
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioRingAvail',
> +  'data': { 'flags': 'uint16',
> +'idx': 'uint16',
> +'ring': 'uint16' } }
> +
> +##
> +# @VirtioRingUsed:
> +#
> +# Information regarding the used vring (a.k.a. device area)
> +#
> +# @flags: VRingUsed flags
> +#
> +# @idx: VRingUsed index
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioRingUsed',
> +  'data': { 'flags': 'uint16',
> +'idx': 'uint16' } }
> +
> +##
> +# @VirtioQueueElement:
> +#
> +# Information regarding a VirtQueue's VirtQueueElement including
> +# descriptor, driver, and device areas
> +#
> +# @name: Name of the VirtIODevice that uses this VirtQueue
> +#
> +# @index: Index of the element in the queue
> +#
> +# @descs: List of descriptors (VirtioRingDesc)
> +#
> +# @avail: VRingAvail info
> +#
> +# @used: VRingUsed info
> +#
> +# Since: 7.0
> +#
> +##
> +
> +{ 'struct': 'VirtioQueueElement',
> +  'data': { 'name': 'str',
> +'index': 'uint32',
> +'descs': [ 'VirtioRingDesc' ],
> +'avail': 'VirtioRingAvail',
> +'used': 'VirtioRingUsed' } }
> +
> +##
> +# @x-query-virtio-queue-element:
> +#
> +# Return the information about a VirtQueue's VirtQueueElement
> +# (default: head of the queue)

I'd put this ...

> +#
> +# @path: VirtIODevice canonical QOM path
> +#
> +# @queue: VirtQueue index to examine
> +#
> +# @index: Index of the element in the queue

... here.

> +#
> +# Features:
> +# @unstable: This command is meant for debugging.
> +#
> +# Returns: VirtioQueueElement information
> +#
> +# Since: 7.0
> +#
> +# Examples:
> +#
> +# 1. Introspect on virtio-net's VirtQueue 0 at index 5
> +#
> +# -> { "execute": "x-query-virtio-queue-element",
> +#  "arguments": { "path": 
> "/machine/peripheral-anon/device[1]/virtio-backend",
> +# "queue": 0,
> +# "index": 5 }
> +#}
> +# <- { "return": {
> +#"index": 5,
> +#"name": "virtio-net",
> +#"descs": [
> +#   { "flags": ["write"], "len": 1536, "addr": 5257305600 }
> +#],
> +#"avail": {
> +#   "idx": 256,
> +#   "flags": 0,
> +#   "ring": 5
> +#},
> +#"used": {
> +#   "idx": 13,
> +#   "flags": 0
> +#},
> +#}
> +#
> +# 2. Introspect on virtio-crypto's VirtQueue 1 at head
> +#
> +# -> { "execute": "x-query-virtio-queue-element",
> +#  "arguments": { "path": "/machine/peripheral/crypto0/virtio-backend",
> +# "queue": 1 }
> +#}
> +# <- { "return": {
> +#"index": 0,
> +#"name": "virtio-crypto",
> +#"descs": [
> +#   { "flags": [], "len": 0, "addr": 8080268923184214134 }
> +#],
> +#"avail": {
> +#   "idx": 280,
> +#   "flags": 0,
> +#   "ring": 0
> +#},
> +#"used": {
> +#   "idx": 280,
> +#   "flags": 0
> +#}
> +#}
> +#
> +# 3. Introspect on virtio-scsi's VirtQueue 2 at head
> +#
> +# -> { "execute": "x-query-virtio-queue-element",
> +#  "arguments": { "path": 
> "/machine/peripheral-anon/device[2]/virtio-backend",
> +# "queue": 2 }
> +#}
> +# <- { "return": {
> +#"index": 19,
> +#"name": "virtio-scsi",
> +#"descs": [
> +#   { "flags": ["used", "indirect", "wri

Re: [PATCH v12 4/8] qmp: add QMP command x-query-virtio-status

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> This new command shows the status of a VirtIODevice, including
> its corresponding vhost device's status (if active).
>
> Next patch will improve output by decoding feature bits, including
> vhost device's feature bits (backend, protocol, acked, and features).
> Also will decode status bits of a VirtIODevice.
>
> [Jonah: Similar to previous patch, added a check to @virtio_device_find
>  to ensure synchronicity between @virtio_list and the devices in the QOM
>  composition tree.]
>
> Signed-off-by: Jonah Palmer 

QAPI schema
Acked-by: Markus Armbruster

Re: [PATCH v11 6/8] qmp: add QMP commands for virtio/vhost queue-status

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> These new commands show the internal status of a VirtIODevice's
> VirtQueue and a vhost device's vhost_virtqueue (if active).
>
> Signed-off-by: Jonah Palmer 

QAPI schema
Acked-by: Markus Armbruster

Re: [PATCH v12 3/8] qmp: add QMP command x-query-virtio

2022-02-11 Thread Markus Armbruster

Jonah Palmer  writes:

> From: Laurent Vivier 
>
> This new command lists all the instances of VirtIODevices with
> their canonical QOM path and name.
>
> [Jonah: @virtio_list duplicates information that already exists in
>  the QOM composition tree. However, extracting necessary information
>  from this tree seems to be a bit convoluted.
>
>  Instead, we still create our own list of realized virtio devices
>  but use @qmp_qom_get with the device's canonical QOM path to confirm
>  that the device exists and is realized. If the device exists but
>  is actually not realized, then we remove it from our list (for
>  synchronicity to the QOM composition tree).
>
>  Also, the QMP command @x-query-virtio is redundant as @qom-list
>  and @qom-get are sufficient to search '/machine/' for realized
>  virtio devices. However, @x-query-virtio is much more convenient
>  in listing realized virtio devices.]

Thanks for explaining this.  Whether the convenience is worth the extra
code is for the virtio maintainer to decide.

> Signed-off-by: Jonah Palmer 

QAPI schema
Acked-by: Markus Armbruster

Re: [PATCH 1/6] block/io.c: fix bdrv_child_cb_drained_begin invocations from a coroutine

2022-02-11 Thread Kevin Wolf

Am 08.02.2022 um 16:36 hat Emanuele Giuseppe Esposito geschrieben:
> Using bdrv_do_drained_begin_quiesce() in bdrv_child_cb_drained_begin()
> is not a good idea: the callback might be called when running
> a drain in a coroutine, and bdrv_drained_begin_poll() does not
> handle that case, resulting in assertion failure.

I remembered that we talked about this only recently on IRC, but it
didn't make any sense to me again when I read this commit message. So I
think we need --verbose.

The .drained_begin callback was always meant to run outside of coroutine
context, so the unexpected part isn't that it calls a function that
can't run in coroutine context, but that it is already called itself in
coroutine context.

The problematic path is bdrv_replace_child_noperm() which then calls
bdrv_parent_drained_begin_single(poll=true). Polling in coroutine
context is dangerous, it can cause deadlocks because the caller of the
coroutine can't make progress. So I believe this call is already wrong
in coroutine context.

Now I don't know the call path up to bdrv_replace_child_noperm(), but as
far as I remember, that was another function that was originally never
run in coroutine context. Maybe we have good reason to change this, I
can't point to anything that would be inherently wrong with it, but I
would still be curious in which context it does run in a coroutine now.

Anyway, whatever the specific place is, I believe we must drop out of
coroutine context _before_ calling bdrv_parent_drained_begin_single(),
not only in callbacks called by it.

> Instead, bdrv_do_drained_begin with no recursion and poll
> will accomplish the same thing (invoking bdrv_do_drained_begin_quiesce)
> but will firstly check if we are already in a coroutine, and exit
> from that via bdrv_co_yield_to_drain().
> 
> Signed-off-by: Emanuele Giuseppe Esposito 

Kevin

Re: [RFC] thread-pool: Add option to fix the pool size

2022-02-11 Thread Kevin Wolf

Am 03.02.2022 um 15:19 hat Stefan Hajnoczi geschrieben:
> On Thu, Feb 03, 2022 at 10:56:49AM +, Daniel P. Berrangé wrote:
> > On Thu, Feb 03, 2022 at 10:53:07AM +, Stefan Hajnoczi wrote:
> > > On Wed, Feb 02, 2022 at 06:52:34PM +0100, Nicolas Saenz Julienne wrote:
> > > > The thread pool regulates itself: when idle, it kills threads until
> > > > empty, when in demand, it creates new threads until full. This behaviour
> > > > doesn't play well with latency sensitive workloads where the price of
> > > > creating a new thread is too high. For example, when paired with qemu's
> > > > '-mlock', or using safety features like SafeStack, creating a new thread
> > > > has been measured take multiple milliseconds.
> > > > 
> > > > In order to mitigate this let's introduce a new option to set a fixed
> > > > pool size. The threads will be created during the pool's initialization,
> > > > remain available during its lifetime regardless of demand, and destroyed
> > > > upon freeing it. A properly characterized workload will then be able to
> > > > configure the pool to avoid any latency spike.
> > > > 
> > > > Signed-off-by: Nicolas Saenz Julienne 
> > > > 
> > > > ---
> > > > 
> > > > The fix I propose here works for my specific use-case, but I'm pretty
> > > > sure it'll need to be a bit more versatile to accommodate other
> > > > use-cases.
> > > > 
> > > > Some questions:
> > > > 
> > > > - Is unanimously setting these parameters for any pool instance too
> > > >   limiting? It'd make sense to move the options into the AioContext the
> > > >   pool belongs to. IIUC, for the general block use-case, this would be
> > > >   'qemu_aio_context' as initialized in qemu_init_main_loop().
> > > 
> > > Yes, qemu_aio_context is the main loop's AioContext. It's used unless
> > > IOThreads are configured.
> > > 
> > > It's nice to have global settings that affect all AioContexts, so I
> > > think this patch is fine for now.
> > > 
> > > In the future IOThread-specific parameters could be added if individual
> > > IOThread AioContexts need tuning (similar to how poll-max-ns works
> > > today).
> > > 
> > > > - Currently I'm setting two pool properties through a single qemu
> > > >   option. The pool's size and dynamic behaviour, or lack thereof. I
> > > >   think it'd be better to split them into separate options. I thought of
> > > >   different ways of expressing this (min/max-size where static happens
> > > >   when min-size=max-size, size and static/dynamic, etc..), but you might
> > > >   have ideas on what could be useful to other use-cases.
> > > 
> > > Yes, "min" and "max" is more flexible than fixed-size=n. fixed-size=n is
> > > equivalent to min=n,max=n. The current default policy is min=0,max=64.
> > > If you want more threads you could do min=0,max=128. If you want to
> > > reserve 1 thread all the time use min=1,max=64.
> > > 
> > > I would go with min and max.
> > 
> > This commit also exposes this as a new top level command line
> > argument. Given our aim to eliminate QemuOpts and use QAPI/QOM
> > properties for everything I think we need a different approach.
> > 
> > I'm not sure which exisiting QAPI/QOM option it most appropriate
> > to graft these tunables onto ?  -machine ?  -accel ?  Or is there
> > no good fit yet ?

I would agree that it should be QAPI, but just like QemuOpts doesn't
require that you shoehorn it into an existing option, QAPI doesn't
necessarily either if that's the interface that we want. You could just
create a new QAPI struct for it and parse the new option into that. This
would already be an improvement over this RFC.

Now, whether we actually want a new top-level option is a different
question (we usually try to avoid it), but it's not related to QAPI vs.
QemuOpts.

> Yep, I didn't comment on this because I don't have a good suggestion.
> 
> In terms of semantics I think we should have:
> 
> 1. A global default value that all new AioContext take. The QEMU main
>loop's qemu_aio_context will use this and all IOThread AioContext
>will use it (unless they have been overridden).
> 
>I would define it on --machine because that's the "global" object for
>a guest, but that's not very satisfying.

Semantically, -machine is about the virtual hardware where as iothreads
are about the backend, so I agree it's not a good fit.

For the main thread, you may want to configure all the same options that
you can configure for an iothread. So to me that sounds like we would
want to allow using an iothread object for the main thread, too.

That would still require us to tell QEMU which iothread object should be
used for the main thread, though.

> 2. (Future patch) --object iothread,thread-pool-min=N,thread-pool-max=M
>just like poll-max-ns and friends. This allows the values to be set
>on a per-IOThread basis.

And to be updated with qom-set. (Which is again something that you'll
want for the main thread, too.)

Kevin


signature.asc
Description: PGP signature

Re: [PATCH v2 1/8] tests/qemu-iotests/testrunner: Allow parallel test invocations

2022-02-11 Thread Vladimir Sementsov-Ogievskiy


11.02.2022 12:29, Kevin Wolf wrote:

Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:

If multiple tests run in parallel, they must use unique file
names for the test output.

Suggested-by: Hanna Reitz 
Signed-off-by: Thomas Huth 
---
  tests/qemu-iotests/testrunner.py | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/testrunner.py b/tests/qemu-iotests/testrunner.py
index 0eace147b8..9d20f51bb1 100644
--- a/tests/qemu-iotests/testrunner.py
+++ b/tests/qemu-iotests/testrunner.py
@@ -259,7 +259,7 @@ def do_run_test(self, test: str, mp: bool) -> TestResult:
  """
  
  f_test = Path(test)

-f_bad = Path(f_test.name + '.out.bad')
+f_bad = Path(f'{os.getpid()}-{f_test.name}.out.bad')
  f_notrun = Path(f_test.name + '.notrun')
  f_casenotrun = Path(f_test.name + '.casenotrun')
  f_reference = Path(self.find_reference(test))


Hmm... It does make sense, but nobody ever cleans those files up.
Currently, the next run of the test will just overwrite the existing
file or delete it when the test succeeds. So after running the test
suite, you have .out.bad files for every failed test, but not for those
that succeeded.

After this change, won't the test directory accumulate tons of .out.bad
files over time?



Actually, .out.bad files are put not to TEST_DIR but to 
build/tests/qemu-iotests..

If we want to do several runs in parallel, I think all files that test-run 
produces should be in TEST_DIR and SOCK_DIR. And we should care to keep 
TEST_DIR/*.out.bad after test-run, so user can examine them.


--
Best regards,
Vladimir

Re: [RFC] thread-pool: Add option to fix the pool size

2022-02-11 Thread Nicolas Saenz Julienne

On Thu, 2022-02-03 at 14:19 +, Stefan Hajnoczi wrote:
> Yep, I didn't comment on this because I don't have a good suggestion.
> 
> In terms of semantics I think we should have:
> 
> 1. A global default value that all new AioContext take. The QEMU main
>loop's qemu_aio_context will use this and all IOThread AioContext
>will use it (unless they have been overridden).
> 
>I would define it on --machine because that's the "global" object for
>a guest, but that's not very satisfying.

So I tried to implement this. One problem arouse:
 - The thread pool properties are now part of the MachineState. So as to
   properly use QOM.
 - Sadly, the main loop is initialized before the machine class options are
   populated. See 'qemu_init_main_loop()' and 'qemu_apply_machine_options()' in
   'softmmu/vl.c'.
 - Short of manually parsing the options, which IMO defeats the purpose of
   using QOM, or changing the initialization order, which I'm sure won't be
   easy, I can't access the properties early enough.

Any ideas?

Thanks!

-- 
Nicolás Sáenz

Re: [PATCH v2 1/8] tests/qemu-iotests/testrunner: Allow parallel test invocations

2022-02-11 Thread Kevin Wolf

Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:
> If multiple tests run in parallel, they must use unique file
> names for the test output.
> 
> Suggested-by: Hanna Reitz 
> Signed-off-by: Thomas Huth 
> ---
>  tests/qemu-iotests/testrunner.py | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tests/qemu-iotests/testrunner.py 
> b/tests/qemu-iotests/testrunner.py
> index 0eace147b8..9d20f51bb1 100644
> --- a/tests/qemu-iotests/testrunner.py
> +++ b/tests/qemu-iotests/testrunner.py
> @@ -259,7 +259,7 @@ def do_run_test(self, test: str, mp: bool) -> TestResult:
>  """
>  
>  f_test = Path(test)
> -f_bad = Path(f_test.name + '.out.bad')
> +f_bad = Path(f'{os.getpid()}-{f_test.name}.out.bad')
>  f_notrun = Path(f_test.name + '.notrun')
>  f_casenotrun = Path(f_test.name + '.casenotrun')
>  f_reference = Path(self.find_reference(test))

Hmm... It does make sense, but nobody ever cleans those files up.
Currently, the next run of the test will just overwrite the existing
file or delete it when the test succeeds. So after running the test
suite, you have .out.bad files for every failed test, but not for those
that succeeded.

After this change, won't the test directory accumulate tons of .out.bad
files over time?

Kevin

Re: [PATCH v5 02/20] job.h: categorize fields in struct Job

2022-02-11 Thread Emanuele Giuseppe Esposito




On 10/02/2022 18:35, Stefan Hajnoczi wrote:
> On Thu, Feb 10, 2022 at 05:26:52PM +0100, Emanuele Giuseppe Esposito wrote:
>>
>>
>> On 10/02/2022 16:40, Stefan Hajnoczi wrote:
>>> On Tue, Feb 08, 2022 at 09:34:55AM -0500, Emanuele Giuseppe Esposito wrote:
 Categorize the fields in struct Job to understand which ones
 need to be protected by the job mutex and which don't.

 Signed-off-by: Emanuele Giuseppe Esposito 
 ---
  include/qemu/job.h | 59 ++
  1 file changed, 34 insertions(+), 25 deletions(-)

 diff --git a/include/qemu/job.h b/include/qemu/job.h
 index d1192ffd61..86ec46c09e 100644
 --- a/include/qemu/job.h
 +++ b/include/qemu/job.h
 @@ -40,27 +40,50 @@ typedef struct JobTxn JobTxn;
   * Long-running operation.
   */
  typedef struct Job {
 +
 +/* Fields set at initialization (job_create), and never modified */
>>>
>>> Is there a corresponding "Field protected by job_mutex" comment that
>>> separates fields that need locking?
>>>
>>
>> That would be the comment
>>
>> /** Protected by job_mutex */
>>
>> situated right after the field "ProgressMeter progress;".
>>
>> Do you want me to change it in "Fields protected by job_mutex"?
> 
> I don't see it:
> 
> +/** The opaque value that is passed to the completion function.  */
> +void *opaque;
> +
> +/* ProgressMeter API is thread-safe */
> +ProgressMeter progress;
> +
> +
> +/** AioContext to run the job coroutine in */
> +AioContext *aio_context;
> +
> +/** Reference count of the block job */
> +int refcnt;
> 
> Is it added by a later patch or did I miss it?
> 

Yes sorry I forgot: it is added in patch 19, when the lock is not a nop
anymore.

I think this was suggested in one of the previous reviews, to avoid
having a misleading comment when the fields are not yet protected.

Emanuele

Re: [PATCH v2 2/8] tests/qemu-iotests: Improve the check for GNU sed

2022-02-11 Thread Kevin Wolf

Am 09.02.2022 um 11:15 hat Thomas Huth geschrieben:
> Instead of failing the iotests if GNU sed is not available (or skipping
> them completely in the check-block.sh script), it would be better to
> simply skip the bash-based tests, so that the python-based tests could
> still be run. Thus add the check for BusyBox sed to common.rc and mark
> the tests as "not run" if GNU sed is not available. Then we can also
> remove the sed checks from the check-block.sh script.
> 
> Signed-off-by: Thomas Huth 

I agree that skipping bash tests is slightly better than skipping all
tests. And that the skipping should really be done in qemu-iotests
itself and not in a wrapper around it.

But can't we make it even better and skip only bash tests that actually
use sed?

> +# We need GNU sed for the iotests. Make sure to not use BusyBox sed
> +# which says that "This is not GNU sed version 4.0"
>  SED=
>  for sed in sed gsed; do
> -($sed --version | grep 'GNU sed') > /dev/null 2>&1
> +($sed --version | grep -v "not GNU sed" | grep 'GNU sed') > /dev/null 
> 2>&1
>  if [ "$?" -eq 0 ]; then
>  SED=$sed
>  break
>  fi
>  done
>  if [ -z "$SED" ]; then
> -echo "$0: GNU sed not found"
> -exit 1
> +_notrun "GNU sed not found"
>  fi

Couldn't we just define 'sed' as a function or alias here that skips the
test with _notrun only when it's actually called?

Kevin

Re: [PATCH v4 10/15] hw/nvme: Remove reg_size variable and update BAR0 size calculation

2022-02-11 Thread Klaus Jensen

On Jan 26 18:11, Lukasz Maniak wrote:
> From: Łukasz Gieryk 
> 
> The n->reg_size parameter unnecessarily splits the BAR0 size calculation
> in two phases; removed to simplify the code.
> 
> With all the calculations done in one place, it seems the pow2ceil,
> applied originally to reg_size, is unnecessary. The rounding should
> happen as the last step, when BAR size includes Nvme registers, queue
> registers, and MSIX-related space.
> 
> Finally, the size of the mmio memory region is extended to cover the 1st
> 4KiB padding (see the map below). Access to this range is handled as
> interaction with a non-existing queue and generates an error trace, so
> actually nothing changes, while the reg_size variable is no longer needed.
> 
> 
> |  BAR0|
> 
> [Nvme Registers]
> [Queues]
> [power-of-2 padding] - removed in this patch
> [4KiB padding (1)  ]
> [MSIX TABLE]
> [4KiB padding (2)  ]
> [MSIX PBA  ]
> [power-of-2 padding]
> 
> Signed-off-by: Łukasz Gieryk 
> ---
>  hw/nvme/ctrl.c | 10 +-
>  hw/nvme/nvme.h |  1 -
>  2 files changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 426507ca8a..40eb6bd1a8 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -6372,9 +6372,6 @@ static void nvme_init_state(NvmeCtrl *n)
>  n->conf_ioqpairs = n->params.max_ioqpairs;
>  n->conf_msix_qsize = n->params.msix_qsize;
>  
> -/* add one to max_ioqpairs to account for the admin queue pair */
> -n->reg_size = pow2ceil(sizeof(NvmeBar) +
> -   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
>  n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
>  n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
>  n->temperature = NVME_TEMPERATURE;
> @@ -6498,7 +6495,10 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
> *pci_dev, Error **errp)
>  pcie_ari_init(pci_dev, 0x100, 1);
>  }
>  
> -bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
> +/* add one to max_ioqpairs to account for the admin queue pair */
> +bar_size = sizeof(NvmeBar) +
> +   2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE;
> +bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
>  msix_table_offset = bar_size;
>  msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
>  
> @@ -6512,7 +6512,7 @@ static int nvme_init_pci(NvmeCtrl *n, PCIDevice 
> *pci_dev, Error **errp)
>  
>  memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
>  memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
> -  n->reg_size);
> +  msix_table_offset);
>  memory_region_add_subregion(&n->bar0, 0, &n->iomem);
>  
>  if (pci_is_vf(pci_dev)) {
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 927890b490..1401ac3904 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -414,7 +414,6 @@ typedef struct NvmeCtrl {
>  uint16_tmax_prp_ents;
>  uint16_tcqe_size;
>  uint16_tsqe_size;
> -uint32_treg_size;
>  uint32_tmax_q_ents;
>  uint8_t outstanding_aers;
>  uint32_tirq_status;
> -- 
> 2.25.1
> 

Nice catch.

Reviewed-by: Klaus Jensen 


signature.asc
Description: PGP signature

73 matches

Mail list logo