[PATCH] iotests: Remove 030 from the auto group

2020-09-03 Thread Thomas Huth
Test 030 is still occasionally failing in the CI ... so for the
time being, let's disable it in the "auto" group. We can add it
back once it got more stable.

Signed-off-by: Thomas Huth 
---
 I just saw the problem here:
  https://cirrus-ci.com/task/5449330930745344?command=main#L6482
 and Peter hit it a couple of weeks ago:
  https://lists.gnu.org/archive/html/qemu-devel/2020-08/msg00136.html

 tests/qemu-iotests/group | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 5cad015231..f084061a16 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -51,7 +51,7 @@
 027 rw auto quick
 028 rw backing quick
 029 rw auto quick
-030 rw auto backing
+030 rw backing
 031 rw auto quick
 032 rw auto quick
 033 rw auto quick
-- 
2.18.2




[PATCH v4 2/2] file-posix: add sg_get_max_segments that actually works with sg

2020-09-03 Thread Tom Yan
sg devices have different major/minor than their corresponding
block devices. Using sysfs to get max segments never really worked
for them.

Fortunately the sg driver provides an ioctl to get sg_tablesize,
which is apparently equivalent to max segments.

Signed-off-by: Tom Yan 
---
 block/file-posix.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 411a23cf99..9e37594145 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1178,6 +1178,21 @@ static int sg_get_max_transfer_length(int fd)
 #endif
 }
 
+static int sg_get_max_segments(int fd)
+{
+#ifdef SG_GET_SG_TABLESIZE
+long max_segments = 0;
+
+if (ioctl(fd, SG_GET_SG_TABLESIZE, &max_segments) == 0) {
+return max_segments;
+} else {
+return -errno;
+}
+#else
+return -ENOSYS;
+#endif
+}
+
 static int get_max_transfer_length(int fd)
 {
 #if defined(BLKSECTGET) && defined(BLKSSZGET)
@@ -1268,7 +1283,7 @@ static void hdev_refresh_limits(BlockDriverState *bs, 
Error **errp)
 bs->bl.max_transfer = pow2floor(ret);
 }
 
-ret = get_max_segments(s->fd);
+ret = bs->sg ? sg_get_max_segments(s->fd) : get_max_segments(s->fd);
 if (ret > 0) {
 bs->bl.max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
ret * qemu_real_host_page_size);
-- 
2.28.0




[PATCH RESEND v2 1/2] file-posix: split hdev_refresh_limits from raw_refresh_limits

2020-09-03 Thread Tom Yan
We can and should get max transfer length and max segments for all host
devices / cdroms (on Linux).

Also use MIN_NON_ZERO instead when we clamp max transfer length against
max segments.

Signed-off-by: Tom Yan 
---
v2: fix get_max_transfer_length for non-sg devices; also add/fix
sg_get_max_segments for sg devices
 block/file-posix.c | 56 +-
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 9a00d4190a..0c9124c8aa 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1178,7 +1178,23 @@ static int sg_get_max_transfer_length(int fd)
 #endif
 }
 
-static int sg_get_max_segments(int fd)
+static int get_max_transfer_length(int fd)
+{
+#if defined(BLKSECTGET) && defined(BLKSSZGET)
+int sect = 0;
+int ssz = 0;
+
+if (ioctl(fd, BLKSECTGET, &sect) == 0 && ioctl(fd, BLKSSZGET, &ssz)) {
+return sect * ssz;
+} else {
+return -errno;
+}
+#else
+return -ENOSYS;
+#endif
+}
+
+static int get_max_segments(int fd)
 {
 #ifdef CONFIG_LINUX
 char buf[32];
@@ -1233,23 +1249,31 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 {
 BDRVRawState *s = bs->opaque;
 
-if (bs->sg) {
-int ret = sg_get_max_transfer_length(s->fd);
+raw_probe_alignment(bs, s->fd, errp);
+bs->bl.min_mem_alignment = s->buf_align;
+bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+}
+
+static void hdev_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+BDRVRawState *s = bs->opaque;
+
+/* BLKSECTGET has always been implemented wrongly in the sg driver
+   and BLKSSZGET has never been supported by it*/
+int ret = bs->sg ? sg_get_max_transfer_length(s->fd) :
+   get_max_transfer_length(s->fd);
 
-if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
-bs->bl.max_transfer = pow2floor(ret);
-}
+if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+bs->bl.max_transfer = pow2floor(ret);
+}
 
-ret = sg_get_max_segments(s->fd);
-if (ret > 0) {
-bs->bl.max_transfer = MIN(bs->bl.max_transfer,
-  ret * qemu_real_host_page_size);
-}
+ret = get_max_segments(s->fd);
+if (ret > 0) {
+bs->bl.max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+   ret * qemu_real_host_page_size);
 }
 
-raw_probe_alignment(bs, s->fd, errp);
-bs->bl.min_mem_alignment = s->buf_align;
-bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+raw_refresh_limits(bs, errp);
 }
 
 static int check_for_dasd(int fd)
@@ -3604,7 +3628,7 @@ static BlockDriver bdrv_host_device = {
 .bdrv_co_pdiscard   = hdev_co_pdiscard,
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
-.bdrv_refresh_limits = raw_refresh_limits,
+.bdrv_refresh_limits = hdev_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
@@ -3728,7 +3752,7 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_preadv = raw_co_preadv,
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
-.bdrv_refresh_limits = raw_refresh_limits,
+.bdrv_refresh_limits = hdev_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
-- 
2.28.0




[PATCH v3 1/2] file-posix: split hdev_refresh_limits from raw_refresh_limits

2020-09-03 Thread Tom Yan
We can and should get max transfer length and max segments for all host
devices / cdroms (on Linux).

Also use MIN_NON_ZERO instead when we clamp max transfer length against
max segments.

Signed-off-by: Tom Yan 
---
v2: fix get_max_transfer_length for non-sg devices; also add/fix
sg_get_max_segments for sg devices
v3: fix typo in get_max_transfer_length
 block/file-posix.c | 57 +-
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 9a00d4190a..411a23cf99 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1178,7 +1178,24 @@ static int sg_get_max_transfer_length(int fd)
 #endif
 }
 
-static int sg_get_max_segments(int fd)
+static int get_max_transfer_length(int fd)
+{
+#if defined(BLKSECTGET) && defined(BLKSSZGET)
+int sect = 0;
+int ssz = 0;
+
+if (ioctl(fd, BLKSECTGET, &sect) == 0 &&
+ioctl(fd, BLKSSZGET, &ssz) == 0) {
+return sect * ssz;
+} else {
+return -errno;
+}
+#else
+return -ENOSYS;
+#endif
+}
+
+static int get_max_segments(int fd)
 {
 #ifdef CONFIG_LINUX
 char buf[32];
@@ -1233,23 +1250,31 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 {
 BDRVRawState *s = bs->opaque;
 
-if (bs->sg) {
-int ret = sg_get_max_transfer_length(s->fd);
+raw_probe_alignment(bs, s->fd, errp);
+bs->bl.min_mem_alignment = s->buf_align;
+bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+}
+
+static void hdev_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+BDRVRawState *s = bs->opaque;
+
+/* BLKSECTGET has always been implemented wrongly in the sg driver
+   and BLKSSZGET has never been supported by it*/
+int ret = bs->sg ? sg_get_max_transfer_length(s->fd) :
+   get_max_transfer_length(s->fd);
 
-if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
-bs->bl.max_transfer = pow2floor(ret);
-}
+if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+bs->bl.max_transfer = pow2floor(ret);
+}
 
-ret = sg_get_max_segments(s->fd);
-if (ret > 0) {
-bs->bl.max_transfer = MIN(bs->bl.max_transfer,
-  ret * qemu_real_host_page_size);
-}
+ret = get_max_segments(s->fd);
+if (ret > 0) {
+bs->bl.max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+   ret * qemu_real_host_page_size);
 }
 
-raw_probe_alignment(bs, s->fd, errp);
-bs->bl.min_mem_alignment = s->buf_align;
-bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+raw_refresh_limits(bs, errp);
 }
 
 static int check_for_dasd(int fd)
@@ -3604,7 +3629,7 @@ static BlockDriver bdrv_host_device = {
 .bdrv_co_pdiscard   = hdev_co_pdiscard,
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
-.bdrv_refresh_limits = raw_refresh_limits,
+.bdrv_refresh_limits = hdev_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
@@ -3728,7 +3753,7 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_preadv = raw_co_preadv,
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
-.bdrv_refresh_limits = raw_refresh_limits,
+.bdrv_refresh_limits = hdev_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
-- 
2.28.0




[PATCH v4 1/2] file-posix: split hdev_refresh_limits from raw_refresh_limits

2020-09-03 Thread Tom Yan
We can and should get max transfer length and max segments for all host
devices / cdroms (on Linux).

Also use MIN_NON_ZERO instead when we clamp max transfer length against
max segments.

Signed-off-by: Tom Yan 
---
v2: fix get_max_transfer_length for non-sg devices; also add/fix
sg_get_max_segments for sg devices
v3: fix typo in get_max_transfer_length
v4: fix accidental .orig inclusion
 block/file-posix.c | 57 +-
 1 file changed, 41 insertions(+), 16 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 9a00d4190a..411a23cf99 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1178,7 +1178,24 @@ static int sg_get_max_transfer_length(int fd)
 #endif
 }
 
-static int sg_get_max_segments(int fd)
+static int get_max_transfer_length(int fd)
+{
+#if defined(BLKSECTGET) && defined(BLKSSZGET)
+int sect = 0;
+int ssz = 0;
+
+if (ioctl(fd, BLKSECTGET, &sect) == 0 &&
+ioctl(fd, BLKSSZGET, &ssz) == 0) {
+return sect * ssz;
+} else {
+return -errno;
+}
+#else
+return -ENOSYS;
+#endif
+}
+
+static int get_max_segments(int fd)
 {
 #ifdef CONFIG_LINUX
 char buf[32];
@@ -1233,23 +1250,31 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 {
 BDRVRawState *s = bs->opaque;
 
-if (bs->sg) {
-int ret = sg_get_max_transfer_length(s->fd);
+raw_probe_alignment(bs, s->fd, errp);
+bs->bl.min_mem_alignment = s->buf_align;
+bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+}
+
+static void hdev_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+BDRVRawState *s = bs->opaque;
+
+/* BLKSECTGET has always been implemented wrongly in the sg driver
+   and BLKSSZGET has never been supported by it*/
+int ret = bs->sg ? sg_get_max_transfer_length(s->fd) :
+   get_max_transfer_length(s->fd);
 
-if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
-bs->bl.max_transfer = pow2floor(ret);
-}
+if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+bs->bl.max_transfer = pow2floor(ret);
+}
 
-ret = sg_get_max_segments(s->fd);
-if (ret > 0) {
-bs->bl.max_transfer = MIN(bs->bl.max_transfer,
-  ret * qemu_real_host_page_size);
-}
+ret = get_max_segments(s->fd);
+if (ret > 0) {
+bs->bl.max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
+   ret * qemu_real_host_page_size);
 }
 
-raw_probe_alignment(bs, s->fd, errp);
-bs->bl.min_mem_alignment = s->buf_align;
-bs->bl.opt_mem_alignment = MAX(s->buf_align, qemu_real_host_page_size);
+raw_refresh_limits(bs, errp);
 }
 
 static int check_for_dasd(int fd)
@@ -3604,7 +3629,7 @@ static BlockDriver bdrv_host_device = {
 .bdrv_co_pdiscard   = hdev_co_pdiscard,
 .bdrv_co_copy_range_from = raw_co_copy_range_from,
 .bdrv_co_copy_range_to  = raw_co_copy_range_to,
-.bdrv_refresh_limits = raw_refresh_limits,
+.bdrv_refresh_limits = hdev_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
@@ -3728,7 +3753,7 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_co_preadv = raw_co_preadv,
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
-.bdrv_refresh_limits = raw_refresh_limits,
+.bdrv_refresh_limits = hdev_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
 .bdrv_attach_aio_context = raw_aio_attach_aio_context,
-- 
2.28.0




[PATCH RESEND v2 2/2] file-posix: add sg_get_max_segments that actually works with sg

2020-09-03 Thread Tom Yan
sg devices have different major/minor than their corresponding
block devices. Using sysfs to get max segments never really worked
for them.

Fortunately the sg driver provides an ioctl to get sg_tablesize,
which is apparently equivalent to max segments.

Signed-off-by: Tom Yan 
---
 block/file-posix.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 0c9124c8aa..d2cd9a3362 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1178,6 +1178,21 @@ static int sg_get_max_transfer_length(int fd)
 #endif
 }
 
+static int sg_get_max_segments(int fd)
+{
+#ifdef SG_GET_SG_TABLESIZE
+long max_segments = 0;
+
+if (ioctl(fd, SG_GET_SG_TABLESIZE, &max_segments) == 0) {
+return max_segments;
+} else {
+return -errno;
+}
+#else
+return -ENOSYS;
+#endif
+}
+
 static int get_max_transfer_length(int fd)
 {
 #if defined(BLKSECTGET) && defined(BLKSSZGET)
@@ -1267,7 +1282,7 @@ static void hdev_refresh_limits(BlockDriverState *bs, 
Error **errp)
 bs->bl.max_transfer = pow2floor(ret);
 }
 
-ret = get_max_segments(s->fd);
+ret = bs->sg ? sg_get_max_segments(s->fd) : get_max_segments(s->fd);
 if (ret > 0) {
 bs->bl.max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
ret * qemu_real_host_page_size);
-- 
2.28.0




Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Jason Wang



On 2020/9/4 上午3:46, Edgar E. Iglesias wrote:

On Thu, Sep 03, 2020 at 07:53:33PM +0200, Paolo Bonzini wrote:

On 03/09/20 17:50, Edgar E. Iglesias wrote:

Hmm, I guess it would make sense to have a configurable option in KVM
to isolate passthrough devices so they only can DMA to guest RAM...

Passthrough devices are always protected by the IOMMU, anything else
would be obviously insane^H^H^Hecure. :)

Really? To always do that blindly seems wrong.

I'm refering to the passthrough device not being able to reach registers
of other passthrough devices within the same guest.

Ah okay; sorry, I misunderstood.  That makes more sense now!

Multiple devices are put in the same IOMMU "container" (page table
basically), and that takes care of reaching registers of other
passthrough devices.

Thanks, yes, that's a sane default. What I was trying to say before is that
it may make sense to allow the user to "harden" the setup by selectivly
putting certain passthrough devs on a separate group that can *only*
DMA access guest RAM (not other device regs).



This makes sens but it requires the knowledge from the management layer 
of whether P2P is needed which is probably not easy.


Thanks




Some devs need access to other device's regs but many passthrough devs don't
need DMA access to anything else but RAM (e.g an Ethernet MAC).

That could mitigate the damage caused by wild DMA pointers...

Cheers,
Edgar






Re: [RFC PATCH 11/12] hw/pci: Only allow PCI slave devices to write to direct memory

2020-09-03 Thread Paolo Bonzini
On 03/09/20 15:18, Philippe Mathieu-Daudé wrote:
> As of this patch, all the non-PCI, but I plan to add a similar
> check for USB on top of this series.

Do you mean for memory-mapped USB host controllers?

>> I'm worried that there are cases of MMIO reads that would be broken.
>> They are certainly niche these days, but they should still work; the
>> most "famous" one is perhaps the old BASIC
>>
>>DEF SEG=
>>BLOAD "picture.pic", 0
> 
> This looks like ISA stuff. I don't think ISA does such checks
> (and didn't plan to add them there) but I'd need to verify.

It works on bare metal even with a modern video card.

> Do you have an acceptance test?

Nope. :(

Paolo




Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Edgar E. Iglesias
On Thu, Sep 03, 2020 at 07:53:33PM +0200, Paolo Bonzini wrote:
> On 03/09/20 17:50, Edgar E. Iglesias wrote:
> >>> Hmm, I guess it would make sense to have a configurable option in KVM
> >>> to isolate passthrough devices so they only can DMA to guest RAM...
> >>
> >> Passthrough devices are always protected by the IOMMU, anything else
> >> would be obviously insane^H^H^Hecure. :)
> > 
> > Really? To always do that blindly seems wrong.
> > 
> > I'm refering to the passthrough device not being able to reach registers
> > of other passthrough devices within the same guest.
> 
> Ah okay; sorry, I misunderstood.  That makes more sense now!
> 
> Multiple devices are put in the same IOMMU "container" (page table
> basically), and that takes care of reaching registers of other
> passthrough devices.

Thanks, yes, that's a sane default. What I was trying to say before is that
it may make sense to allow the user to "harden" the setup by selectivly
putting certain passthrough devs on a separate group that can *only*
DMA access guest RAM (not other device regs).

Some devs need access to other device's regs but many passthrough devs don't
need DMA access to anything else but RAM (e.g an Ethernet MAC).

That could mitigate the damage caused by wild DMA pointers...

Cheers,
Edgar



[PATCH 3/4] block/nbd: fix reconnect-delay

2020-09-03 Thread Vladimir Sementsov-Ogievskiy
reconnect-delay has a design flaw: we handle it in the same loop where
we do connection attempt. So, reconnect-delay may be exceeded by
unpredictable time of connection attempt.

Let's instead use separate timer.

How to reproduce the bug:

1. Create an image on node1:
   qemu-img create -f qcow2 xx 100M

2. Start NBD server on node1:
   qemu-nbd xx

3. On node2 start qemu-io:

./build/qemu-io --image-opts \
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=15

4. Type 'read 0 512' in qemu-io interface to check that connection
   works

Be careful: you should make steps 5-7 in a short time, less than 15
seconds.

5. Kill nbd server on node1

6. Run 'read 0 512' in qemu-io interface again, to be sure that nbd
client goes to reconnect loop.

7. On node1 run the following command

   sudo iptables -A INPUT -p tcp --dport 10809 -j DROP

This will make the connect() call of qemu-io at node2 take a long time.

And you'll see that read command in qemu-io will hang for a long time,
more than 15 seconds specified by reconnect-delay parameter. It's the
bug.

8. Don't forget to drop iptables rule on node1:

   sudo iptables -D INPUT -p tcp --dport 10809 -j DROP

Important note: Step [5] is necessary to reproduce _this_ bug. If we
miss step [5], the read command (step 6) will hang for a long time and
this commit doesn't help, because there will be not long connect() to
unreachable host, but long sendmsg() to unreachable host, which should
be fixed by enabling and adjusting keep-alive on the socket, which is a
thing for further patch set.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 59 +
 1 file changed, 50 insertions(+), 9 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index a495ad7ddf..caae0e6d31 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -122,6 +122,8 @@ typedef struct BDRVNBDState {
 Error *connect_err;
 bool wait_in_flight;
 
+QEMUTimer *reconnect_delay_timer;
+
 NBDClientRequest requests[MAX_NBD_REQUESTS];
 NBDReply reply;
 BlockDriverState *bs;
@@ -188,10 +190,49 @@ static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
 }
 }
 
+static void reconnect_delay_timer_del(BDRVNBDState *s)
+{
+if (s->reconnect_delay_timer) {
+timer_del(s->reconnect_delay_timer);
+timer_free(s->reconnect_delay_timer);
+s->reconnect_delay_timer = NULL;
+}
+}
+
+static void reconnect_delay_timer_cb(void *opaque)
+{
+BDRVNBDState *s = opaque;
+
+if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
+s->state = NBD_CLIENT_CONNECTING_NOWAIT;
+while (qemu_co_enter_next(&s->free_sema, NULL)) {
+/* Resume all queued requests */
+}
+}
+
+reconnect_delay_timer_del(s);
+}
+
+static void reconnect_delay_timer_init(BDRVNBDState *s, uint64_t 
expire_time_ns)
+{
+if (s->state != NBD_CLIENT_CONNECTING_WAIT) {
+return;
+}
+
+assert(!s->reconnect_delay_timer);
+s->reconnect_delay_timer = aio_timer_new(bdrv_get_aio_context(s->bs),
+ QEMU_CLOCK_REALTIME,
+ SCALE_NS,
+ reconnect_delay_timer_cb, s);
+timer_mod(s->reconnect_delay_timer, expire_time_ns);
+}
+
 static void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
 BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
 
+/* Timer is deleted in nbd_client_co_drain_begin() */
+assert(!s->reconnect_delay_timer);
 qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
 }
 
@@ -243,6 +284,8 @@ static void coroutine_fn 
nbd_client_co_drain_begin(BlockDriverState *bs)
 
 nbd_co_establish_connection_cancel(bs, false);
 
+reconnect_delay_timer_del(s);
+
 if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
 s->state = NBD_CLIENT_CONNECTING_NOWAIT;
qemu_co_queue_restart_all(&s->free_sema);
@@ -593,21 +636,17 @@ out:
 
 static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState *s)
 {
-uint64_t start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-uint64_t delay_ns = s->reconnect_delay * NANOSECONDS_PER_SECOND;
 uint64_t timeout = 1 * NANOSECONDS_PER_SECOND;
 uint64_t max_timeout = 16 * NANOSECONDS_PER_SECOND;
 
+if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
+reconnect_delay_timer_init(s, qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
+   s->reconnect_delay * 
NANOSECONDS_PER_SECOND);
+}
+
 nbd_reconnect_attempt(s);
 
 while (nbd_client_connecting(s)) {
-if (s->state == NBD_CLIENT_CONNECTING_WAIT &&
-qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time_ns > delay_ns)
-{
-s->state = NBD_CLIENT_CONNECTING_NOWAIT;
-qemu_co_queue_restart_all(&s->free_sema);
-}
-
 if (s->drained) {
 bdrv_dec_in_flight(s->bs);
 s->wait_drained_end = true;
@@ -629,6 +668,8 @@ 

[PATCH 2/4] block/nbd: correctly use qio_channel_detach_aio_context when needed

2020-09-03 Thread Vladimir Sementsov-Ogievskiy
Don't use nbd_client_detach_aio_context() driver handler where we want
to finalize the connection. We should directly use
qio_channel_detach_aio_context() in such cases. Driver handler may (and
will) contain another things, unrelated to the qio channel.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 912ea27be7..a495ad7ddf 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -549,7 +549,7 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState 
*s)
 
 /* Finalize previous connection if any */
 if (s->ioc) {
-nbd_client_detach_aio_context(s->bs);
+qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
 object_unref(OBJECT(s->sioc));
 s->sioc = NULL;
 object_unref(OBJECT(s->ioc));
@@ -707,7 +707,7 @@ static coroutine_fn void nbd_connection_entry(void *opaque)
 
 s->connection_co = NULL;
 if (s->ioc) {
-nbd_client_detach_aio_context(s->bs);
+qio_channel_detach_aio_context(QIO_CHANNEL(s->ioc));
 object_unref(OBJECT(s->sioc));
 s->sioc = NULL;
 object_unref(OBJECT(s->ioc));
-- 
2.18.0




[PATCH 0/4] nbd reconnect new fixes

2020-09-03 Thread Vladimir Sementsov-Ogievskiy
Hi! Let's continue fixing nbd reconnect feature.

These series is based on "[PULL 0/6] NBD patches for 2020-09-02"
Based-on: <20200902215400.2673028-1-ebl...@redhat.com>

Vladimir Sementsov-Ogievskiy (4):
  block/nbd: fix drain dead-lock because of nbd reconnect-delay
  block/nbd: correctly use qio_channel_detach_aio_context when needed
  block/nbd: fix reconnect-delay
  block/nbd: nbd_co_reconnect_loop(): don't connect if drained

 block/nbd.c | 71 -
 1 file changed, 60 insertions(+), 11 deletions(-)

-- 
2.18.0




[PATCH 4/4] block/nbd: nbd_co_reconnect_loop(): don't connect if drained

2020-09-03 Thread Vladimir Sementsov-Ogievskiy
In a recent commit 12c75e20a269ac we've improved
nbd_co_reconnect_loop() to not make drain wait for additional sleep.
Similarly, we shouldn't try to connect, if previous sleep was
interrupted by drain begin, otherwise drain_begin will have to wait for
the whole connection attempt.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index caae0e6d31..4548046cd7 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -661,6 +661,9 @@ static coroutine_fn void nbd_co_reconnect_loop(BDRVNBDState 
*s)
 } else {
 qemu_co_sleep_ns_wakeable(QEMU_CLOCK_REALTIME, timeout,
  &s->connection_co_sleep_ns_state);
+if (s->drained) {
+continue;
+}
 if (timeout < max_timeout) {
 timeout *= 2;
 }
-- 
2.18.0




[PATCH 1/4] block/nbd: fix drain dead-lock because of nbd reconnect-delay

2020-09-03 Thread Vladimir Sementsov-Ogievskiy
We pause reconnect process during drained section. So, if we have some
requests, waiting for reconnect we should cancel them, otherwise they
deadlock the drained section.

How to reproduce:

1. Create an image:
   qemu-img create -f qcow2 xx 100M

2. Start NBD server:
   qemu-nbd xx

3. Start vm with second nbd disk on node2, like this:

  ./build/x86_64-softmmu/qemu-system-x86_64 -nodefaults -drive \
 file=/work/images/cent7.qcow2 -drive \
 
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=60
 \
 -vnc :0 -m 2G -enable-kvm -vga std

4. Access the vm through vnc (or some other way?), and check that NBD
   drive works:

   dd if=/dev/sdb of=/dev/null bs=1M count=10

   - the command should succeed.

5. Now, kill the nbd server, and run dd in the guest again:

   dd if=/dev/sdb of=/dev/null bs=1M count=10

Now Qemu is trying to reconnect, and dd-generated requests are waiting
for the connection (they will wait up to 60 seconds (see
reconnect-delay option above) and than fail). But suddenly, vm may
totally hang in the deadlock. You may need to increase reconnect-delay
period to catch the dead-lock.

VM doesn't respond because drain dead-lock happens in cpu thread with
global mutex taken. That's not good thing by itself and is not fixed
by this commit (true way is using iothreads). Still this commit fixes
drain dead-lock itself.

Note: probably, we can instead continue to reconnect during drained
section. To achieve this, we may move negotiation to the connect thread
to make it independent of bs aio context. But expanding drained section
doesn't seem good anyway. So, let's now fix the bug the simplest way.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 9daf003bea..912ea27be7 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -242,6 +242,11 @@ static void coroutine_fn 
nbd_client_co_drain_begin(BlockDriverState *bs)
 }
 
 nbd_co_establish_connection_cancel(bs, false);
+
+if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
+s->state = NBD_CLIENT_CONNECTING_NOWAIT;
+qemu_co_queue_restart_all(&s->free_sema);
+}
 }
 
 static void coroutine_fn nbd_client_co_drain_end(BlockDriverState *bs)
-- 
2.18.0




[PATCH v2] hw/ide: check null block before _cancel_dma_sync

2020-09-03 Thread P J P
From: Prasad J Pandit 

When cancelling an i/o operation via ide_cancel_dma_sync(),
a block pointer may be null. Add check to avoid null pointer
dereference.

 -> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Fide_nullptr1
 ==1803100==Hint: address points to the zero page.
 #0 blk_bs ../block/block-backend.c:714
 #1 blk_drain ../block/block-backend.c:1715
 #2 ide_cancel_dma_sync ../hw/ide/core.c:723
 #3 bmdma_cmd_writeb ../hw/ide/pci.c:298
 #4 bmdma_write ../hw/ide/piix.c:75
 #5 memory_region_write_accessor ../softmmu/memory.c:483
 #6 access_with_adjusted_size ../softmmu/memory.c:544
 #7 memory_region_dispatch_write ../softmmu/memory.c:1465
 #8 flatview_write_continue ../exec.c:3176
 ...

Reported-by: Ruhr-University 
Signed-off-by: Prasad J Pandit 
---
 hw/ide/core.c | 1 +
 hw/ide/pci.c  | 5 -
 2 files changed, 5 insertions(+), 1 deletion(-)

Update v2: use an assert() call
 -> https://lists.nongnu.org/archive/html/qemu-devel/2020-08/msg08336.html

diff --git a/hw/ide/core.c b/hw/ide/core.c
index f76f7e5234..8105187f49 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -718,6 +718,7 @@ void ide_cancel_dma_sync(IDEState *s)
  * whole DMA operation will be submitted to disk with a single
  * aio operation with preadv/pwritev.
  */
+assert(s->blk);
 if (s->bus->dma->aiocb) {
 trace_ide_cancel_dma_sync_remaining();
 blk_drain(s->blk);
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index b50091b615..b47e675456 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -295,7 +295,10 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
 /* Ignore writes to SSBM if it keeps the old value */
 if ((val & BM_CMD_START) != (bm->cmd & BM_CMD_START)) {
 if (!(val & BM_CMD_START)) {
-ide_cancel_dma_sync(idebus_active_if(bm->bus));
+IDEState *s = idebus_active_if(bm->bus);
+if (s->blk) {
+ide_cancel_dma_sync(s);
+}
 bm->status &= ~BM_STATUS_DMAING;
 } else {
 bm->cur_addr = bm->addr;
-- 
2.26.2




Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Paolo Bonzini
On 03/09/20 17:50, Edgar E. Iglesias wrote:
>>> Hmm, I guess it would make sense to have a configurable option in KVM
>>> to isolate passthrough devices so they only can DMA to guest RAM...
>>
>> Passthrough devices are always protected by the IOMMU, anything else
>> would be obviously insane^H^H^Hecure. :)
> 
> Really? To always do that blindly seems wrong.
> 
> I'm refering to the passthrough device not being able to reach registers
> of other passthrough devices within the same guest.

Ah okay; sorry, I misunderstood.  That makes more sense now!

Multiple devices are put in the same IOMMU "container" (page table
basically), and that takes care of reaching registers of other
passthrough devices.

Paolo

> Obviously the IOMMU should be setup so that passthrough devices don't reach\
> other guests or the host.




Re: [PATCH v1] sd: sdhci: assert data_count is within fifo_buffer

2020-09-03 Thread P J P
+-- On Thu, 3 Sep 2020, Philippe Mathieu-Daudé wrote --+
| If you don't mind I might split the assert in 2 when applying:
| 
| -assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
| +assert(s->data_count <= s->buf_maxsz);
| +assert(s->data_count > begin);

Sure, np. Thank you.
--
Prasad J Pandit / Red Hat Product Security Team
8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D

[PATCH 4/4] hw/sd/sdhci: Yield if interrupt delivered during multiple transfer

2020-09-03 Thread Philippe Mathieu-Daudé
The Descriptor Table has a bit to allow the DMA to generates
Interrupt when the operation of the descriptor line is completed
(see "1.13.4. Descriptor Table" of 'SD Host Controller Simplified
Specification Version 2.00').

If we have pending interrupt and the descriptor requires it
to be generated as soon as it is completed, reschedule pending
transfers and yield to the CPU.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sdhci.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 06cb098036c..74b0bf77103 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -834,7 +834,10 @@ static void sdhci_do_adma(SDHCIState *s)
 s->norintsts |= SDHC_NIS_DMA;
 }
 
-sdhci_update_irq(s);
+if (sdhci_update_irq(s) && !(dscr.attr & SDHC_ADMA_ATTR_END)) {
+/* IRQ delivered, reschedule current transfer */
+break;
+}
 }
 
 /* ADMA transfer terminates if blkcnt == 0 or by END attribute */
-- 
2.26.2




Re: [PATCH v1] sd: sdhci: assert data_count is within fifo_buffer

2020-09-03 Thread P J P
+-- On Thu, 3 Sep 2020, Philippe Mathieu-Daudé wrote --+
| > -assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
| > +assert(s->data_count <= s->buf_maxsz);
| > +assert(s->data_count > begin);
| 
| Doesn't seem enough, guest crash here, having:
| 
| (gdb) p begin
| $1 = 0
| (gdb) p s->data_count
| $2 = 0

I was actually thinking of a case if 's->data_count' and 'begin' are same? It 
may lead to an infinite loop condition.

| (gdb) p s->blksize
| $3 = 0

This is strange. 

| Beh, something is wrong in this model, because when using ADMA2
| length 0 means 65536 bytes (see '1.13.4. Descriptor Table' in
| "SD Host Controller Simplified Specification Version 2.00").

* DMA length 's->data_count - begin'?

* if s->blksize is 65536, it'd set 'block_size = 0' in transfer_multi_blocks()

   #define BLOCK_SIZE_MASK (4 * KiB - 1)  <== 0xFFF

   static void sdhci_sdma_transfer_multi_blocks(SDHCIState *s)  
   
   {
 ...
 const uint16_t block_size = s->blksize & BLOCK_SIZE_MASK;  <== 0


Thank you.
--
Prasad J Pandit / Red Hat Product Security Team
8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D

[PATCH 2/4] hw/sd/sdhci: Resume pending DMA transfers on MMIO accesses

2020-09-03 Thread Philippe Mathieu-Daudé
If we have pending DMA requests scheduled, process them first.
So far we don't need to implement a bottom half to process them.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sdhci.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 703357e94a7..2b197631870 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -945,11 +945,21 @@ sdhci_buff_access_is_sequential(SDHCIState *s, unsigned 
byte_num)
 return true;
 }
 
+static void sdhci_resume_pending_transfer(SDHCIState *s)
+{
+timer_del(s->transfer_timer);
+sdhci_data_transfer(s);
+}
+
 static uint64_t sdhci_read(void *opaque, hwaddr offset, unsigned size)
 {
 SDHCIState *s = (SDHCIState *)opaque;
 uint32_t ret = 0;
 
+if (timer_pending(s->transfer_timer)) {
+sdhci_resume_pending_transfer(s);
+}
+
 switch (offset & ~0x3) {
 case SDHC_SYSAD:
 ret = s->sdmasysad;
@@ -1093,6 +1103,10 @@ sdhci_write(void *opaque, hwaddr offset, uint64_t val, 
unsigned size)
 uint32_t value = val;
 value <<= shift;
 
+if (timer_pending(s->transfer_timer)) {
+sdhci_resume_pending_transfer(s);
+}
+
 switch (offset & ~0x3) {
 case SDHC_SYSAD:
 s->sdmasysad = (s->sdmasysad & mask) | value;
-- 
2.26.2




[PATCH 3/4] hw/sd/sdhci: Let sdhci_update_irq() return if IRQ was delivered

2020-09-03 Thread Philippe Mathieu-Daudé
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sdhci.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index 2b197631870..06cb098036c 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -215,9 +215,14 @@ static uint8_t sdhci_slotint(SDHCIState *s)
  ((s->norintsts & SDHC_NIS_REMOVE) && (s->wakcon & SDHC_WKUP_ON_RMV));
 }
 
-static inline void sdhci_update_irq(SDHCIState *s)
+/* Return true if IRQ was pending and delivered */
+static bool sdhci_update_irq(SDHCIState *s)
 {
-qemu_set_irq(s->irq, sdhci_slotint(s));
+bool pending = sdhci_slotint(s);
+
+qemu_set_irq(s->irq, pending);
+
+return pending;
 }
 
 static void sdhci_raise_insertion_irq(void *opaque)
-- 
2.26.2




[PATCH 1/4] hw/sd/sdhci: Stop multiple transfers when block count is cleared

2020-09-03 Thread Philippe Mathieu-Daudé
Clearing BlockCount stops multiple transfers.

See "SD Host Controller Simplified Specification Version 2.00":

- 2.2.3. Block Count Register (Offset 006h)
- Table 2-8 : Determination of Transfer Type

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sdhci.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
index ecbf84e9d3f..703357e94a7 100644
--- a/hw/sd/sdhci.c
+++ b/hw/sd/sdhci.c
@@ -728,6 +728,12 @@ static void sdhci_do_adma(SDHCIState *s)
 ADMADescr dscr = {};
 int i;
 
+if (s->trnmod & SDHC_TRNS_BLK_CNT_EN && !s->blkcnt) {
+/* Stop Multiple Transfer */
+sdhci_end_transfer(s);
+return;
+}
+
 for (i = 0; i < SDHC_ADMA_DESCS_PER_DELAY; ++i) {
 s->admaerr &= ~SDHC_ADMAERR_LENGTH_MISMATCH;
 
@@ -753,7 +759,6 @@ static void sdhci_do_adma(SDHCIState *s)
 
 switch (dscr.attr & SDHC_ADMA_ATTR_ACT_MASK) {
 case SDHC_ADMA_ATTR_ACT_TRAN:  /* data transfer */
-
 if (s->trnmod & SDHC_TRNS_READ) {
 while (length) {
 if (s->data_count == 0) {
-- 
2.26.2




[PATCH 0/4] hw/sd/sdhci: Strengthen multiple DMA transfers

2020-09-03 Thread Philippe Mathieu-Daudé
Still trying to fix the bugs reported by Aleksander...

- Do not send 0 block count
- Reduce DMA to MMIO re-entrancy by yielding when pending IRQ

Based-on: <20200901140411.112150-1-f4...@amsat.org>

Philippe Mathieu-Daudé (4):
  hw/sd/sdhci: Stop multiple transfers when block count is cleared
  hw/sd/sdhci: Resume pending DMA transfers on MMIO accesses
  hw/sd/sdhci: Let sdhci_update_irq() return if IRQ was delivered
  hw/sd/sdhci: Yield if interrupt delivered during multiple transfer

 hw/sd/sdhci.c | 35 +++
 1 file changed, 31 insertions(+), 4 deletions(-)

-- 
2.26.2




Re: [PATCH] iotests: Allow running from different directory

2020-09-03 Thread Kevin Wolf
Am 03.09.2020 um 14:54 hat Max Reitz geschrieben:
> On 02.09.20 13:03, Kevin Wolf wrote:
> > It is convenient to be able to edit the tests and run them without
> > changing the current working directory back and forth. Instead of
> > assuming that $PWD is the qemu-iotests build directory, derive the build
> > directory from the executed script.
> > 
> > This allows 'check' to find the required files even when called from
> > another directory. The scratch directory will still be in the current
> > working directory.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >  tests/qemu-iotests/check | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
> > index 3ab859ac1a..22ada6a549 100755
> > --- a/tests/qemu-iotests/check
> > +++ b/tests/qemu-iotests/check
> > @@ -44,7 +44,7 @@ then
> >  _init_error "failed to obtain source tree name from check symlink"
> >  fi
> >  source_iotests=$(cd "$source_iotests"; pwd) || _init_error "failed to 
> > enter source tree"
> > -build_iotests=$PWD
> > +build_iotests=$(dirname "$0")
> 
> This breaks running check from the build tree.
> (i.e. cd $build/tests/qemu-iotests; ./check)
> 
> The problem is that to run the test, we do cd to the source directory
> ($source_iotests), and so $build_iotests then becomes invalid if it’s
> just a relative path.  In my case, this leads to the following error:
> 
> -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
> +./common.rc: line 139: $QEMU/tests/qemu-iotests/../../qemu-img: No such
> file or directory

Ah, my symlinks in the source tree made it work for me.

> I think this could be resolved by wrapping the $(dirname) in
> $(realpath), i.e.
> 
> build_iotests=$(realpath "$(dirname "$0")")

Sounds good, I'll update it in my tree.

Kevin


signature.asc
Description: PGP signature


Re: [PATCH v6 8/8] block/file: switch to use qemu_open/qemu_create for improved errors

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> Currently at startup if using cache=none on a filesystem lacking
> O_DIRECT such as tmpfs, at startup QEMU prints
> 
> qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: file system may not 
> support O_DIRECT
> qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Could not open 
> '/tmp/foo.img': Invalid argument
> 
> while at QMP level the hint is missing, so QEMU reports just
> 
>   "error": {
>   "class": "GenericError",
>   "desc": "Could not open '/tmp/foo.img': Invalid argument"
>   }
> 
> which is close to useless for the end user trying to figure out what
> they did wrong.
> 
> With this change at startup QEMU prints
> 
> qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Unable to open 
> '/tmp/foo.img': filesystem does not support O_DIRECT
> 
> while at the QMP level QEMU reports a massively more informative
> 
>   "error": {
>  "class": "GenericError",
>  "desc": "Unable to open '/tmp/foo.img': filesystem does not support 
> O_DIRECT"
>   }
> 
> Reviewed-by: Eric Blake 
> Reviewed-by: Philippe Mathieu-Daudé 
> Reviewed-by: Markus Armbruster 
> Signed-off-by: Daniel P. Berrangé 
> ---

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 7/8] util: give a specific error message when O_DIRECT doesn't work

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> A common error scenario is to tell QEMU to use O_DIRECT in combination
> with a filesystem that doesn't support it. To aid users to diagnosing
> their mistake we want to provide a clear error message when this happens.
> 
> Reviewed-by: Markus Armbruster 
> Reviewed-by: Eric Blake 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  util/osdep.c | 15 ++-
>  1 file changed, 14 insertions(+), 1 deletion(-)

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 6/8] util: introduce qemu_open and qemu_create with error reporting

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> qemu_open_old() works like open(): set errno and return -1 on failure.
> It has even more failure modes, though.  Reporting the error clearly
> to users is basically impossible for many of them.
> 
> Our standard cure for "errno is too coarse" is the Error object.
> Introduce two new helper methods:
> 
>   int qemu_open(const char *name, int flags, Error **errp);
>   int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
> 
> Note that with this design we no longer require or even accept the
> O_CREAT flag. Avoiding overloading the two distinct operations
> means we can avoid variable arguments which would prevent 'errp' from
> being the last argument. It also gives us a guarantee that the 'mode' is
> given when creating files, avoiding a latent security bug.
> 
> Reviewed-by: Markus Armbruster 
> Signed-off-by: Daniel P. Berrangé 
> ---

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 5/8] util: add Error object for qemu_open_internal error reporting

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> Instead of relying on the limited information from errno, we can now
> also provide detailed error messages to callers that ask for it.
> 
> Reviewed-by: Markus Armbruster 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  util/osdep.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 3/8] util: rename qemu_open() to qemu_open_old()

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> We want to introduce a new version of qemu_open() that uses an Error
> object for reporting problems and make this it the preferred interface.
> Rename the existing method to release the namespace for the new impl.
> 
> Reviewed-by: Eric Blake 
> Reviewed-by: Philippe Mathieu-Daudé 
> Reviewed-by: Markus Armbruster 
> Signed-off-by: Daniel P. Berrangé 
> ---

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 2/8] util: split off a helper for dealing with O_CLOEXEC flag

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> We're going to have multiple callers to open() from qemu_open()
> soon. Readability would thus benefit from having a helper for
> dealing with O_CLOEXEC.
> 
> Reviewed-by: Markus Armbruster 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  util/osdep.c | 23 +++
>  1 file changed, 15 insertions(+), 8 deletions(-)

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 4/8] util: refactor qemu_open_old to split off variadic args handling

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> This simple refactoring prepares for future patches. The variadic args
> handling is split from the main bulk of the open logic.
> 
> Reviewed-by: Markus Armbruster 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  util/osdep.c | 24 +---
>  1 file changed, 17 insertions(+), 7 deletions(-)

Reviewed-by: Richard Henderson 

r~




Re: [PATCH v6 1/8] monitor: simplify functions for getting a dup'd fdset entry

2020-09-03 Thread Richard Henderson
On 9/3/20 8:22 AM, Daniel P. Berrangé wrote:
> Currently code has to call monitor_fdset_get_fd, then dup
> the return fd, and then add the duplicate FD back into the
> fdset. This dance is overly verbose for the caller and
> introduces extra failure modes which can be avoided by
> folding all the logic into monitor_fdset_dup_fd_add and
> removing monitor_fdset_get_fd entirely.
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  include/monitor/monitor.h |  3 +-
>  include/qemu/osdep.h  |  1 +
>  monitor/misc.c| 58 +--
>  stubs/fdset.c |  8 ++
>  util/osdep.c  | 19 ++---
>  5 files changed, 32 insertions(+), 57 deletions(-)

Reviewed-by: Richard Henderson 

r~



Re: [PATCH 00/63] qom: Rename macros for consistency

2020-09-03 Thread Eduardo Habkost
On Wed, Sep 02, 2020 at 06:42:08PM -0400, Eduardo Habkost wrote:
> Reducing boilerplate QOM code using the new OBJECT_DEFINE_* and
> OBJECT_DECLARE_* macros is quite difficult when there are
> multiple ways a TYPE_* constant name is written.
> 
> This series renames many type checking macros and/or TYPE_*
> constants to make sure they are consistent.
> 
> This series is based on machine-next and can be fetched from:
> 
>   https://github.com/ehabkost/qemu-hacks work/qom-rename-macros

I'm queueing:

[PATCH 01/63] gpex: Fix type checking function name
[PATCH 02/63] chardev: Rename TYPE_CHARDEV_* to TYPE_*_CHARDEV
[PATCH 05/63] ap-device: Rename AP_DEVICE_TYPE to TYPE_AP_DEVICE
[PATCH 06/63] dev-smartcard-reader: Rename CCID_DEV_NAME to TYPE_USB_CCID_DEV
[PATCH 08/63] vfio: Rename VFIO_AP_DEVICE_TYPE to TYPE_VFIO_AP_DEVICE
[PATCH 10/63] vmgenid: Rename VMGENID_DEVICE to TYPE_VMGENID
[PATCH 32/63] ahci: Rename ICH_AHCI to ICH9_AHCI
[PATCH 39/63] esp: Rename ESP_STATE to ESP
[PATCH 40/63] filter-rewriter: Rename FILTER_COLO_REWRITER to FILTER_REWRITER
[PATCH 47/63] rs6000_mc: Rename RS6000MC_DEVICE to RS6000MC
[PATCH 48/63] sabre: Rename SABRE_DEVICE to SABRE
[PATCH 53/63] usb: Rename USB_SERIAL_DEV to USB_SERIAL
[PATCH 55/63] vfio: Rename PCI_VFIO to VFIO_PCI

-- 
Eduardo




[PATCH v2 1/3] qcow2: Fix removal of list members from BDRVQcow2State.cluster_allocs

2020-09-03 Thread Alberto Garcia
When a write request needs to allocate new clusters (or change the L2
bitmap of existing ones) a QCowL2Meta structure is created so the L2
metadata can be later updated and any copy-on-write can be performed
if necessary.

A write request can span a region consisting of an arbitrary
combination of previously unallocated and allocated clusters, and if
the unallocated ones can be put contiguous to the existing ones then
QEMU will do so in order to minimize the number of write operations.

In practice this means that a write request has not just one but a
number of QCowL2Meta structures. All of them are added to the
cluster_allocs list that is stored in BDRVQcow2State and is used to
detect overlapping requests. After the write request finishes all its
associated QCowL2Meta are removed from that list. calculate_l2_meta()
takes care of creating and putting those structures in the list, and
qcow2_handle_l2meta() takes care of removing them.

The problem is that the error path in handle_alloc() also tries to
remove an item in that list, a remnant from the time when this was
handled there (that code would not even be correct anymore because
it only removes one struct and not all the ones from the same write
request).

This can trigger a double removal of the same item from the list,
causing a crash. This is not easy to reproduce in practice because
it requires that do_alloc_cluster_offset() fails after a successful
previous allocation during the same write request, but it can be
reproduced with the included test case.

Signed-off-by: Alberto Garcia 
---
 block/qcow2-cluster.c  |  3 --
 tests/qemu-iotests/305 | 74 ++
 tests/qemu-iotests/305.out | 16 +
 tests/qemu-iotests/group   |  1 +
 4 files changed, 91 insertions(+), 3 deletions(-)
 create mode 100755 tests/qemu-iotests/305
 create mode 100644 tests/qemu-iotests/305.out

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 996b3314f4..25e38daa78 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1709,9 +1709,6 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 
 out:
 qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-if (ret < 0 && *m && (*m)->nb_clusters > 0) {
-QLIST_REMOVE(*m, next_in_flight);
-}
 return ret;
 }
 
diff --git a/tests/qemu-iotests/305 b/tests/qemu-iotests/305
new file mode 100755
index 00..768818af4a
--- /dev/null
+++ b/tests/qemu-iotests/305
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+#
+# Test the handling of errors in write requests with multiple allocations
+#
+# Copyright (C) 2020 Igalia, S.L.
+# Author: Alberto Garcia 
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=be...@igalia.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1# failure is the default!
+
+_cleanup()
+{
+_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+_unsupported_imgopts cluster_size refcount_bits extended_l2 compat=0.10 
data_file
+
+echo '### Create the image'
+_make_test_img -o refcount_bits=64,cluster_size=1k 1M
+
+# The reference counts of the clusters for the first 123k of this
+# write request are stored in the first refcount block. The last
+# cluster (guest offset 123k) is referenced in the second refcount
+# block.
+echo '### Fill the first refcount block and one data cluster from the second'
+$QEMU_IO -c 'write 0 124k' "$TEST_IMG" | _filter_qemu_io
+
+echo '### Discard two of the last data clusters, leave one in the middle'
+$QEMU_IO -c 'discard 121k 1k' "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c 'discard 123k 1k' "$TEST_IMG" | _filter_qemu_io
+
+echo '### Corrupt the offset of the second refcount block'
+refcount_table_offset=$(peek_file_be "$TEST_IMG" 48 8)
+poke_file "$TEST_IMG" $(($refcount_table_offset+14)) "\x06"
+
+# This tries to allocate the two clusters discarded earlier (guest
+# offsets 121k and 123k). Their reference counts are in the first and
+# second refcount blocks respectively, but only the first one can be
+# allocated correctly because the second entry of the refcount table
+# is corrupted.
+echo '### Try to allocate the discarded clusters again'
+$QEMU_IO -c 'write 121k 3k' "$TEST_IMG" | 

[PATCH v2 2/3] qcow2: Don't check nb_clusters when removing l2meta from the list

2020-09-03 Thread Alberto Garcia
In the past, when a new cluster was allocated the l2meta structure was
a variable in the stack so it was necessary to have a way to tell
whether it had been initialized and contained valid data or not. The
nb_clusters field was used for this purpose. Since commit f50f88b9fe
this is no longer the case, l2meta (nowadays a pointer to a list) is
only allocated when needed and nb_clusters is guaranteed to be > 0 so
this check is unnecessary.

Signed-off-by: Alberto Garcia 
---
 block/qcow2.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index da56b1a4df..54a7d2f475 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2112,9 +2112,7 @@ static coroutine_fn int 
qcow2_handle_l2meta(BlockDriverState *bs,
 }
 
 /* Take the request off the list of running requests */
-if (l2meta->nb_clusters != 0) {
-QLIST_REMOVE(l2meta, next_in_flight);
-}
+QLIST_REMOVE(l2meta, next_in_flight);
 
 qemu_co_queue_restart_all(&l2meta->dependent_requests);
 
-- 
2.20.1




[PATCH v2 3/3] qcow2: Rewrite the documentation of qcow2_alloc_cluster_offset()

2020-09-03 Thread Alberto Garcia
The current text corresponds to an earlier, simpler version of this
function and it does not explain how it works now.

Signed-off-by: Alberto Garcia 
---
 block/qcow2-cluster.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 25e38daa78..f1ce6afcf5 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1713,18 +1713,22 @@ out:
 }
 
 /*
- * alloc_cluster_offset
+ * For a given area on the virtual disk defined by @offset and @bytes,
+ * find the corresponding area on the qcow2 image, allocating new
+ * clusters (or subclusters) if necessary. The result can span a
+ * combination of allocated and previously unallocated clusters.
  *
- * For a given offset on the virtual disk, find the cluster offset in qcow2
- * file. If the offset is not found, allocate a new cluster.
+ * On return, @host_offset is set to the beginning of the requested
+ * area. This area is guaranteed to be contiguous on the qcow2 file
+ * but it can be smaller than initially requested. In this case @bytes
+ * is updated with the actual size.
  *
- * If the cluster was already allocated, m->nb_clusters is set to 0 and
- * other fields in m are meaningless.
- *
- * If the cluster is newly allocated, m->nb_clusters is set to the number of
- * contiguous clusters that have been allocated. In this case, the other
- * fields of m are valid and contain information about the first allocated
- * cluster.
+ * If any clusters or subclusters were allocated then @m contains a
+ * list with the information of all the affected regions. Note that
+ * this can happen regardless of whether this function succeeds or
+ * not. The caller is responsible for updating the L2 metadata of the
+ * allocated clusters (on success) or freeing them (on failure), and
+ * for clearing the contents of @m afterwards in both cases.
  *
  * If the request conflicts with another write request in flight, the coroutine
  * is queued and will be reentered when the dependency has completed.
-- 
2.20.1




[PATCH v2 0/3] qcow2: Fix removal of list members from BDRVQcow2State.cluster_allocs

2020-09-03 Thread Alberto Garcia
Hi,

here are the changes from v1:

- Split changes into three different patches.
- Rewrite the documentation of qcow2_alloc_cluster_offset() [Max]
- Use peek_file_be in the test case to read the offset of the refcount
  table [Max].
- Extend the list of unsupported options for the test case [Max].

Berto

Alberto Garcia (3):
  qcow2: Fix removal of list members from BDRVQcow2State.cluster_allocs
  qcow2: Don't check nb_clusters when removing l2meta from the list
  qcow2: Rewrite the documentation of qcow2_alloc_cluster_offset()

 block/qcow2-cluster.c  | 27 +++---
 block/qcow2.c  |  4 +--
 tests/qemu-iotests/305 | 74 ++
 tests/qemu-iotests/305.out | 16 +
 tests/qemu-iotests/group   |  1 +
 5 files changed, 106 insertions(+), 16 deletions(-)
 create mode 100755 tests/qemu-iotests/305
 create mode 100644 tests/qemu-iotests/305.out

-- 
2.20.1




Re: [PATCH v1] sd: sdhci: assert data_count is within fifo_buffer

2020-09-03 Thread Philippe Mathieu-Daudé
On 9/3/20 4:01 PM, Philippe Mathieu-Daudé wrote:
> On 9/3/20 9:08 AM, P J P wrote:
>> From: Prasad J Pandit 
>>
>> While doing multi block SDMA, transfer block size may exceed
>> the 's->fifo_buffer[s->buf_maxsz]' size. It may leave the
>> current element pointer 's->data_count' pointing out of bounds.
>> Leading the subsequent DMA r/w operation to OOB access issue.
>> Assert that 's->data_count' is within fifo_buffer.
>>
>>  -> 
>> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Fsdhci_oob_write1
>>  ==1459837==ERROR: AddressSanitizer: heap-buffer-overflow
>>  WRITE of size 54722048 at 0x6151e280 thread T3
>>  #0  __interceptor_memcpy (/lib64/libasan.so.6+0x3a71d)
>>  #1  flatview_read_continue ../exec.c:3245
>>  #2  flatview_read ../exec.c:3278
>>  #3  address_space_read_full ../exec.c:3291
>>  #4  address_space_rw ../exec.c:3319
>>  #5  dma_memory_rw_relaxed ../include/sysemu/dma.h:87
>>  #6  dma_memory_rw ../include/sysemu/dma.h:110
>>  #7  dma_memory_read ../include/sysemu/dma.h:116
>>  #8  sdhci_sdma_transfer_multi_blocks ../hw/sd/sdhci.c:629
>>  #9  sdhci_write ../hw/sd/sdhci.c:1097
>>  #10 memory_region_write_accessor ../softmmu/memory.c:483
>>  ...
>>
>> Reported-by: Ruhr-University 
>> Suggested-by: Philippe Mathieu-Daudé 
>> Signed-off-by: Prasad J Pandit 
>> ---
>>  hw/sd/sdhci.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> Update v1: use assert(3) calls
>>   -> https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg00966.html
>>
>> diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
>> index 1785d7e1f7..023acbed41 100644
>> --- a/hw/sd/sdhci.c
>> +++ b/hw/sd/sdhci.c
>> @@ -604,6 +604,7 @@ static void sdhci_sdma_transfer_multi_blocks(SDHCIState 
>> *s)
>>  s->blkcnt--;
>>  }
>>  }
>> +assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
>>  dma_memory_write(s->dma_as, s->sdmasysad,
>>   &s->fifo_buffer[begin], s->data_count - begin);
>>  s->sdmasysad += s->data_count - begin;
>> @@ -626,6 +627,7 @@ static void sdhci_sdma_transfer_multi_blocks(SDHCIState 
>> *s)
>>  s->data_count = block_size;
>>  boundary_count -= block_size - begin;
>>  }
>> +assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
>>  dma_memory_read(s->dma_as, s->sdmasysad,
>>  &s->fifo_buffer[begin], s->data_count - begin);
>>  s->sdmasysad += s->data_count - begin;
>>
> 
> qemu-system-i386: hw/sd/sdhci.c:632: void
> sdhci_sdma_transfer_multi_blocks(SDHCIState *): Assertion `s->data_count
> <= s->buf_maxsz && s->data_count > begin' failed.
> 
> Tested-by: Philippe Mathieu-Daudé 
> 
> If you don't mind I might split the assert in 2 when applying:
> 
> -assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
> +assert(s->data_count <= s->buf_maxsz);
> +assert(s->data_count > begin);

Doesn't seem enough, guest crash here, having:

(gdb) p begin
$1 = 0
(gdb) p s->data_count
$2 = 0
(gdb) p s->blksize
$3 = 0

Beh, something is wrong in this model, because when using ADMA2
length 0 means 65536 bytes (see '1.13.4. Descriptor Table' in
"SD Host Controller Simplified Specification Version 2.00").



Re: [RFC PATCH 12/12] dma: Assert when device writes to indirect memory (such MMIO regions)

2020-09-03 Thread Edgar E. Iglesias
On Thu, Sep 03, 2020 at 01:08:31PM +0200, Philippe Mathieu-Daudé wrote:
> Assert DMA accesses are done on direct memory (in particular
> to catch invalid accesses to MMIO regions).

Hi Philippe,

Is the motivation for this to make it easier to find DMA programming errors?
Shouldn't guest SW use the IOMMU/DMA-APIs to debug those?

There're valid use-cases where DMA devices target non-memory, in particular
in the embedded space but also on PCI systems.

Also, since guest SW programs the DMA registers, guest SW would be able to trig 
this assert at will...

Cheers,
Edgar




> 
> Example with the reproducer from LP#1886362 (see previous commit):
> 
>   qemu-system-i386: include/sysemu/dma.h:111: int dma_memory_rw(AddressSpace 
> *, dma_addr_t, void *, dma_addr_t, DMADirection, MemTxAttrs): Assertion `dir 
> == DMA_DIRECTION_TO_DEVICE || attrs.direct_access' failed.
>   (gdb) bt
>   #0  0x751d69e5 in raise () at /lib64/libc.so.6
>   #1  0x751bf895 in abort () at /lib64/libc.so.6
>   #2  0x751bf769 in _nl_load_domain.cold () at /lib64/libc.so.6
>   #3  0x751cee76 in annobin_assert.c_end () at /lib64/libc.so.6
>   #4  0x57b48a94 in dma_memory_rw (as=0x7fffddd3ca28, addr=4064, 
> buf=0x7fff7780, len=16, dir=DMA_DIRECTION_FROM_DEVICE, attrs=...) at 
> /home/phil/source/qemu/include/sysemu/dma.h:111
>   #5  0x57b487e0 in pci_dma_rw (dev=0x7fffddd3c800, addr=4064, 
> buf=0x7fff7780, len=16, dir=DMA_DIRECTION_FROM_DEVICE) at 
> /home/phil/source/qemu/include/hw/pci/pci.h:791
>   #6  0x57b47373 in pci_dma_write (dev=0x7fffddd3c800, addr=4064, 
> buf=0x7fff7780, len=16) at /home/phil/source/qemu/include/hw/pci/pci.h:804
>   #7  0x57b340b4 in e1000e_write_packet_to_guest 
> (core=0x7fffddd3f4e0, pkt=0x6116c740, rxr=0x7fff7cf0, 
> rss_info=0x7fff7d10) at hw/net/e1000e_core.c:1609
>   #8  0x57b30739 in e1000e_receive_iov (core=0x7fffddd3f4e0, 
> iov=0x61960e80, iovcnt=4) at hw/net/e1000e_core.c:1709
>   #9  0x576e2069 in e1000e_nc_receive_iov (nc=0x6140a060, 
> iov=0x61960e80, iovcnt=4) at hw/net/e1000e.c:213
>   #10 0x572a3c34 in net_tx_pkt_sendv (pkt=0x63128800, 
> nc=0x6140a060, iov=0x61960e80, iov_cnt=4) at hw/net/net_tx_pkt.c:556
>   #11 0x572a23e2 in net_tx_pkt_send (pkt=0x63128800, 
> nc=0x6140a060) at hw/net/net_tx_pkt.c:633
>   #12 0x572a4c67 in net_tx_pkt_send_loopback (pkt=0x63128800, 
> nc=0x6140a060) at hw/net/net_tx_pkt.c:646
>   #13 0x57b70b05 in e1000e_tx_pkt_send (core=0x7fffddd3f4e0, 
> tx=0x7fffddd5f748, queue_index=0) at hw/net/e1000e_core.c:664
>   #14 0x57b6eab8 in e1000e_process_tx_desc (core=0x7fffddd3f4e0, 
> tx=0x7fffddd5f748, dp=0x7fff8680, queue_index=0) at 
> hw/net/e1000e_core.c:743
>   #15 0x57b6d65d in e1000e_start_xmit (core=0x7fffddd3f4e0, 
> txr=0x7fff88a0) at hw/net/e1000e_core.c:934
>   #16 0x57b5ea38 in e1000e_set_tctl (core=0x7fffddd3f4e0, index=256, 
> val=255) at hw/net/e1000e_core.c:2431
>   #17 0x57b369ef in e1000e_core_write (core=0x7fffddd3f4e0, 
> addr=1027, val=255, size=4) at hw/net/e1000e_core.c:3265
>   #18 0x576de3be in e1000e_mmio_write (opaque=0x7fffddd3c800, 
> addr=1027, val=255, size=4) at hw/net/e1000e.c:109
>   #19 0x58e6b789 in memory_region_write_accessor (mr=0x7fffddd3f110, 
> addr=1027, value=0x7fff8eb0, size=4, shift=0, mask=4294967295, attrs=...) 
> at softmmu/memory.c:483
>   #20 0x58e6b05b in access_with_adjusted_size (addr=1027, 
> value=0x7fff8eb0, size=1, access_size_min=4, access_size_max=4, 
> access_fn=0x58e6b120 <memory_region_write_accessor>, mr=0x7fffddd3f110, 
> attrs=...) at softmmu/memory.c:544
>   #21 0x58e69776 in memory_region_dispatch_write (mr=0x7fffddd3f110, 
> addr=1027, data=255, op=MO_8, attrs=...) at softmmu/memory.c:1465
>   #22 0x58f60462 in flatview_write_continue (fv=0x6063f9e0, 
> addr=3775005699, attrs=..., ptr=0x602e3710, len=1, addr1=1027, l=1, 
> mr=0x7fffddd3f110) at exec.c:3176
>   #23 0x58f4e38b in flatview_write (fv=0x6063f9e0, 
> addr=3775005699, attrs=..., buf=0x602e3710, len=1) at exec.c:3220
>   #24 0x58f4dd4f in address_space_write (as=0x6080baa0, 
> addr=3775005699, attrs=..., buf=0x602e3710, len=1) at exec.c:3315
>   #25 0x5916b3e0 in qtest_process_command (chr=0x5c03f300 
> , words=0x60458150) at softmmu/qtest.c:567
>   #26 0x5915f7f2 in qtest_process_inbuf (chr=0x5c03f300 
> , inbuf=0x619200e0) at softmmu/qtest.c:710
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  include/sysemu/dma.h | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
> index 8a7dbf0b0f3..a4ba9438a56 100644
> --- a/include/sysemu/dma.h
> +++ b/include/sysemu/dma.h
> @@ -108,6 +108,8 @@ static inline int dma_memory_rw(AddressSpace *as, 
> dma_addr_t addr,
>  

Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Edgar E. Iglesias
On Thu, Sep 03, 2020 at 05:46:39PM +0200, Paolo Bonzini wrote:
> On 03/09/20 16:24, Edgar E. Iglesias wrote:
> >> [*] I do wonder about hardware-device-passthrough setups; I
> >> don't think I would care to pass through an arbitrary device
> >> to an untrusted guest...
> > Hmm, I guess it would make sense to have a configurable option in KVM
> > to isolate passthrough devices so they only can DMA to guest RAM...
> 
> Passthrough devices are always protected by the IOMMU, anything else
> would be obviously insane^H^H^Hecure. :)


Really? To always do that blindly seems wrong.

I'm refering to the passthrough device not being able to reach registers
of other passthrough devices within the same guest.

Obviously the IOMMU should be set up so that passthrough devices don't reach
other guests or the host.

Cheers,
Edgar



Re: [PATCH] qemu-iotests: move check-block back to Makefiles

2020-09-03 Thread Lukáš Doktor
Dne 02. 09. 20 v 12:21 Thomas Huth napsal(a):
> On 02/09/2020 10.37, Paolo Bonzini wrote:
>> On 02/09/20 10:19, Daniel P. Berrangé wrote:
>>> On Wed, Sep 02, 2020 at 04:00:46AM -0400, Paolo Bonzini wrote:
 check-block has its own test harness, unlike every other test.  If
 we capture its output, as is in general nicer to do without V=1,
 there will be no sign of progress.  So for lack of a better option
 just move the invocation of the test back to Makefile rules.
>>>
>>> I expect the correct long term solution here is to stop using the
>>> check-block.sh script.  Instead have code which sets up each
>>> of the I/O tests as an explicit test target in meson. We could
>>> use meson's test grouping features too.
>>
>> I'm not sure, "check-acceptance" will never be integrated in Meson, and
>> it may well be the same for "check-block".  Actually I wonder if Avocado
>> would be a better check-block.sh than check-block.sh.
> 
> Wasn't there even some support for the iotests in avocado (or
> avocado-vt) at one point in time? ... not sure anymore, Cleber, Wainer,
> do you remember?
> 
>  Thomas
> 

Avocado supports so called "external runner", which allows to specify a command 
to be executed and arguments as different variants so it was just a matter of 
coming up with the list of "./check" invocation like "./check -qcow2 -nbd 001". 
All of these were executed as a separate test and reported PASS/FAIL. Actually 
I created a wrapper, which also checked the output of the "./check" and allowed 
the test to result in "WARN" which I then used for reporting skipped tests.

Cleber, is there a better way of running qemu-iotests? I know Drew had a plan 
on adding kvm-unit-test runner which would support kvm-unit-tests out of the 
box including the proper setup, but I'm not aware of any such initiative for 
qemu-iotests.

Regards,
Lukáš

PS: Now I remembered that I actually contributed the kvm-unit-test wrapper 
upstream, it's slightly outdated compare to the downstream version but could 
work as an example (if you're interested I can try to synchronize it with the 
current downstream implementation): 
https://github.com/avocado-framework/avocado/blob/master/contrib/testsuites/run-kvm-unit-test.sh



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Paolo Bonzini
On 03/09/20 16:24, Edgar E. Iglesias wrote:
>> [*] I do wonder about hardware-device-passthrough setups; I
>> don't think I would care to pass through an arbitrary device
>> to an untrusted guest...
> Hmm, I guess it would make sense to have a configurable option in KVM
> to isolate passthrough devices so they only can DMA to guest RAM...

Passthrough devices are always protected by the IOMMU, anything else
would be obviously insane^H^H^Hecure. :)

Paolo




Re: flatview_write_continue global mutex deadlock

2020-09-03 Thread Paolo Bonzini
On 03/09/20 17:42, Vladimir Sementsov-Ogievskiy wrote:
> 
> 
> so no timer exists now: reconnect code goes to yield during drain, to
> continue after drain-end.. Haha, that's obviously bad design, as nobody
> will wake up the waiting requests, and drain will hang forever. OK
> thanks, you helped me, I see now that nbd code is wrong..
> 
> But still, is it OK to do blk_drain holding the global mutex? Drain may
> take a relatively long time, and vm is not responding due to global
> mutex locked in cpu thread..

It has been like that forever, and it's a major reason to use iothreads
even if you don't care about performance.

Paolo




Re: [PATCH] qcow2: Use macros for the L1, refcount and bitmap table entry sizes

2020-09-03 Thread Max Reitz
On 28.08.20 13:08, Alberto Garcia wrote:
> This patch replaces instances of sizeof(uint64_t) in the qcow2 driver
> with macros that indicate what those sizes are actually referring to.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2.h  |  6 +++
>  block/qcow2-bitmap.c   | 11 --
>  block/qcow2-cluster.c  | 24 ++--
>  block/qcow2-refcount.c | 89 ++
>  block/qcow2-snapshot.c | 20 +-
>  block/qcow2.c  | 27 ++---
>  6 files changed, 94 insertions(+), 83 deletions(-)

Thanks, applied to my block branch:

https://git.xanclic.moe/XanClic/qemu/commits/branch/block



signature.asc
Description: OpenPGP digital signature


Re: flatview_write_continue global mutex deadlock

2020-09-03 Thread Vladimir Sementsov-Ogievskiy

03.09.2020 15:29, Paolo Bonzini wrote:

On 03/09/20 13:16, Vladimir Sementsov-Ogievskiy wrote:

(gdb) info thr
   Id   Target Id    Frame
* 1    Thread 0x7fb9f0f39e00 (LWP 215115) "qemu-system-x86"
0x7fb9d784f54d in __lll_lock_wait () from /lib64/libpthread.so.0
[...]
#1  0x56069bfbd3f1 in qemu_poll_ns (fds=0x7fb9401dcdf0, nfds=1,
timeout=542076652475) at ../util/qemu-timer.c:347
#2  0x56069bfd949f in fdmon_poll_wait (ctx=0x56069e6864c0,
ready_list=0x7fb9481fc200, timeout=542076652475) at ../util/fdmon-poll.c:79
#3  0x56069bfcdf4c in aio_poll (ctx=0x56069e6864c0, blocking=true)
at ../util/aio-posix.c:601
#4  0x56069be80cf3 in bdrv_do_drained_begin (bs=0x56069e6c0950,
recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at
../block/io.c:427
#5  0x56069be80ddb in bdrv_drained_begin (bs=0x56069e6c0950) at
../block/io.c:433
#6  0x56069bf1e5b4 in blk_drain (blk=0x56069e6adb50) at
../block/block-backend.c:1718
#7  0x56069ba40fb5 in ide_cancel_dma_sync (s=0x56069f0d1548) at
../hw/ide/core.c:723
[...]
#13 0x56069bd965e2 in flatview_write_continue (fv=0x7fb9401ce100,
addr=49152, attrs=..., ptr=0x7fb9f0f87000, len=1, addr1=0, l=1,
mr=0x56069f0d2420) at ../exec.c:3176


So this is a vCPU thread.  The question is, why is the reconnect timer
not on the same AioContext?  If it were, aio_poll would execute it.

Paolo



(gdb) fr 4
#4  0x564cdffabcf3 in bdrv_do_drained_begin (bs=0x564ce2112950, 
recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at 
../block/io.c:427
427 BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, 
parent));
(gdb) p bs->aio_context
$2 = (AioContext *) 0x564ce20d84c0
(gdb) p bs->drv
$3 = (BlockDriver *) 0x564ce088bb60 
(gdb) set $s=(BDRVNBDState *)bs->opaque
(gdb) p $s->connection_co
connection_co connection_co_sleep_ns_state
(gdb) p $s->connection_co_sleep_ns_state
$4 = (QemuCoSleepState *) 0x0
(gdb) p $s->state
$5 = NBD_CLIENT_CONNECTING_WAIT
(gdb) p $s->connection_co
$6 = (Coroutine *) 0x564ce2118880

...

(gdb) qemu coroutine $6
#0  0x564ce00f402b in qemu_coroutine_switch (from_=0x564ce2118880, 
to_=0x7f8dd2fff598, action=COROUTINE_YIELD) at ../util/coroutine-ucontext.c:302
#1  0x564ce00c2b1a in qemu_coroutine_yield () at 
../util/qemu-coroutine.c:193
#2  0x564cdffc4e50 in nbd_co_reconnect_loop (s=0x564ce21180e0) at 
../block/nbd.c:352
#3  0x564cdffc4f13 in nbd_connection_entry (opaque=0x564ce21180e0) at 
../block/nbd.c:386
#4  0x564ce00f3d33 in coroutine_trampoline (i0=-502167424, i1=22092) at 
../util/coroutine-ucontext.c:173
#5  0x7f8e621cc190 in ?? () from /lib64/libc.so.6
#6  0x7ffcb9b51540 in ?? ()
#7  0x in ?? ()

so no timer exists now: reconnect code goes to yield during drain, to continue 
after drain-end.. Haha, that's obviously bad design, as nobody will wake up the 
waiting requests, and drain will hang forever. OK thanks, you helped me, I see 
now that nbd code is wrong..

But still, is it OK to do blk_drain holding the global mutex? Drain may take a 
relatively long time, and vm is not responding due to global mutex locked in 
cpu thread..

--
Best regards,
Vladimir



[PATCH v6 8/8] block/file: switch to use qemu_open/qemu_create for improved errors

2020-09-03 Thread Daniel P . Berrangé
Currently at startup if using cache=none on a filesystem lacking
O_DIRECT such as tmpfs, at startup QEMU prints

qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: file system may not 
support O_DIRECT
qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Could not open 
'/tmp/foo.img': Invalid argument

while at QMP level the hint is missing, so QEMU reports just

  "error": {
  "class": "GenericError",
  "desc": "Could not open '/tmp/foo.img': Invalid argument"
  }

which is close to useless for the end user trying to figure out what
they did wrong.

With this change at startup QEMU prints

qemu-system-x86_64: -drive file=/tmp/foo.img,cache=none: Unable to open 
'/tmp/foo.img': filesystem does not support O_DIRECT

while at the QMP level QEMU reports a massively more informative

  "error": {
 "class": "GenericError",
 "desc": "Unable to open '/tmp/foo.img': filesystem does not support 
O_DIRECT"
  }

Reviewed-by: Eric Blake 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Markus Armbruster 
Signed-off-by: Daniel P. Berrangé 
---
 block/file-posix.c | 18 +++---
 block/file-win32.c |  6 ++
 2 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index bac2566f10..c63926d592 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -630,11 +630,10 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 raw_parse_flags(bdrv_flags, >open_flags, false);
 
 s->fd = -1;
-fd = qemu_open_old(filename, s->open_flags, 0644);
+fd = qemu_open(filename, s->open_flags, errp);
 ret = fd < 0 ? -errno : 0;
 
 if (ret < 0) {
-error_setg_file_open(errp, -ret, filename);
 if (ret == -EROFS) {
 ret = -EACCES;
 }
@@ -1032,15 +1031,13 @@ static int raw_reconfigure_getfd(BlockDriverState *bs, 
int flags,
 }
 }
 
-/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open_old() */
+/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
 if (fd == -1) {
 const char *normalized_filename = bs->filename;
 ret = raw_normalize_devicepath(_filename, errp);
 if (ret >= 0) {
-assert(!(*open_flags & O_CREAT));
-fd = qemu_open_old(normalized_filename, *open_flags);
+fd = qemu_open(normalized_filename, *open_flags, errp);
 if (fd == -1) {
-error_setg_errno(errp, errno, "Could not reopen file");
 return -1;
 }
 }
@@ -2411,10 +2408,9 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 }
 
 /* Create file */
-fd = qemu_open_old(file_opts->filename, O_RDWR | O_CREAT | O_BINARY, 0644);
+fd = qemu_create(file_opts->filename, O_RDWR | O_BINARY, 0644, errp);
 if (fd < 0) {
 result = -errno;
-error_setg_errno(errp, -result, "Could not create file");
 goto out;
 }
 
@@ -3335,7 +3331,7 @@ static bool setup_cdrom(char *bsd_path, Error **errp)
 for (index = 0; index < num_of_test_partitions; index++) {
 snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path,
  index);
-fd = qemu_open_old(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE);
+fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE, 
NULL);
 if (fd >= 0) {
 partition_found = true;
 qemu_close(fd);
@@ -3653,7 +3649,7 @@ static int cdrom_probe_device(const char *filename)
 int prio = 0;
 struct stat st;
 
-fd = qemu_open_old(filename, O_RDONLY | O_NONBLOCK);
+fd = qemu_open(filename, O_RDONLY | O_NONBLOCK, NULL);
 if (fd < 0) {
 goto out;
 }
@@ -3787,7 +3783,7 @@ static int cdrom_reopen(BlockDriverState *bs)
  */
 if (s->fd >= 0)
 qemu_close(s->fd);
-fd = qemu_open_old(bs->filename, s->open_flags, 0644);
+fd = qemu_open(bs->filename, s->open_flags, NULL);
 if (fd < 0) {
 s->fd = -1;
 return -EIO;
diff --git a/block/file-win32.c b/block/file-win32.c
index 8c1845830e..1a31f8a5ba 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -576,11 +576,9 @@ static int raw_co_create(BlockdevCreateOptions *options, 
Error **errp)
 return -EINVAL;
 }
 
-fd = qemu_open_old(file_opts->filename,
-   O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
-   0644);
+fd = qemu_create(file_opts->filename, O_WRONLY | O_TRUNC | O_BINARY,
+ 0644, errp);
 if (fd < 0) {
-error_setg_errno(errp, errno, "Could not create file");
 return -EIO;
 }
 set_sparse(fd);
-- 
2.26.2




[PATCH v6 4/8] util: refactor qemu_open_old to split off variadic args handling

2020-09-03 Thread Daniel P . Berrangé
This simple refactoring prepares for future patches. The variadic args
handling is split from the main bulk of the open logic.

Reviewed-by: Markus Armbruster 
Signed-off-by: Daniel P. Berrangé 
---
 util/osdep.c | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/util/osdep.c b/util/osdep.c
index 7504c156e8..11531e8f59 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -296,10 +296,10 @@ static int qemu_open_cloexec(const char *name, int flags, 
mode_t mode)
 /*
  * Opens a file with FD_CLOEXEC set
  */
-int qemu_open_old(const char *name, int flags, ...)
+static int
+qemu_open_internal(const char *name, int flags, mode_t mode)
 {
 int ret;
-int mode = 0;
 
 #ifndef _WIN32
 const char *fdset_id_str;
@@ -324,15 +324,25 @@ int qemu_open_old(const char *name, int flags, ...)
 }
 #endif
 
-if (flags & O_CREAT) {
-va_list ap;
+ret = qemu_open_cloexec(name, flags, mode);
+
+return ret;
+}
+
 
-va_start(ap, flags);
+int qemu_open_old(const char *name, int flags, ...)
+{
+va_list ap;
+mode_t mode = 0;
+int ret;
+
+va_start(ap, flags);
+if (flags & O_CREAT) {
 mode = va_arg(ap, int);
-va_end(ap);
 }
+va_end(ap);
 
-ret = qemu_open_cloexec(name, flags, mode);
+ret = qemu_open_internal(name, flags, mode);
 
 #ifdef O_DIRECT
 if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
-- 
2.26.2




[PATCH v6 3/8] util: rename qemu_open() to qemu_open_old()

2020-09-03 Thread Daniel P . Berrangé
We want to introduce a new version of qemu_open() that uses an Error
object for reporting problems and make this it the preferred interface.
Rename the existing method to release the namespace for the new impl.

Reviewed-by: Eric Blake 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Markus Armbruster 
Signed-off-by: Daniel P. Berrangé 
---
 accel/kvm/kvm-all.c|  2 +-
 backends/rng-random.c  |  2 +-
 backends/tpm/tpm_passthrough.c |  8 
 block/file-posix.c | 14 +++---
 block/file-win32.c |  5 +++--
 block/vvfat.c  |  5 +++--
 chardev/char-fd.c  |  2 +-
 chardev/char-pipe.c|  6 +++---
 chardev/char.c |  2 +-
 dump/dump.c|  2 +-
 hw/s390x/s390-skeys.c  |  2 +-
 hw/usb/host-libusb.c   |  2 +-
 hw/usb/u2f-passthru.c  |  4 ++--
 hw/vfio/common.c   |  4 ++--
 include/qemu/osdep.h   |  2 +-
 io/channel-file.c  |  2 +-
 net/vhost-vdpa.c   |  2 +-
 os-posix.c |  2 +-
 qga/channel-posix.c|  4 ++--
 qga/commands-posix.c   |  6 +++---
 target/arm/kvm.c   |  2 +-
 ui/console.c   |  2 +-
 util/osdep.c   |  2 +-
 util/oslib-posix.c |  2 +-
 24 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 63ef6af9a1..ad8b315b35 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -2013,7 +2013,7 @@ static int kvm_init(MachineState *ms)
 #endif
 QLIST_INIT(>kvm_parked_vcpus);
 s->vmfd = -1;
-s->fd = qemu_open("/dev/kvm", O_RDWR);
+s->fd = qemu_open_old("/dev/kvm", O_RDWR);
 if (s->fd == -1) {
 fprintf(stderr, "Could not access KVM kernel module: %m\n");
 ret = -errno;
diff --git a/backends/rng-random.c b/backends/rng-random.c
index 32998d8ee7..245b12ab24 100644
--- a/backends/rng-random.c
+++ b/backends/rng-random.c
@@ -75,7 +75,7 @@ static void rng_random_opened(RngBackend *b, Error **errp)
 error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"filename", "a valid filename");
 } else {
-s->fd = qemu_open(s->filename, O_RDONLY | O_NONBLOCK);
+s->fd = qemu_open_old(s->filename, O_RDONLY | O_NONBLOCK);
 if (s->fd == -1) {
 error_setg_file_open(errp, errno, s->filename);
 }
diff --git a/backends/tpm/tpm_passthrough.c b/backends/tpm/tpm_passthrough.c
index 7403807ec4..81e2d8f531 100644
--- a/backends/tpm/tpm_passthrough.c
+++ b/backends/tpm/tpm_passthrough.c
@@ -217,7 +217,7 @@ static int 
tpm_passthrough_open_sysfs_cancel(TPMPassthruState *tpm_pt)
 char path[PATH_MAX];
 
 if (tpm_pt->options->cancel_path) {
-fd = qemu_open(tpm_pt->options->cancel_path, O_WRONLY);
+fd = qemu_open_old(tpm_pt->options->cancel_path, O_WRONLY);
 if (fd < 0) {
 error_report("tpm_passthrough: Could not open TPM cancel path: %s",
  strerror(errno));
@@ -235,11 +235,11 @@ static int 
tpm_passthrough_open_sysfs_cancel(TPMPassthruState *tpm_pt)
 dev++;
 if (snprintf(path, sizeof(path), "/sys/class/tpm/%s/device/cancel",
  dev) < sizeof(path)) {
-fd = qemu_open(path, O_WRONLY);
+fd = qemu_open_old(path, O_WRONLY);
 if (fd < 0) {
 if (snprintf(path, sizeof(path), 
"/sys/class/misc/%s/device/cancel",
  dev) < sizeof(path)) {
-fd = qemu_open(path, O_WRONLY);
+fd = qemu_open_old(path, O_WRONLY);
 }
 }
 }
@@ -271,7 +271,7 @@ tpm_passthrough_handle_device_opts(TPMPassthruState 
*tpm_pt, QemuOpts *opts)
 }
 
 tpm_pt->tpm_dev = value ? value : TPM_PASSTHROUGH_DEFAULT_DEVICE;
-tpm_pt->tpm_fd = qemu_open(tpm_pt->tpm_dev, O_RDWR);
+tpm_pt->tpm_fd = qemu_open_old(tpm_pt->tpm_dev, O_RDWR);
 if (tpm_pt->tpm_fd < 0) {
 error_report("Cannot access TPM device using '%s': %s",
  tpm_pt->tpm_dev, strerror(errno));
diff --git a/block/file-posix.c b/block/file-posix.c
index 9a00d4190a..bac2566f10 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -630,7 +630,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 raw_parse_flags(bdrv_flags, >open_flags, false);
 
 s->fd = -1;
-fd = qemu_open(filename, s->open_flags, 0644);
+fd = qemu_open_old(filename, s->open_flags, 0644);
 ret = fd < 0 ? -errno : 0;
 
 if (ret < 0) {
@@ -1032,13 +1032,13 @@ static int raw_reconfigure_getfd(BlockDriverState *bs, 
int flags,
 }
 }
 
-/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
+/* If we cannot use fcntl, or fcntl failed, fall back to qemu_open_old() */
 if (fd == -1) {
 const char *normalized_filename = bs->filename;
 ret = raw_normalize_devicepath(_filename, errp);
  

[PATCH v6 7/8] util: give a specific error message when O_DIRECT doesn't work

2020-09-03 Thread Daniel P . Berrangé
A common error scenario is to tell QEMU to use O_DIRECT in combination
with a filesystem that doesn't support it. To aid users to diagnosing
their mistake we want to provide a clear error message when this happens.

Reviewed-by: Markus Armbruster 
Reviewed-by: Eric Blake 
Signed-off-by: Daniel P. Berrangé 
---
 util/osdep.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/util/osdep.c b/util/osdep.c
index c99f1e7db2..8ea7a807c1 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -332,11 +332,24 @@ qemu_open_internal(const char *name, int flags, mode_t 
mode, Error **errp)
 
 if (ret == -1) {
 const char *action = flags & O_CREAT ? "create" : "open";
+#ifdef O_DIRECT
+/* Give more helpful error message for O_DIRECT */
+if (errno == EINVAL && (flags & O_DIRECT)) {
+ret = open(name, flags & ~O_DIRECT, mode);
+if (ret != -1) {
+close(ret);
+error_setg(errp, "Could not %s '%s': "
+   "filesystem does not support O_DIRECT",
+   action, name);
+errno = EINVAL; /* restore first open()'s errno */
+return -1;
+}
+}
+#endif /* O_DIRECT */
 error_setg_errno(errp, errno, "Could not %s '%s'",
  action, name);
 }
 
-
 return ret;
 }
 
-- 
2.26.2




[PATCH v6 1/8] monitor: simplify functions for getting a dup'd fdset entry

2020-09-03 Thread Daniel P . Berrangé
Currently code has to call monitor_fdset_get_fd, then dup
the return fd, and then add the duplicate FD back into the
fdset. This dance is overly verbose for the caller and
introduces extra failure modes which can be avoided by
folding all the logic into monitor_fdset_dup_fd_add and
removing monitor_fdset_get_fd entirely.

Signed-off-by: Daniel P. Berrangé 
---
 include/monitor/monitor.h |  3 +-
 include/qemu/osdep.h  |  1 +
 monitor/misc.c| 58 +--
 stubs/fdset.c |  8 ++
 util/osdep.c  | 19 ++---
 5 files changed, 32 insertions(+), 57 deletions(-)

diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 1018d754a6..c0170773d4 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -43,8 +43,7 @@ int monitor_read_password(MonitorHMP *mon, ReadLineFunc 
*readline_func,
 AddfdInfo *monitor_fdset_add_fd(int fd, bool has_fdset_id, int64_t fdset_id,
 bool has_opaque, const char *opaque,
 Error **errp);
-int monitor_fdset_get_fd(int64_t fdset_id, int flags);
-int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd);
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags);
 void monitor_fdset_dup_fd_remove(int dup_fd);
 int64_t monitor_fdset_dup_fd_find(int dup_fd);
 
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 412962d91a..66ee5bc45d 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -501,6 +501,7 @@ int qemu_open(const char *name, int flags, ...);
 int qemu_close(int fd);
 int qemu_unlink(const char *name);
 #ifndef _WIN32
+int qemu_dup_flags(int fd, int flags);
 int qemu_dup(int fd);
 #endif
 int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
diff --git a/monitor/misc.c b/monitor/misc.c
index e847b58a8c..0b1b9b196c 100644
--- a/monitor/misc.c
+++ b/monitor/misc.c
@@ -1547,69 +1547,61 @@ AddfdInfo *monitor_fdset_add_fd(int fd, bool 
has_fdset_id, int64_t fdset_id,
 return fdinfo;
 }
 
-int monitor_fdset_get_fd(int64_t fdset_id, int flags)
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 {
 #ifdef _WIN32
 return -ENOENT;
 #else
 MonFdset *mon_fdset;
-MonFdsetFd *mon_fdset_fd;
-int mon_fd_flags;
-int ret;
 
 qemu_mutex_lock(_fdsets_lock);
 QLIST_FOREACH(mon_fdset, _fdsets, next) {
+MonFdsetFd *mon_fdset_fd;
+MonFdsetFd *mon_fdset_fd_dup;
+int fd = -1;
+int dup_fd;
+int mon_fd_flags;
+
 if (mon_fdset->id != fdset_id) {
 continue;
 }
+
 QLIST_FOREACH(mon_fdset_fd, _fdset->fds, next) {
 mon_fd_flags = fcntl(mon_fdset_fd->fd, F_GETFL);
 if (mon_fd_flags == -1) {
-ret = -errno;
-goto out;
+qemu_mutex_unlock(_fdsets_lock);
+return -1;
 }
 
 if ((flags & O_ACCMODE) == (mon_fd_flags & O_ACCMODE)) {
-ret = mon_fdset_fd->fd;
-goto out;
+fd = mon_fdset_fd->fd;
+break;
 }
 }
-ret = -EACCES;
-goto out;
-}
-ret = -ENOENT;
-
-out:
-qemu_mutex_unlock(_fdsets_lock);
-return ret;
-#endif
-}
-
-int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd)
-{
-MonFdset *mon_fdset;
-MonFdsetFd *mon_fdset_fd_dup;
 
-qemu_mutex_lock(_fdsets_lock);
-QLIST_FOREACH(mon_fdset, _fdsets, next) {
-if (mon_fdset->id != fdset_id) {
-continue;
+if (fd == -1) {
+qemu_mutex_unlock(_fdsets_lock);
+errno = EACCES;
+return -1;
 }
-QLIST_FOREACH(mon_fdset_fd_dup, _fdset->dup_fds, next) {
-if (mon_fdset_fd_dup->fd == dup_fd) {
-goto err;
-}
+
+dup_fd = qemu_dup_flags(fd, flags);
+if (dup_fd == -1) {
+qemu_mutex_unlock(_fdsets_lock);
+return -1;
 }
+
 mon_fdset_fd_dup = g_malloc0(sizeof(*mon_fdset_fd_dup));
 mon_fdset_fd_dup->fd = dup_fd;
 QLIST_INSERT_HEAD(_fdset->dup_fds, mon_fdset_fd_dup, next);
 qemu_mutex_unlock(_fdsets_lock);
-return 0;
+return dup_fd;
 }
 
-err:
 qemu_mutex_unlock(_fdsets_lock);
+errno = ENOENT;
 return -1;
+#endif
 }
 
 static int64_t monitor_fdset_dup_fd_find_remove(int dup_fd, bool remove)
diff --git a/stubs/fdset.c b/stubs/fdset.c
index 67dd5e1d34..56b3663d58 100644
--- a/stubs/fdset.c
+++ b/stubs/fdset.c
@@ -1,8 +1,9 @@
 #include "qemu/osdep.h"
 #include "monitor/monitor.h"
 
-int monitor_fdset_dup_fd_add(int64_t fdset_id, int dup_fd)
+int monitor_fdset_dup_fd_add(int64_t fdset_id, int flags)
 {
+errno = ENOSYS;
 return -1;
 }
 
@@ -11,11 +12,6 @@ int64_t monitor_fdset_dup_fd_find(int dup_fd)
 return -1;
 }
 
-int monitor_fdset_get_fd(int64_t fdset_id, int flags)
-{
-return -ENOENT;
-}
-
 

[PATCH v6 2/8] util: split off a helper for dealing with O_CLOEXEC flag

2020-09-03 Thread Daniel P . Berrangé
We're going to have multiple callers to open() from qemu_open()
soon. Readability would thus benefit from having a helper for
dealing with O_CLOEXEC.

Reviewed-by: Markus Armbruster 
Signed-off-by: Daniel P. Berrangé 
---
 util/osdep.c | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/util/osdep.c b/util/osdep.c
index 3d94b4d732..0d8fa9f016 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -279,6 +279,20 @@ int qemu_lock_fd_test(int fd, int64_t start, int64_t len, 
bool exclusive)
 }
 #endif
 
+static int qemu_open_cloexec(const char *name, int flags, mode_t mode)
+{
+int ret;
+#ifdef O_CLOEXEC
+ret = open(name, flags | O_CLOEXEC, mode);
+#else
+ret = open(name, flags, mode);
+if (ret >= 0) {
+qemu_set_cloexec(ret);
+}
+#endif
+return ret;
+}
+
 /*
  * Opens a file with FD_CLOEXEC set
  */
@@ -318,14 +332,7 @@ int qemu_open(const char *name, int flags, ...)
 va_end(ap);
 }
 
-#ifdef O_CLOEXEC
-ret = open(name, flags | O_CLOEXEC, mode);
-#else
-ret = open(name, flags, mode);
-if (ret >= 0) {
-qemu_set_cloexec(ret);
-}
-#endif
+ret = qemu_open_cloexec(name, flags, mode);
 
 #ifdef O_DIRECT
 if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
-- 
2.26.2




[PATCH v6 5/8] util: add Error object for qemu_open_internal error reporting

2020-09-03 Thread Daniel P . Berrangé
Instead of relying on the limited information from errno, we can now
also provide detailed error messages to callers that ask for it.

Reviewed-by: Markus Armbruster 
Signed-off-by: Daniel P. Berrangé 
---
 util/osdep.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/util/osdep.c b/util/osdep.c
index 11531e8f59..28aa89adc9 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu/osdep.h"
+#include "qapi/error.h"
 
 /* Needed early for CONFIG_BSD etc. */
 
@@ -297,7 +298,7 @@ static int qemu_open_cloexec(const char *name, int flags, 
mode_t mode)
  * Opens a file with FD_CLOEXEC set
  */
 static int
-qemu_open_internal(const char *name, int flags, mode_t mode)
+qemu_open_internal(const char *name, int flags, mode_t mode, Error **errp)
 {
 int ret;
 
@@ -311,12 +312,15 @@ qemu_open_internal(const char *name, int flags, mode_t 
mode)
 
 fdset_id = qemu_parse_fdset(fdset_id_str);
 if (fdset_id == -1) {
+error_setg(errp, "Could not parse fdset %s", name);
 errno = EINVAL;
 return -1;
 }
 
 dupfd = monitor_fdset_dup_fd_add(fdset_id, flags);
 if (dupfd == -1) {
+error_setg_errno(errp, errno, "Could not dup FD for %s flags %x",
+ name, flags);
 return -1;
 }
 
@@ -326,6 +330,13 @@ qemu_open_internal(const char *name, int flags, mode_t 
mode)
 
 ret = qemu_open_cloexec(name, flags, mode);
 
+if (ret == -1) {
+const char *action = flags & O_CREAT ? "create" : "open";
+error_setg_errno(errp, errno, "Could not %s '%s'",
+ action, name);
+}
+
+
 return ret;
 }
 
@@ -342,7 +353,7 @@ int qemu_open_old(const char *name, int flags, ...)
 }
 va_end(ap);
 
-ret = qemu_open_internal(name, flags, mode);
+ret = qemu_open_internal(name, flags, mode, NULL);
 
 #ifdef O_DIRECT
 if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
-- 
2.26.2




[PATCH v6 6/8] util: introduce qemu_open and qemu_create with error reporting

2020-09-03 Thread Daniel P . Berrangé
qemu_open_old() works like open(): set errno and return -1 on failure.
It has even more failure modes, though.  Reporting the error clearly
to users is basically impossible for many of them.

Our standard cure for "errno is too coarse" is the Error object.
Introduce two new helper methods:

  int qemu_open(const char *name, int flags, Error **errp);
  int qemu_create(const char *name, int flags, mode_t mode, Error **errp);

Note that with this design we no longer require or even accept the
O_CREAT flag. Avoiding overloading the two distinct operations
means we can avoid variable arguments which would prevent 'errp' from
being the last argument. It also gives us a guarantee that the 'mode' is
given when creating files, avoiding a latent security bug.

Reviewed-by: Markus Armbruster 
Signed-off-by: Daniel P. Berrangé 
---
 include/qemu/osdep.h |  6 ++
 util/osdep.c | 16 
 2 files changed, 22 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index ae1234104c..577d9e8315 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -497,7 +497,13 @@ int qemu_madvise(void *addr, size_t len, int advice);
 int qemu_mprotect_rwx(void *addr, size_t size);
 int qemu_mprotect_none(void *addr, size_t size);
 
+/*
+ * Don't introduce new usage of this function, prefer the following
+ * qemu_open/qemu_create that take an "Error **errp"
+ */
 int qemu_open_old(const char *name, int flags, ...);
+int qemu_open(const char *name, int flags, Error **errp);
+int qemu_create(const char *name, int flags, mode_t mode, Error **errp);
 int qemu_close(int fd);
 int qemu_unlink(const char *name);
 #ifndef _WIN32
diff --git a/util/osdep.c b/util/osdep.c
index 28aa89adc9..c99f1e7db2 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -341,6 +341,22 @@ qemu_open_internal(const char *name, int flags, mode_t 
mode, Error **errp)
 }
 
 
+int qemu_open(const char *name, int flags, Error **errp)
+{
+assert(!(flags & O_CREAT));
+
+return qemu_open_internal(name, flags, 0, errp);
+}
+
+
+int qemu_create(const char *name, int flags, mode_t mode, Error **errp)
+{
+assert(!(flags & O_CREAT));
+
+return qemu_open_internal(name, flags | O_CREAT, mode, errp);
+}
+
+
 int qemu_open_old(const char *name, int flags, ...)
 {
 va_list ap;
-- 
2.26.2




[PATCH v6 0/8] block: improve error reporting for unsupported O_DIRECT

2020-09-03 Thread Daniel P . Berrangé
v1: https://lists.gnu.org/archive/html/qemu-devel/2020-07/msg00269.html
v2: https://lists.gnu.org/archive/html/qemu-devel/2020-07/msg00589.html
v3: https://lists.gnu.org/archive/html/qemu-devel/2020-07/msg07098.html
v4: https://lists.gnu.org/archive/html/qemu-devel/2020-08/msg05334.html
v5: https://lists.gnu.org/archive/html/qemu-devel/2020-09/msg00947.html

See patch commit messages for rationale

Ideally we would convert other callers of qemu_open_old to use
qemu_open, and eventually remove qemu_open_old entirely, so every
caller gets use of Error **errp.

Improved in v6:

 - Fix errno regression dup'ing FD
 - Move qapi header to correct patch
 - Fix whitespace and commit messages
 - Converted more use of qemu_open to qemu_open_old after rebase

Improved in v5:

 - Drop reporting of flags in failed open call
 - Split O_CLOEXEC handling off into separate helper
 - Refactor monitor FD set APIs to simplify their use

Improved in v4:

 - Use assert() for programmer mistakes
 - Split second patch into three distinct parts
 - Misc typos
 - Improve commit message

Improved in v3:

 - Re-arrange the patches series, so that the conversion to Error
   takes place first, then the improve O_DIRECT reporting
 - Rename existing method to qemu_open_old
 - Use a pair of new methods qemu_open + qemu_create to improve
   arg checking

Improved in v2:

 - Mention that qemu_open_err is preferred over qemu_open
 - Get rid of obsolete error_report call
 - Simplify O_DIRECT handling
 - Fixup iotests for changed error message text

Daniel P. Berrangé (8):
  monitor: simplify functions for getting a dup'd fdset entry
  util: split off a helper for dealing with O_CLOEXEC flag
  util: rename qemu_open() to qemu_open_old()
  util: refactor qemu_open_old to split off variadic args handling
  util: add Error object for qemu_open_internal error reporting
  util: introduce qemu_open and qemu_create with error reporting
  util: give a specific error message when O_DIRECT doesn't work
  block/file: switch to use qemu_open/qemu_create for improved errors

 accel/kvm/kvm-all.c|   2 +-
 backends/rng-random.c  |   2 +-
 backends/tpm/tpm_passthrough.c |   8 +--
 block/file-posix.c |  16 ++---
 block/file-win32.c |   5 +-
 block/vvfat.c  |   5 +-
 chardev/char-fd.c  |   2 +-
 chardev/char-pipe.c|   6 +-
 chardev/char.c |   2 +-
 dump/dump.c|   2 +-
 hw/s390x/s390-skeys.c  |   2 +-
 hw/usb/host-libusb.c   |   2 +-
 hw/usb/u2f-passthru.c  |   4 +-
 hw/vfio/common.c   |   4 +-
 include/monitor/monitor.h  |   3 +-
 include/qemu/osdep.h   |   9 ++-
 io/channel-file.c  |   2 +-
 monitor/misc.c |  58 --
 net/vhost-vdpa.c   |   2 +-
 os-posix.c |   2 +-
 qga/channel-posix.c|   4 +-
 qga/commands-posix.c   |   6 +-
 stubs/fdset.c  |   8 +--
 target/arm/kvm.c   |   2 +-
 ui/console.c   |   2 +-
 util/osdep.c   | 104 +++--
 util/oslib-posix.c |   2 +-
 27 files changed, 150 insertions(+), 116 deletions(-)

-- 
2.26.2





Re: [PATCH] qemu-img: avoid unaligned read requests during convert

2020-09-03 Thread Max Reitz
On 01.09.20 14:51, Peter Lieven wrote:
> in case of large continous areas that share the same allocation status
> it happens that the value of s->sector_next_status is unaligned to the
> cluster size or even request alignment of the source. Avoid this by
> stripping down the s->sector_next_status position to cluster boundaries.
> 
> Signed-off-by: Peter Lieven 
> ---
>  qemu-img.c | 22 ++
>  1 file changed, 22 insertions(+)

Thanks, applied to my block branch:

https://git.xanclic.moe/XanClic/qemu/commits/branch/block



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v4 06/12] block: Fixes nfs on msys2/mingw

2020-09-03 Thread Thomas Huth


CC: NFS maintainer + qemu-block

On 03/09/2020 10.31, Yonggang Luo wrote:
> Signed-off-by: Yonggang Luo 
> ---
>  block/nfs.c | 26 +-
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/block/nfs.c b/block/nfs.c
> index 61a249a9fc..34b2cd5708 100644
> --- a/block/nfs.c
> +++ b/block/nfs.c
> @@ -24,7 +24,9 @@
>  
>  #include "qemu/osdep.h"
>  
> +#if !defined(_WIN32)
>  #include 
> +#endif
>  #include "qemu/config-file.h"
>  #include "qemu/error-report.h"
>  #include "qapi/error.h"
> @@ -51,6 +53,12 @@
>  #define QEMU_NFS_MAX_PAGECACHE_SIZE (8388608 / NFS_BLKSIZE)
>  #define QEMU_NFS_MAX_DEBUG_LEVEL 2
>  
> +#if defined (_WIN32)
> +#define nfs_stat __stat64
> +#else
> +#define nfs_stat stat
> +#endif
> +
>  typedef struct NFSClient {
>  struct nfs_context *context;
>  struct nfsfh *fh;
> @@ -58,7 +66,7 @@ typedef struct NFSClient {
>  bool has_zero_init;
>  AioContext *aio_context;
>  QemuMutex mutex;
> -blkcnt_t st_blocks;
> +int64_t st_size;
>  bool cache_used;
>  NFSServer *server;
>  char *path;
> @@ -70,7 +78,7 @@ typedef struct NFSRPC {
>  int ret;
>  int complete;
>  QEMUIOVector *iov;
> -struct stat *st;
> +struct nfs_stat *st;
>  Coroutine *co;
>  NFSClient *client;
>  } NFSRPC;
> @@ -419,7 +427,7 @@ static int64_t nfs_client_open(NFSClient *client, 
> BlockdevOptionsNfs *opts,
> int flags, int open_flags, Error **errp)
>  {
>  int64_t ret = -EINVAL;
> -struct stat st;
> +struct nfs_stat st;
>  char *file = NULL, *strp = NULL;
>  
>  qemu_mutex_init(&client->mutex);
> @@ -545,7 +553,7 @@ static int64_t nfs_client_open(NFSClient *client, 
> BlockdevOptionsNfs *opts,
>  }
>  
>  ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE);
> -client->st_blocks = st.st_blocks;
> +client->st_size = st.st_size;
>  client->has_zero_init = S_ISREG(st.st_mode);
>  *strp = '/';
>  goto out;
> @@ -729,11 +737,11 @@ static int64_t 
> nfs_get_allocated_file_size(BlockDriverState *bs)
>  {
>  NFSClient *client = bs->opaque;
>  NFSRPC task = {0};
> -struct stat st;
> +struct nfs_stat st;
>  
>  if (bdrv_is_read_only(bs) &&
>  !(bs->open_flags & BDRV_O_NOCACHE)) {
> -return client->st_blocks * 512;
> +return client->st_size;
>  }
>  
>  task.bs = bs;
> @@ -746,7 +754,7 @@ static int64_t 
> nfs_get_allocated_file_size(BlockDriverState *bs)
>  nfs_set_events(client);
>  BDRV_POLL_WHILE(bs, !task.complete);
>  
> -return (task.ret < 0 ? task.ret : st.st_blocks * 512);
> +return (task.ret < 0 ? task.ret : st.st_size);
>  }
>  
>  static int coroutine_fn
> @@ -778,7 +786,7 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
>BlockReopenQueue *queue, Error **errp)
>  {
>  NFSClient *client = state->bs->opaque;
> -struct stat st;
> +struct nfs_stat st;
>  int ret = 0;
>  
>  if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) {
> @@ -800,7 +808,7 @@ static int nfs_reopen_prepare(BDRVReopenState *state,
> nfs_get_error(client->context));
>  return ret;
>  }
> -client->st_blocks = st.st_blocks;
> +client->st_size = st.st_size;
>  }
>  
>  return 0;
> 




Re: [PATCH] tests/qtest/ahci: Improve error handling (NEGATIVE_RETURNS)

2020-09-03 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20200902080552.159806-1-phi...@redhat.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

Clone of 'https://git.qemu.org/git/dtc.git' into submodule path 'dtc' failed
failed to update submodule dtc
Submodule 'dtc' (https://git.qemu.org/git/dtc.git) unregistered for path 'dtc'
make[1]: *** 
[/var/tmp/patchew-tester-tmp-rwu0f67e/src/docker-src.2020-09-03-10.37.46.30048] 
Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-rwu0f67e/src'
make: *** [docker-run-test-quick@centos7] Error 2

real0m30.996s
user0m2.113s


The full log is available at
http://patchew.org/logs/20200902080552.159806-1-phi...@redhat.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Edgar E. Iglesias
On Thu, Sep 03, 2020 at 02:58:19PM +0100, Peter Maydell wrote:
> On Thu, 3 Sep 2020 at 14:37, Laszlo Ersek  wrote:
> > Peter mentions an approach at the end of
> >  that I believe
> > to understand, but -- according to him -- it seems too much work.
> 
> It also would only be effective for MMIO, not for qemu_irq lines...
> 
> > I don't think such chains work unto arbitrary depths on physical
> > hardware either.
> 
> Real hardware by and large doesn't get designed with this kind
> of DMA-to-self as a consideration either, but unfortunately it's
> not really very useful as a model to base QEMU's behaviour on:
> 
>  (1) real hardware is usually massively parallel, so the logic
>   that handles incoming MMIO is decoupled anyway from logic
>   that does outgoing DMA. (Arguably the "do all DMA in a
>   bottom-half" idea is kind of following the hardware design.)
>   Similarly simple "raise this outbound signal" logic just
>   works as an instantaneous action that causes the device on
>   the other end to change its state/do something parallel,
>   whereas for QEMU we need to actually call some code in the
>   device on the other end and so we serialize this stuff,
>   sandwiching a bit of "device B code" in the middle of a
>   run of "device A code". So a lot more of this stuff "just
>   happens to work" on h/w than we get with QEMU.
>  (2) if software running on real h/w does do something silly with
>   programming a device to DMA to itself then the worst case is
>   generally that they manage to wedge that device (or the whole
>   machine, if you're really unlucky), in which case the response
>   is "don't do that then". There isn't the same "guest code
>   can escape the VM" security boundary that QEMU needs to guard
>   against [*].
> 
> [*] I do wonder about hardware-device-passthrough setups; I
> don't think I would care to pass through an arbitrary device
> to an untrusted guest...

Hmm, I guess it would make sense to have a configurable option in KVM
to isolate passthrough devices so they only can DMA to guest RAM...

Cheers,
Edgar



Re: [PATCH v6 00/12] monitor: Optionally run handlers in coroutines

2020-09-03 Thread Markus Armbruster
Kevin Wolf  writes:

> Am 03.09.2020 um 12:49 hat Markus Armbruster geschrieben:
>> Markus Armbruster  writes:
>> 
>> > Markus Armbruster  writes:
>> >
>> >> I let this series slide to get my Error API rework done, along with much
>> >> else.  My sincere apologies!
>> >>
>> >> Unsurprisingly, it needs a rebase now.  I suggest to let me review it as
>> >> is first.
>> >
>> > I'm done with v6.  Summary:
>> >
>> > * A few trivial things to correct here and there.
>> >
>> > * A few ideas to improve things in relatively minor ways.
>> >
>> > * PATCH 03 looks "why bother" to me until PATCH 09 makes me suspect you
>> >   did the former to enable the latter.  If you had captured that in your
>> >   commit message back then, like you did for the similar PATCH 05, I
>> >   wouldn't be scratching my head now :)
>> >
>> > * I dislike PATCH 06, and would like to explore an alternative idea.
>> >
>> > * PATCH 08 makes hairy monitor code even hairier, but I don't have
>> >   better ideas.
>> >
>> > * I don't feel comfortable as a sole reviewer of the AIO magic in PATCH
>> >   10-12.  Let's ask Stefan for an eye-over.
>> >
>> > I'd like to proceed as follows.  You rebase, and address "easy" review
>> > comments (you decide what's easy).  Post as v7, cc'ing Stefan for the
>> > AIO magic and David Gilbert for HMP.  While they review (hopefully), I
>> > explore a replacement for PATCH 06.  And then we touch bases and decide
>> > how to get this thing wrapped.
>> 
>> I explored:
>> 
>> Subject: Ways to do per-coroutine properties (was: [PATCH v6 06/12] 
>> monitor: Make current monitor a per-coroutine property)
>> Date: Fri, 07 Aug 2020 15:09:19 +0200 (3 weeks, 5 days, 21 hours ago)
>> Message-ID: <87a6z6wqkg.fsf...@dusky.pond.sub.org>
>> 
>> May I have v7?  Feel free to keep your PATCH 06.  If I decide to replace
>> it, I can do it myself, possibly on top.
>
> It's one of the next things on my list. I can't promise anything more
> specific, though.

Take your time.  I've certainly taken mine and then some.




Re: [PATCH v1] sd: sdhci: assert data_count is within fifo_buffer

2020-09-03 Thread Philippe Mathieu-Daudé
On 9/3/20 9:08 AM, P J P wrote:
> From: Prasad J Pandit 
> 
> While doing multi block SDMA, transfer block size may exceed
> the 's->fifo_buffer[s->buf_maxsz]' size. It may leave the
> current element pointer 's->data_count' pointing out of bounds.
> Leading the subsequent DMA r/w operation to OOB access issue.
> Assert that 's->data_count' is within fifo_buffer.
> 
>  -> 
> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Fsdhci_oob_write1
>  ==1459837==ERROR: AddressSanitizer: heap-buffer-overflow
>  WRITE of size 54722048 at 0x6151e280 thread T3
>  #0  __interceptor_memcpy (/lib64/libasan.so.6+0x3a71d)
>  #1  flatview_read_continue ../exec.c:3245
>  #2  flatview_read ../exec.c:3278
>  #3  address_space_read_full ../exec.c:3291
>  #4  address_space_rw ../exec.c:3319
>  #5  dma_memory_rw_relaxed ../include/sysemu/dma.h:87
>  #6  dma_memory_rw ../include/sysemu/dma.h:110
>  #7  dma_memory_read ../include/sysemu/dma.h:116
>  #8  sdhci_sdma_transfer_multi_blocks ../hw/sd/sdhci.c:629
>  #9  sdhci_write ../hw/sd/sdhci.c:1097
>  #10 memory_region_write_accessor ../softmmu/memory.c:483
>  ...
> 
> Reported-by: Ruhr-University 
> Suggested-by: Philippe Mathieu-Daudé 
> Signed-off-by: Prasad J Pandit 
> ---
>  hw/sd/sdhci.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> Update v1: use assert(3) calls
>   -> https://lists.nongnu.org/archive/html/qemu-devel/2020-09/msg00966.html
> 
> diff --git a/hw/sd/sdhci.c b/hw/sd/sdhci.c
> index 1785d7e1f7..023acbed41 100644
> --- a/hw/sd/sdhci.c
> +++ b/hw/sd/sdhci.c
> @@ -604,6 +604,7 @@ static void sdhci_sdma_transfer_multi_blocks(SDHCIState 
> *s)
>  s->blkcnt--;
>  }
>  }
> +assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
> +dma_memory_write(s->dma_as, s->sdmasysad,
> + &s->fifo_buffer[begin], s->data_count - begin);
>  s->sdmasysad += s->data_count - begin;
> @@ -626,6 +627,7 @@ static void sdhci_sdma_transfer_multi_blocks(SDHCIState 
> *s)
>  s->data_count = block_size;
>  boundary_count -= block_size - begin;
>  }
> +assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
> +dma_memory_read(s->dma_as, s->sdmasysad,
> +&s->fifo_buffer[begin], s->data_count - begin);
>  s->sdmasysad += s->data_count - begin;
> 

qemu-system-i386: hw/sd/sdhci.c:632: void
sdhci_sdma_transfer_multi_blocks(SDHCIState *): Assertion `s->data_count
<= s->buf_maxsz && s->data_count > begin' failed.

Tested-by: Philippe Mathieu-Daudé 

If you don't mind I might split the assert in 2 when applying:

-assert(s->data_count <= s->buf_maxsz && s->data_count > begin);
+assert(s->data_count <= s->buf_maxsz);
+assert(s->data_count > begin);

To directly display the problem straight:

qemu-system-i386: hw/sd/sdhci.c:635: void
sdhci_sdma_transfer_multi_blocks(SDHCIState *): Assertion `s->data_count
> begin' failed.

Thanks,

Phil.



Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Peter Maydell
On Thu, 3 Sep 2020 at 14:37, Laszlo Ersek  wrote:
> Peter mentions an approach at the end of
>  that I believe
> to understand, but -- according to him -- it seems too much work.

It also would only be effective for MMIO, not for qemu_irq lines...

> I don't think such chains work unto arbitrary depths on physical
> hardware either.

Real hardware by and large doesn't get designed with this kind
of DMA-to-self as a consideration either, but unfortunately it's
not really very useful as a model to base QEMU's behaviour on:

 (1) real hardware is usually massively parallel, so the logic
  that handles incoming MMIO is decoupled anyway from logic
  that does outgoing DMA. (Arguably the "do all DMA in a
  bottom-half" idea is kind of following the hardware design.)
  Similarly simple "raise this outbound signal" logic just
  works as an instantaneous action that causes the device on
  the other end to change its state/do something parallel,
  whereas for QEMU we need to actually call some code in the
  device on the other end and so we serialize this stuff,
  sandwiching a bit of "device B code" in the middle of a
  run of "device A code". So a lot more of this stuff "just
  happens to work" on h/w than we get with QEMU.
 (2) if software running on real h/w does do something silly with
  programming a device to DMA to itself then the worst case is
  generally that they manage to wedge that device (or the whole
  machine, if you're really unlucky), in which case the response
  is "don't do that then". There isn't the same "guest code
  can escape the VM" security boundary that QEMU needs to guard
  against [*].

[*] I do wonder about hardware-device-passthrough setups; I
don't think I would care to pass through an arbitrary device
to an untrusted guest...

thanks
-- PMM



Re: [RFC qemu 0/6] mirror: implement incremental and bitmap modes

2020-09-03 Thread Max Reitz
On 03.09.20 15:36, Fabian Grünbichler wrote:
> On September 3, 2020 3:23 pm, Kevin Wolf wrote:
>> Am 03.09.2020 um 14:57 hat Max Reitz geschrieben:
>>> On 03.09.20 14:38, Kevin Wolf wrote:
 Am 03.09.2020 um 13:04 hat Max Reitz geschrieben:
> On 03.09.20 12:13, Fabian Grünbichler wrote:
>> On August 21, 2020 3:03 pm, Max Reitz wrote:
>>> On 18.02.20 11:07, Fabian Grünbichler wrote:
>> I am not sure how 
>> the S-O-B by John is supposed to enter the mix - should I just include 
>> it in the squashed patch (which would be partly authored, but 
>> not-yet-signed-off by him otherwise?)?
>
> I’m not too sure on the proceedings, actually.  I think it should be
> fine if you put his S-o-b there, as long as your patch is somehow based
> on a patch that he sent earlier with his S-o-b underneath.  But I’m not
> sure.

 Signed-off-by means that John certifies the DCO for the patch (at least
 the original version that you possibly modified), so you cannot just add
 it without asking him.
>>>
>>> But what if you take a patch from someone and heavily modify it –
>>> wouldn’t you keep the original S-o-b and explain the modifications in
>>> the commit message?
>>
>> Ah, if that patch already had a S-o-b, then yes. You keep it not only to
>> show who touched the patch, but also because your own S-o-b depends on
>> the one from the original author (you only have the rights to contribute
>> it because the original author had them and could pass them on to you).
>>
>> I thought it was based on a patch that came without S-o-b.
> 
> it is (taken from John's git, with his approval, and implicit admission 
> that S-O-B is just missing because it was a WIP branch/tree that I 
> started from). that was also the reason why I kept that patch unmodified 
> and sent my modifications as patches on-top, to make it easier for John 
> to verify that that one patch is his original WIP one and add this 
> missing S-O-B ;)

OK, I see now.  I thought the S-o-b got lost somewhere, but was present
originally.

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] qcow2: Fix removal of list members from BDRVQcow2State.cluster_allocs

2020-09-03 Thread Alberto Garcia
On Thu 03 Sep 2020 03:12:22 PM CEST, Max Reitz wrote:
> On my search for /(\.|->)nb_clusters/ to verify this, I noticed this
> comment for qcow2_alloc_cluster_offset():
>
>  * If the cluster was already allocated, m->nb_clusters is set to 0 and
>  * other fields in m are meaningless.
>
> Should that be dropped, too?

Actually the whole comment should be rewritten (the function does not
allocate just one cluster), I'll send v2 with that and the rest of your
proposed changes.

Berto



Re: [RFC qemu 0/6] mirror: implement incremental and bitmap modes

2020-09-03 Thread Kevin Wolf
Am 03.09.2020 um 15:36 hat Fabian Grünbichler geschrieben:
> On September 3, 2020 3:23 pm, Kevin Wolf wrote:
> > Am 03.09.2020 um 14:57 hat Max Reitz geschrieben:
> >> On 03.09.20 14:38, Kevin Wolf wrote:
> >> > Am 03.09.2020 um 13:04 hat Max Reitz geschrieben:
> >> >> On 03.09.20 12:13, Fabian Grünbichler wrote:
> >> >>> On August 21, 2020 3:03 pm, Max Reitz wrote:
> >>  On 18.02.20 11:07, Fabian Grünbichler wrote:
> >> >>> I am not sure how 
> >> >>> the S-O-B by John is supposed to enter the mix - should I just include 
> >> >>> it in the squashed patch (which would be partly authored, but 
> >> >>> not-yet-signed-off by him otherwise?)?
> >> >>
> >> >> I’m not too sure on the proceedings, actually.  I think it should be
> >> >> fine if you put his S-o-b there, as long as your patch is somehow based
> >> >> on a patch that he sent earlier with his S-o-b underneath.  But I’m not
> >> >> sure.
> >> > 
> >> > Signed-off-by means that John certifies the DCO for the patch (at least
> >> > the original version that you possibly modified), so you cannot just add
> >> > it without asking him.
> >> 
> >> But what if you take a patch from someone and heavily modify it –
> >> wouldn’t you keep the original S-o-b and explain the modifications in
> >> the commit message?
> > 
> > Ah, if that patch already had a S-o-b, then yes. You keep it not only to
> > show who touched the patch, but also because your own S-o-b depends on
> > the one from the original author (you only have the rights to contribute
> > it because the original author had them and could pass them on to you).
> > 
> > I thought it was based on a patch that came without S-o-b.
> 
> it is (taken from John's git, with his approval, and implicit admission 
> that S-O-B is just missing because it was a WIP branch/tree that I 
> started from). that was also the reason why I kept that patch unmodified 
> and sent my modifications as patches on-top, to make it easier for John 
> to verify that that one patch is his original WIP one and add this 
> missing S-O-B ;)

Yeah, then John should just reply to the patch and add the S-o-b.

Complications like this are why I always use 'git commit -s' and have it
already in my development branches rather than only adding it when
preparing the patch emails to send.

Kevin




Re: [PATCH] hw/ide: check null block pointer before blk_drain

2020-09-03 Thread P J P
+-- On Thu, 3 Sep 2020, Philippe Mathieu-Daudé wrote --+
| > +if (s->blk) {
| > +ide_cancel_dma_sync(s);
| > +bm->status &= ~BM_STATUS_DMAING;
| 
| If you don't clear this bit the guest might keep retrying (looping).

Oh, okay will keep it out of the if(s->blk) block.

Thank you.
--
Prasad J Pandit / Red Hat Product Security Team
8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D

Re: [RFC PATCH 00/12] hw: Forbid DMA write accesses to MMIO regions

2020-09-03 Thread Laszlo Ersek
Hi Phil,

On 09/03/20 13:08, Philippe Mathieu-Daudé wrote:
> Hi,
>
> I'm not suppose to work on this but I couldn't sleep so kept
> wondering about this problem the whole night and eventually
> woke up to write this quickly, so comments are scarce, sorry.
>
> The first part is obvious anyway, simply pass MemTxAttrs argument.
>
> The main patch is:
> "exec/memattrs: Introduce MemTxAttrs::direct_access field".
> This way we can restrict accesses to ROM/RAM by setting the
> 'direct_access' field. Illegal accesses return MEMTX_BUS_ERROR.
>
> Next patch restrict PCI DMA accesses by setting the direct_access
> field.
>
> Finally we add an assertion for any DMA write access to indirect
> memory to kill a class of bug recently found by Alexander while
> fuzzing.

I've briefly checked LP#1886362 and LP#1888606, and as much as I
understand them, they seem tricky. It's not clear how we can recognize
long chains of DMA-to-MMIO transfers, and interrupt them.

Peter mentions an approach at the end of
 that I believe
to understand, but -- according to him -- it seems too much work. And,
I'm not too familiar with the qemu memory model, so I don't have
comments on your solution.

Maybe we can have some kind of "depth counter" for such
recursive DMA-to-MMIO calls (even across multiple device models), and
put an artificial limit on them, such as 5 or 10. This could be
controlled on the QEMU command line perhaps?

I don't think such chains work unto arbitrary depths on physical
hardware either.

Sorry that I don't have any sensible comments here. I hope I didn't
misunderstand the problem at least.

Laszlo




Re: [RFC qemu 0/6] mirror: implement incremental and bitmap modes

2020-09-03 Thread Fabian Grünbichler
On September 3, 2020 3:23 pm, Kevin Wolf wrote:
> Am 03.09.2020 um 14:57 hat Max Reitz geschrieben:
>> On 03.09.20 14:38, Kevin Wolf wrote:
>> > Am 03.09.2020 um 13:04 hat Max Reitz geschrieben:
>> >> On 03.09.20 12:13, Fabian Grünbichler wrote:
>> >>> On August 21, 2020 3:03 pm, Max Reitz wrote:
>>  On 18.02.20 11:07, Fabian Grünbichler wrote:
>> >>> I am not sure how 
>> >>> the S-O-B by John is supposed to enter the mix - should I just include 
>> >>> it in the squashed patch (which would be partly authored, but 
>> >>> not-yet-signed-off by him otherwise?)?
>> >>
>> >> I’m not too sure on the proceedings, actually.  I think it should be
>> >> fine if you put his S-o-b there, as long as your patch is somehow based
>> >> on a patch that he sent earlier with his S-o-b underneath.  But I’m not
>> >> sure.
>> > 
>> > Signed-off-by means that John certifies the DCO for the patch (at least
>> > the original version that you possibly modified), so you cannot just add
>> > it without asking him.
>> 
>> But what if you take a patch from someone and heavily modify it –
>> wouldn’t you keep the original S-o-b and explain the modifications in
>> the commit message?
> 
> Ah, if that patch already had a S-o-b, then yes. You keep it not only to
> show who touched the patch, but also because your own S-o-b depends on
> the one from the original author (you only have the rights to contribute
> it because the original author had them and could pass them on to you).
> 
> I thought it was based on a patch that came without S-o-b.

it is (taken from John's git, with his approval, and implicit admission 
that S-O-B is just missing because it was a WIP branch/tree that I 
started from). that was also the reason why I kept that patch unmodified 
and sent my modifications as patches on-top, to make it easier for John 
to verify that that one patch is his original WIP one and add this 
missing S-O-B ;)




Re: [RFC qemu 0/6] mirror: implement incremental and bitmap modes

2020-09-03 Thread Kevin Wolf
Am 03.09.2020 um 14:57 hat Max Reitz geschrieben:
> On 03.09.20 14:38, Kevin Wolf wrote:
> > Am 03.09.2020 um 13:04 hat Max Reitz geschrieben:
> >> On 03.09.20 12:13, Fabian Grünbichler wrote:
> >>> On August 21, 2020 3:03 pm, Max Reitz wrote:
>  On 18.02.20 11:07, Fabian Grünbichler wrote:
> >>> I am not sure how 
> >>> the S-O-B by John is supposed to enter the mix - should I just include 
> >>> it in the squashed patch (which would be partly authored, but 
> >>> not-yet-signed-off by him otherwise?)?
> >>
> >> I’m not too sure on the proceedings, actually.  I think it should be
> >> fine if you put his S-o-b there, as long as your patch is somehow based
> >> on a patch that he sent earlier with his S-o-b underneath.  But I’m not
> >> sure.
> > 
> > Signed-off-by means that John certifies the DCO for the patch (at least
> > the original version that you possibly modified), so you cannot just add
> > it without asking him.
> 
> But what if you take a patch from someone and heavily modify it –
> wouldn’t you keep the original S-o-b and explain the modifications in
> the commit message?

Ah, if that patch already had a S-o-b, then yes. You keep it not only to
show who touched the patch, but also because your own S-o-b depends on
the one from the original author (you only have the rights to contribute
it because the original author had them and could pass them on to you).

I thought it was based on a patch that came without S-o-b.

Kevin


signature.asc
Description: PGP signature


Re: [RFC PATCH 11/12] hw/pci: Only allow PCI slave devices to write to direct memory

2020-09-03 Thread Philippe Mathieu-Daudé
On 9/3/20 2:26 PM, Paolo Bonzini wrote:
> On 03/09/20 13:08, Philippe Mathieu-Daudé wrote:
>> Do not allow PCI slaves to write to indirect memory
>> regions such MMIO.
>>
>> This fixes LP#1886362 and LP#1888606.
> 
> What is a "PCI slave"?

TBH I at a quick look at the PCIe SPEC, but not PCI.
PCIe might be able to check the requester_id to check if the
access is valid, but I'm not sure where to add this check.

> Which devices would still be allowed to write?

As of this patch, all the non-PCI, but I plan to add a similar
check for USB on top of this series.

> I'm worried that there are cases of MMIO reads that would be broken.
> They are certainly niche these days, but they should still work; the
> most "famous" one is perhaps the old BASIC
> 
>DEF SEG=
>BLOAD "picture.pic", 0

This looks like ISA stuff. I don't think ISA does such checks
(and didn't plan to add them there) but I'd need to verify.

Do you have an acceptance test?

> 
> Paolo
> 




Re: [PATCH] qcow2: Fix removal of list members from BDRVQcow2State.cluster_allocs

2020-09-03 Thread Max Reitz
On 02.09.20 19:42, Alberto Garcia wrote:
> When a write request needs to allocate new clusters (or change the L2
> bitmap of existing ones) a QCowL2Meta structure is created so the L2
> metadata can be later updated and any copy-on-write can be performed
> if necessary.
> 
> A write request can span a region consisting of an arbitrary
> combination of previously unallocated and allocated clusters, and if
> the unallocated ones can be put contiguous to the existing ones then
> QEMU will do so in order to minimize the number of write operations.
> 
> In practice this means that a write request has not just one but a
> number of QCowL2Meta structures. All of them are added to the
> cluster_allocs list that is stored in BDRVQcow2State and is used to
> detect overlapping requests. After the write request finishes all its
> associated QCowL2Meta are removed from that list. calculate_l2_meta()
> takes care of creating and putting those structures in the list, and
> qcow2_handle_l2meta() takes care of removing them.
> 
> The problem is that the error path in handle_alloc() also tries to
> remove an item in that list, a remnant from the time when this was
> handled there (that code would not even be correct anymore because
> it only removes one struct and not all the ones from the same write
> request).
> 
> This can trigger a double removal of the same item from the list,
> causing a crash. This is not easy to reproduce in practice because
> it requires that do_alloc_cluster_offset() fails after a successful
> previous allocation during the same write request, but it can be
> reproduced with the included test case.
> 
> As a last thing, this patch also removes the condition that
> l2meta->nb_clusters is not 0 in qcow2_handle_l2meta(). This was only
> necessary when that structure was allocated in the stack in order
> to detect if it had been initialized or not. This changed in commit
> f50f88b9fe and nowadays nb_clusters is guaranteed to be > 0.

On my search for /(\.|->)nb_clusters/ to verify this, I noticed this
comment for qcow2_alloc_cluster_offset():

 * If the cluster was already allocated, m->nb_clusters is set to 0 and



 * other fields in m are meaningless.

Should that be dropped, too?

> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c  |  3 --
>  block/qcow2.c  |  4 +-
>  tests/qemu-iotests/305 | 75 ++
>  tests/qemu-iotests/305.out | 16 
>  tests/qemu-iotests/group   |  1 +
>  5 files changed, 93 insertions(+), 6 deletions(-)
>  create mode 100755 tests/qemu-iotests/305
>  create mode 100644 tests/qemu-iotests/305.out

[...]

> diff --git a/tests/qemu-iotests/305 b/tests/qemu-iotests/305
> new file mode 100755
> index 00..6de3180c17
> --- /dev/null
> +++ b/tests/qemu-iotests/305
> @@ -0,0 +1,75 @@

[...]

> +_supported_fmt qcow2
> +_supported_proto file
> +_supported_os Linux
> +_unsupported_imgopts cluster_size refcount_bits extended_l2

data_file, too, because the refcounts work differently there.  (As in:
Not at all for data clusters.)

Also, compat(=0.10), because that wouldn’t allow -o refcount_bits=64.

> +
> +refcount_table_offset=$((0x400))

I would like to suggest $(peek_file_be "$TEST_IMG" 48 8), to set an
example for future generations; but not strictly necessary, of course. O:)

Anyway, at least with the _unsupported_imgopts line completed:

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] hw/ide: check null block pointer before blk_drain

2020-09-03 Thread Philippe Mathieu-Daudé
On 9/3/20 1:05 PM, P J P wrote:
> +-- On Mon, 31 Aug 2020, Philippe Mathieu-Daudé wrote --+
> | > +++ b/hw/ide/core.c
> | > @@ -718,7 +718,7 @@ void ide_cancel_dma_sync(IDEState *s)
> | > -if (s->bus->dma->aiocb) {
> | > +if (s->blk && s->bus->dma->aiocb) {
> | 
> | But s->blk mustn't be null here... IMHO we should assert() here and add a
> | check earlier.
> 
> ===
> diff --git a/hw/ide/core.c b/hw/ide/core.c
> index f76f7e5234..8105187f49 100644
> --- a/hw/ide/core.c
> +++ b/hw/ide/core.c
> @@ -718,6 +718,7 @@ void ide_cancel_dma_sync(IDEState *s)
>   * whole DMA operation will be submitted to disk with a single
>   * aio operation with preadv/pwritev.
>   */
> +assert(s->blk);
>  if (s->bus->dma->aiocb) {
>  trace_ide_cancel_dma_sync_remaining();
>  blk_drain(s->blk);
> diff --git a/hw/ide/pci.c b/hw/ide/pci.c
> index b50091b615..51bb9c9abc 100644
> --- a/hw/ide/pci.c
> +++ b/hw/ide/pci.c
> @@ -295,8 +295,11 @@ void bmdma_cmd_writeb(BMDMAState *bm, uint32_t val)
>  /* Ignore writes to SSBM if it keeps the old value */
>  if ((val & BM_CMD_START) != (bm->cmd & BM_CMD_START)) {
>  if (!(val & BM_CMD_START)) {
> -ide_cancel_dma_sync(idebus_active_if(bm->bus));
> -bm->status &= ~BM_STATUS_DMAING;
> +IDEState *s = idebus_active_if(bm->bus);
> +if (s->blk) {
> +ide_cancel_dma_sync(s);
> +bm->status &= ~BM_STATUS_DMAING;

If you don't clear this bit the guest might keep retrying (looping).

> +}
>  } else {
>  bm->cur_addr = bm->addr;
>  if (!(bm->status & BM_STATUS_DMAING)) {
> ===
> 
> 
> Does it look okay?
> 
> Thank you.
> --
> Prasad J Pandit / Red Hat Product Security Team
> 8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D
> 



Re: [RFC qemu 0/6] mirror: implement incremental and bitmap modes

2020-09-03 Thread Max Reitz
On 03.09.20 14:38, Kevin Wolf wrote:
> Am 03.09.2020 um 13:04 hat Max Reitz geschrieben:
>> On 03.09.20 12:13, Fabian Grünbichler wrote:
>>> On August 21, 2020 3:03 pm, Max Reitz wrote:
 On 18.02.20 11:07, Fabian Grünbichler wrote:
>>> I am not sure how 
>>> the S-O-B by John is supposed to enter the mix - should I just include 
>>> it in the squashed patch (which would be partly authored, but 
>>> not-yet-signed-off by him otherwise?)?
>>
>> I’m not too sure on the proceedings, actually.  I think it should be
>> fine if you put his S-o-b there, as long as your patch is somehow based
>> on a patch that he sent earlier with his S-o-b underneath.  But I’m not
>> sure.
> 
> Signed-off-by means that John certifies the DCO for the patch (at least
> the original version that you possibly modified), so you cannot just add
> it without asking him.

But what if you take a patch from someone and heavily modify it –
wouldn’t you keep the original S-o-b and explain the modifications in
the commit message?

Max

> John should reply with a Signed-off-by line to the patch in question.
> Then you (Fabian) can add it in the next version of the series (if I
> understand correctly, you're going to respin anyway).
> 
> I see that patch 2 doesn't have any S-o-b at all. It should have both
> John's and Fabian's.
> 
> Kevin
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH 32/63] ahci: Rename ICH_AHCI to ICH9_AHCI

2020-09-03 Thread Philippe Mathieu-Daudé
On 9/3/20 12:42 AM, Eduardo Habkost wrote:
> Make the type checking macro name consistent with the TYPE_*
> constant.
> 
> Signed-off-by: Eduardo Habkost 

Reviewed-by: Philippe Mathieu-Daudé 

> ---
> Cc: John Snow 
> Cc: qemu-block@nongnu.org
> Cc: qemu-de...@nongnu.org
> ---
>  include/hw/ide/ahci.h | 2 +-
>  hw/ide/ahci.c | 4 ++--
>  hw/ide/ich.c  | 8 
>  3 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/include/hw/ide/ahci.h b/include/hw/ide/ahci.h
> index 4cf6813d80..da3cddcc65 100644
> --- a/include/hw/ide/ahci.h
> +++ b/include/hw/ide/ahci.h
> @@ -54,7 +54,7 @@ typedef struct AHCIState {
>  typedef struct AHCIPCIState AHCIPCIState;
>  
>  #define TYPE_ICH9_AHCI "ich9-ahci"
> -DECLARE_INSTANCE_CHECKER(AHCIPCIState, ICH_AHCI,
> +DECLARE_INSTANCE_CHECKER(AHCIPCIState, ICH9_AHCI,
>   TYPE_ICH9_AHCI)
>  
>  int32_t ahci_get_num_ports(PCIDevice *dev);
> diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
> index b696c6291a..ee1d47ff75 100644
> --- a/hw/ide/ahci.c
> +++ b/hw/ide/ahci.c
> @@ -1819,7 +1819,7 @@ type_init(sysbus_ahci_register_types)
>  
>  int32_t ahci_get_num_ports(PCIDevice *dev)
>  {
> -AHCIPCIState *d = ICH_AHCI(dev);
> +AHCIPCIState *d = ICH9_AHCI(dev);
>  AHCIState *ahci = &d->ahci;
>  
>  return ahci->ports;
> @@ -1827,7 +1827,7 @@ int32_t ahci_get_num_ports(PCIDevice *dev)
>  
>  void ahci_ide_create_devs(PCIDevice *dev, DriveInfo **hd)
>  {
> -AHCIPCIState *d = ICH_AHCI(dev);
> +AHCIPCIState *d = ICH9_AHCI(dev);
>  AHCIState *ahci = &d->ahci;
>  int i;
>  
> diff --git a/hw/ide/ich.c b/hw/ide/ich.c
> index eff3188fff..51cd2f38b7 100644
> --- a/hw/ide/ich.c
> +++ b/hw/ide/ich.c
> @@ -91,14 +91,14 @@ static const VMStateDescription vmstate_ich9_ahci = {
>  
>  static void pci_ich9_reset(DeviceState *dev)
>  {
> -AHCIPCIState *d = ICH_AHCI(dev);
> +AHCIPCIState *d = ICH9_AHCI(dev);
>  
>  ahci_reset(&d->ahci);
>  }
>  
>  static void pci_ich9_ahci_init(Object *obj)
>  {
> -struct AHCIPCIState *d = ICH_AHCI(obj);
> +struct AHCIPCIState *d = ICH9_AHCI(obj);
>  
>  ahci_init(&d->ahci, DEVICE(obj));
>  }
> @@ -108,7 +108,7 @@ static void pci_ich9_ahci_realize(PCIDevice *dev, Error 
> **errp)
>  struct AHCIPCIState *d;
>  int sata_cap_offset;
>  uint8_t *sata_cap;
> -d = ICH_AHCI(dev);
> +d = ICH9_AHCI(dev);
>  int ret;
>  
>  ahci_realize(&d->ahci, DEVICE(dev), pci_get_address_space(dev), 6);
> @@ -154,7 +154,7 @@ static void pci_ich9_ahci_realize(PCIDevice *dev, Error 
> **errp)
>  static void pci_ich9_uninit(PCIDevice *dev)
>  {
>  struct AHCIPCIState *d;
> -d = ICH_AHCI(dev);
> +d = ICH9_AHCI(dev);
>  
>  msi_uninit(dev);
>  ahci_uninit(&d->ahci);
> 




Re: [PATCH] iotests: Allow running from different directory

2020-09-03 Thread Max Reitz
On 02.09.20 13:03, Kevin Wolf wrote:
> It is convenient to be able to edit the tests and run them without
> changing the current working directory back and forth. Instead of
> assuming that $PWD is the qemu-iotests build directory, derive the build
> directory from the executed script.
> 
> This allows 'check' to find the required files even when called from
> another directory. The scratch directory will still be in the current
> working directory.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  tests/qemu-iotests/check | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
> index 3ab859ac1a..22ada6a549 100755
> --- a/tests/qemu-iotests/check
> +++ b/tests/qemu-iotests/check
> @@ -44,7 +44,7 @@ then
>  _init_error "failed to obtain source tree name from check symlink"
>  fi
>  source_iotests=$(cd "$source_iotests"; pwd) || _init_error "failed to 
> enter source tree"
> -build_iotests=$PWD
> +build_iotests=$(dirname "$0")

This breaks running check from the build tree.
(i.e. cd $build/tests/qemu-iotests; ./check)

The problem is that to run the test, we do cd to the source directory
($source_iotests), and so $build_iotests then becomes invalid if it’s
just a relative path.  In my case, this leads to the following error:

-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
+./common.rc: line 139: $QEMU/tests/qemu-iotests/../../qemu-img: No such
file or directory

I think this could be resolved by wrapping the $(dirname) in
$(realpath), i.e.

build_iotests=$(realpath "$(dirname "$0")")

Max

>  else
>  # called from the source tree
>  source_iotests=$PWD
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v6 00/12] monitor: Optionally run handlers in coroutines

2020-09-03 Thread Kevin Wolf
Am 03.09.2020 um 12:49 hat Markus Armbruster geschrieben:
> Markus Armbruster  writes:
> 
> > Markus Armbruster  writes:
> >
> >> I let this series slide to get my Error API rework done, along with much
> >> else.  My sincere apologies!
> >>
> >> Unsurprisingly, it needs a rebase now.  I suggest to let me review it as
> >> is first.
> >
> > I'm done with v6.  Summary:
> >
> > * A few trivial things to correct here and there.
> >
> > * A few ideas to improve things in relatively minor ways.
> >
> > * PATCH 03 looks "why bother" to me until PATCH 09 makes me suspect you
> >   did the former to enable the latter.  If you had captured that in your
> >   commit message back then, like you did for the similar PATCH 05, I
> >   wouldn't be scratching my head now :)
> >
> > * I dislike PATCH 06, and would like to explore an alternative idea.
> >
> > * PATCH 08 makes hairy monitor code even hairier, but I don't have
> >   better ideas.
> >
> > * I don't feel comfortable as a sole reviewer of the AIO magic in PATCH
> >   10-12.  Let's ask Stefan for an eye-over.
> >
> > I'd like to proceed as follows.  You rebase, and address "easy" review
> > comments (you decide what's easy).  Post as v7, cc'ing Stefan for the
> > AIO magic and David Gilbert for HMP.  While they review (hopefully), I
> > explore a replacement for PATCH 06.  And then we touch bases and decide
> > how to get this thing wrapped.
> 
> I explored:
> 
> Subject: Ways to do per-coroutine properties (was: [PATCH v6 06/12] 
> monitor: Make current monitor a per-coroutine property)
> Date: Fri, 07 Aug 2020 15:09:19 +0200 (3 weeks, 5 days, 21 hours ago)
> Message-ID: <87a6z6wqkg.fsf...@dusky.pond.sub.org>
> 
> May I have v7?  Feel free to keep your PATCH 06.  If I decide to replace
> it, I can do it myself, possibly on top.

It's one of the next things on my list. I can't promise anything more
specific, though.

Kevin




[PATCH v7 14/15] block/nvme: Extract nvme_poll_queue()

2020-09-03 Thread Philippe Mathieu-Daudé
As we want to do per-queue polling, extract the nvme_poll_queue()
method which operates on a single queue.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 44 +++-
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 914a3c4ab31..e3719d3bd14 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -590,31 +590,41 @@ out:
 qemu_vfree(id);
 }
 
+static bool nvme_poll_queue(NVMeQueuePair *q)
+{
+bool progress = false;
+
+const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
+NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
+
+/*
+ * Do an early check for completions. q->lock isn't needed because
+ * nvme_process_completion() only runs in the event loop thread and
+ * cannot race with itself.
+ */
+if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
+return false;
+}
+
+qemu_mutex_lock(&q->lock);
+while (nvme_process_completion(q)) {
+/* Keep polling */
+progress = true;
+}
+qemu_mutex_unlock(&q->lock);
+
+return progress;
+}
+
 static bool nvme_poll_queues(BDRVNVMeState *s)
 {
 bool progress = false;
 int i;
 
 for (i = 0; i < s->nr_queues; i++) {
-NVMeQueuePair *q = s->queues[i];
-const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
-NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
-
-/*
- * Do an early check for completions. q->lock isn't needed because
- * nvme_process_completion() only runs in the event loop thread and
- * cannot race with itself.
- */
-if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
-continue;
-}
-
-qemu_mutex_lock(&q->lock);
-while (nvme_process_completion(q)) {
-/* Keep polling */
+if (nvme_poll_queue(s->queues[i])) {
 progress = true;
 }
-qemu_mutex_unlock(&q->lock);
 }
 return progress;
 }
-- 
2.26.2




[PATCH v7 13/15] block/nvme: Simplify nvme_create_queue_pair() arguments

2020-09-03 Thread Philippe Mathieu-Daudé
nvme_create_queue_pair() doesn't require BlockDriverState anymore.
Replace it by BDRVNVMeState and AioContext to simplify.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index b03735129d3..914a3c4ab31 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -208,12 +208,12 @@ static void nvme_free_req_queue_cb(void *opaque)
 qemu_mutex_unlock(&q->lock);
 }
 
-static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
+static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
+ AioContext *aio_context,
  int idx, int size,
  Error **errp)
 {
 int i, r;
-BDRVNVMeState *s = bs->opaque;
 Error *local_err = NULL;
 NVMeQueuePair *q;
 uint64_t prp_list_iova;
@@ -232,8 +232,7 @@ static NVMeQueuePair 
*nvme_create_queue_pair(BlockDriverState *bs,
 q->s = s;
 q->index = idx;
 qemu_co_queue_init(&q->free_req_queue);
-q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
-  nvme_process_completion_bh, q);
+q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
   s->page_size * NVME_NUM_REQS,
   false, &prp_list_iova);
@@ -637,7 +636,8 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 NvmeCmd cmd;
 int queue_size = NVME_QUEUE_SIZE;
 
-q = nvme_create_queue_pair(bs, n, queue_size, errp);
+q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
+   n, queue_size, errp);
 if (!q) {
 return false;
 }
@@ -683,6 +683,7 @@ static int nvme_init(BlockDriverState *bs, const char 
*device, int namespace,
  Error **errp)
 {
 BDRVNVMeState *s = bs->opaque;
+AioContext *aio_context = bdrv_get_aio_context(bs);
 int ret;
 uint64_t cap;
 uint64_t timeout_ms;
@@ -743,7 +744,7 @@ static int nvme_init(BlockDriverState *bs, const char 
*device, int namespace,
 
 /* Set up admin queue. */
 s->queues = g_new(NVMeQueuePair *, 1);
-s->queues[INDEX_ADMIN] = nvme_create_queue_pair(bs, 0,
+s->queues[INDEX_ADMIN] = nvme_create_queue_pair(s, aio_context, 0,
   NVME_QUEUE_SIZE,
   errp);
 if (!s->queues[INDEX_ADMIN]) {
-- 
2.26.2




[PATCH v7 08/15] block/nvme: Use union of NvmeIdCtrl / NvmeIdNs structures

2020-09-03 Thread Philippe Mathieu-Daudé
We allocate an unique chunk of memory then use it for two
different structures. By using an union, we make it clear
the data is overlapping (and we can remove the casts).

Suggested-by: Stefan Hajnoczi 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 31 +++
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 0992f2fb579..b0d55ecfb25 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -508,9 +508,10 @@ static int nvme_cmd_sync(BlockDriverState *bs, 
NVMeQueuePair *q,
 static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 {
 BDRVNVMeState *s = bs->opaque;
-NvmeIdCtrl *idctrl;
-NvmeIdNs *idns;
-uint8_t *id;
+union {
+NvmeIdCtrl ctrl;
+NvmeIdNs ns;
+} *id;
 NvmeLBAF *lbaf;
 uint16_t oncs;
 int r;
@@ -520,14 +521,12 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 .cdw10 = cpu_to_le32(0x1),
 };
 
-id = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
+id = qemu_try_blockalign0(bs, sizeof(*id));
 if (!id) {
 error_setg(errp, "Cannot allocate buffer for identify response");
 goto out;
 }
-idctrl = (NvmeIdCtrl *)id;
-idns = (NvmeIdNs *)id;
-r = qemu_vfio_dma_map(s->vfio, id, sizeof(NvmeIdCtrl), true, &iova);
+r = qemu_vfio_dma_map(s->vfio, id, sizeof(*id), true, &iova);
 if (r) {
 error_setg(errp, "Cannot map buffer for DMA");
 goto out;
@@ -539,22 +538,22 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 goto out;
 }
 
-if (le32_to_cpu(idctrl->nn) < namespace) {
+if (le32_to_cpu(id->ctrl.nn) < namespace) {
 error_setg(errp, "Invalid namespace");
 goto out;
 }
-s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
-s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
+s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
+s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
 /* For now the page list buffer per command is one page, to hold at most
  * s->page_size / sizeof(uint64_t) entries. */
 s->max_transfer = MIN_NON_ZERO(s->max_transfer,
   s->page_size / sizeof(uint64_t) * s->page_size);
 
-oncs = le16_to_cpu(idctrl->oncs);
+oncs = le16_to_cpu(id->ctrl.oncs);
 s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
 s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 
-memset(id, 0, 4096);
+memset(id, 0, sizeof(*id));
 cmd.cdw10 = 0;
 cmd.nsid = cpu_to_le32(namespace);
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
@@ -562,11 +561,11 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 goto out;
 }
 
-s->nsze = le64_to_cpu(idns->nsze);
-lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
+s->nsze = le64_to_cpu(id->ns.nsze);
+lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
 
-if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
-NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
+if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
+NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
 NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
 bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
 }
-- 
2.26.2




Re: [RFC qemu 0/6] mirror: implement incremental and bitmap modes

2020-09-03 Thread Kevin Wolf
Am 03.09.2020 um 13:04 hat Max Reitz geschrieben:
> On 03.09.20 12:13, Fabian Grünbichler wrote:
> > On August 21, 2020 3:03 pm, Max Reitz wrote:
> >> On 18.02.20 11:07, Fabian Grünbichler wrote:
> > I am not sure how 
> > the S-O-B by John is supposed to enter the mix - should I just include 
> > it in the squashed patch (which would be partly authored, but 
> > not-yet-signed-off by him otherwise?)?
> 
> I’m not too sure on the proceedings, actually.  I think it should be
> fine if you put his S-o-b there, as long as your patch is somehow based
> on a patch that he sent earlier with his S-o-b underneath.  But I’m not
> sure.

Signed-off-by means that John certifies the DCO for the patch (at least
the original version that you possibly modified), so you cannot just add
it without asking him.

John should reply with a Signed-off-by line to the patch in question.
Then you (Fabian) can add it in the next version of the series (if I
understand correctly, you're going to respin anyway).

I see that patch 2 doesn't have any S-o-b at all. It should have both
John's and Fabian's.

Kevin


signature.asc
Description: PGP signature


[PATCH v7 07/15] block/nvme: Rename local variable

2020-09-03 Thread Philippe Mathieu-Daudé
We are going to modify the code in the next commit. Renaming
the 'resp' variable to 'id' first makes the next commit easier
to review. No logical changes.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 19 +--
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 909a565184d..0992f2fb579 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -510,8 +510,8 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 BDRVNVMeState *s = bs->opaque;
 NvmeIdCtrl *idctrl;
 NvmeIdNs *idns;
+uint8_t *id;
 NvmeLBAF *lbaf;
-uint8_t *resp;
 uint16_t oncs;
 int r;
 uint64_t iova;
@@ -520,14 +520,14 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 .cdw10 = cpu_to_le32(0x1),
 };
 
-resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
-if (!resp) {
+id = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
+if (!id) {
 error_setg(errp, "Cannot allocate buffer for identify response");
 goto out;
 }
-idctrl = (NvmeIdCtrl *)resp;
-idns = (NvmeIdNs *)resp;
-r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
+idctrl = (NvmeIdCtrl *)id;
+idns = (NvmeIdNs *)id;
+r = qemu_vfio_dma_map(s->vfio, id, sizeof(NvmeIdCtrl), true, &iova);
 if (r) {
 error_setg(errp, "Cannot map buffer for DMA");
 goto out;
@@ -554,8 +554,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
 s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 
-memset(resp, 0, 4096);
-
+memset(id, 0, 4096);
 cmd.cdw10 = 0;
 cmd.nsid = cpu_to_le32(namespace);
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
@@ -587,8 +586,8 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 
 s->blkshift = lbaf->ds;
 out:
-qemu_vfio_dma_unmap(s->vfio, resp);
-qemu_vfree(resp);
+qemu_vfio_dma_unmap(s->vfio, id);
+qemu_vfree(id);
 }
 
 static bool nvme_poll_queues(BDRVNVMeState *s)
-- 
2.26.2




[PATCH v7 11/15] block/nvme: Simplify nvme_init_queue() arguments

2020-09-03 Thread Philippe Mathieu-Daudé
nvme_init_queue() doesn't require BlockDriverState anymore.
Replace it by BDRVNVMeState to simplify.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 0c8ad1d60b6..e04e1fa4f8f 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -165,10 +165,9 @@ static QemuOptsList runtime_opts = {
 },
 };
 
-static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
+static void nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
 int nentries, int entry_bytes, Error **errp)
 {
-BDRVNVMeState *s = bs->opaque;
 size_t bytes;
 int r;
 
@@ -251,14 +250,14 @@ static NVMeQueuePair 
*nvme_create_queue_pair(BlockDriverState *bs,
 req->prp_list_iova = prp_list_iova + i * s->page_size;
 }
 
-nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
+nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 goto fail;
 }
 q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];
 
-nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
+nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 goto fail;
-- 
2.26.2




[PATCH v7 10/15] block/nvme: Replace qemu_try_blockalign(bs) by qemu_try_memalign(pg_sz)

2020-09-03 Thread Philippe Mathieu-Daudé
qemu_try_blockalign() is a generic API that call back to the
block driver to return its page alignment. As we call from
within the very same driver, we already know to page alignment
stored in our state. Remove indirections and use the value from
BDRVNVMeState.
This change is required to later remove the BlockDriverState
argument, to make nvme_init_queue() per hardware, and not per
block driver.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 60e39b69a23..0c8ad1d60b6 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -174,7 +174,7 @@ static void nvme_init_queue(BlockDriverState *bs, NVMeQueue 
*q,
 
 bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
 q->head = q->tail = 0;
-q->queue = qemu_try_blockalign(bs, bytes);
+q->queue = qemu_try_memalign(s->page_size, bytes);
 if (!q->queue) {
 error_setg(errp, "Cannot allocate queue");
 return;
@@ -223,7 +223,7 @@ static NVMeQueuePair 
*nvme_create_queue_pair(BlockDriverState *bs,
 if (!q) {
 return NULL;
 }
-q->prp_list_pages = qemu_try_blockalign(bs,
+q->prp_list_pages = qemu_try_memalign(s->page_size,
   s->page_size * NVME_NUM_REQS);
 if (!q->prp_list_pages) {
 goto fail;
@@ -522,7 +522,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 .cdw10 = cpu_to_le32(0x1),
 };
 
-id = qemu_try_blockalign(bs, sizeof(*id));
+id = qemu_try_memalign(s->page_size, sizeof(*id));
 if (!id) {
 error_setg(errp, "Cannot allocate buffer for identify response");
 goto out;
@@ -1141,7 +1141,7 @@ static int nvme_co_prw(BlockDriverState *bs, uint64_t 
offset, uint64_t bytes,
 return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
 }
 trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
-buf = qemu_try_blockalign(bs, bytes);
+buf = qemu_try_memalign(s->page_size, bytes);
 
 if (!buf) {
 return -ENOMEM;
@@ -1285,7 +1285,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState 
*bs,
 
 assert(s->nr_queues > 1);
 
-buf = qemu_try_blockalign(bs, s->page_size);
+buf = qemu_try_memalign(s->page_size, s->page_size);
 if (!buf) {
 return -ENOMEM;
 }
-- 
2.26.2




[PATCH v7 15/15] block/nvme: Use an array of EventNotifier

2020-09-03 Thread Philippe Mathieu-Daudé
In preparation of using multiple IRQ (thus multiple eventfds)
make BDRVNVMeState::irq_notifier an array (for now of a single
element, the admin queue notifier).

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index e3719d3bd14..24e6e7f0866 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -106,6 +106,12 @@ QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
 #define INDEX_ADMIN 0
 #define INDEX_IO(n) (1 + n)
 
+/* This driver shares a single MSIX IRQ for the admin and I/O queues */
+enum {
+MSIX_SHARED_IRQ_IDX = 0,
+MSIX_IRQ_COUNT = 1
+};
+
 struct BDRVNVMeState {
 AioContext *aio_context;
 QEMUVFIOState *vfio;
@@ -120,7 +126,7 @@ struct BDRVNVMeState {
 /* How many uint32_t elements does each doorbell entry take. */
 size_t doorbell_scale;
 bool write_cache_supported;
-EventNotifier irq_notifier;
+EventNotifier irq_notifier[MSIX_IRQ_COUNT];
 
 uint64_t nsze; /* Namespace size reported by identify command */
 int nsid;  /* The namespace id to read/write data. */
@@ -631,7 +637,8 @@ static bool nvme_poll_queues(BDRVNVMeState *s)
 
 static void nvme_handle_event(EventNotifier *n)
 {
-BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
+BDRVNVMeState *s = container_of(n, BDRVNVMeState,
+irq_notifier[MSIX_SHARED_IRQ_IDX]);
 
 trace_nvme_handle_event(s);
 event_notifier_test_and_clear(n);
@@ -683,7 +690,8 @@ out_error:
 static bool nvme_poll_cb(void *opaque)
 {
 EventNotifier *e = opaque;
-BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
+BDRVNVMeState *s = container_of(e, BDRVNVMeState,
+irq_notifier[MSIX_SHARED_IRQ_IDX]);
 
 trace_nvme_poll_cb(s);
 return nvme_poll_queues(s);
@@ -705,7 +713,7 @@ static int nvme_init(BlockDriverState *bs, const char 
*device, int namespace,
 s->device = g_strdup(device);
 s->nsid = namespace;
 s->aio_context = bdrv_get_aio_context(bs);
-ret = event_notifier_init(&s->irq_notifier, 0);
+ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
 if (ret) {
 error_setg(errp, "Failed to init event notifier");
 return ret;
@@ -784,12 +792,13 @@ static int nvme_init(BlockDriverState *bs, const char 
*device, int namespace,
 }
 }
 
-ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
+ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
  VFIO_PCI_MSIX_IRQ_INDEX, errp);
 if (ret) {
 goto out;
 }
-aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+aio_set_event_notifier(bdrv_get_aio_context(bs),
+   &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
 
 nvme_identify(bs, namespace, _err);
@@ -872,9 +881,10 @@ static void nvme_close(BlockDriverState *bs)
 nvme_free_queue_pair(s->queues[i]);
 }
 g_free(s->queues);
-aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+aio_set_event_notifier(bdrv_get_aio_context(bs),
+   &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
 false, NULL, NULL);
-event_notifier_cleanup(&s->irq_notifier);
+event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
 qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
 qemu_vfio_close(s->vfio);
 
@@ -1381,7 +1391,8 @@ static void nvme_detach_aio_context(BlockDriverState *bs)
 q->completion_bh = NULL;
 }
 
-aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
+aio_set_event_notifier(bdrv_get_aio_context(bs),
+   &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, NULL, NULL);
 }
 
@@ -1391,7 +1402,7 @@ static void nvme_attach_aio_context(BlockDriverState *bs,
 BDRVNVMeState *s = bs->opaque;
 
 s->aio_context = new_context;
-aio_set_event_notifier(new_context, &s->irq_notifier,
+aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
false, nvme_handle_event, nvme_poll_cb);
 
 for (int i = 0; i < s->nr_queues; i++) {
-- 
2.26.2




[PATCH v7 12/15] block/nvme: Replace BDRV_POLL_WHILE by AIO_WAIT_WHILE

2020-09-03 Thread Philippe Mathieu-Daudé
BDRV_POLL_WHILE() is defined as:

  #define BDRV_POLL_WHILE(bs, cond) ({  \
  BlockDriverState *bs_ = (bs); \
  AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \
 cond); })

As we will remove the BlockDriverState use in the next commit,
start by using the exploded version of BDRV_POLL_WHILE().

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index e04e1fa4f8f..b03735129d3 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -493,6 +493,7 @@ static void nvme_cmd_sync_cb(void *opaque, int ret)
 static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
  NvmeCmd *cmd)
 {
+AioContext *aio_context = bdrv_get_aio_context(bs);
 NVMeRequest *req;
 int ret = -EINPROGRESS;
 req = nvme_get_free_req(q);
@@ -501,7 +502,7 @@ static int nvme_cmd_sync(BlockDriverState *bs, 
NVMeQueuePair *q,
 }
 nvme_submit_command(q, req, cmd, nvme_cmd_sync_cb, &ret);
 
-BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
+AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
 return ret;
 }
 
-- 
2.26.2




Re: flatview_write_continue global mutex deadlock

2020-09-03 Thread Paolo Bonzini
On 03/09/20 13:16, Vladimir Sementsov-Ogievskiy wrote:
> (gdb) info thr
>   Id   Target Id    Frame
> * 1    Thread 0x7fb9f0f39e00 (LWP 215115) "qemu-system-x86"
> 0x7fb9d784f54d in __lll_lock_wait () from /lib64/libpthread.so.0
> [...]
> #1  0x56069bfbd3f1 in qemu_poll_ns (fds=0x7fb9401dcdf0, nfds=1,
> timeout=542076652475) at ../util/qemu-timer.c:347
> #2  0x56069bfd949f in fdmon_poll_wait (ctx=0x56069e6864c0,
> ready_list=0x7fb9481fc200, timeout=542076652475) at ../util/fdmon-poll.c:79
> #3  0x56069bfcdf4c in aio_poll (ctx=0x56069e6864c0, blocking=true)
> at ../util/aio-posix.c:601
> #4  0x56069be80cf3 in bdrv_do_drained_begin (bs=0x56069e6c0950,
> recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at
> ../block/io.c:427
> #5  0x56069be80ddb in bdrv_drained_begin (bs=0x56069e6c0950) at
> ../block/io.c:433
> #6  0x56069bf1e5b4 in blk_drain (blk=0x56069e6adb50) at
> ../block/block-backend.c:1718
> #7  0x56069ba40fb5 in ide_cancel_dma_sync (s=0x56069f0d1548) at
> ../hw/ide/core.c:723
> [...]
> #13 0x56069bd965e2 in flatview_write_continue (fv=0x7fb9401ce100,
> addr=49152, attrs=..., ptr=0x7fb9f0f87000, len=1, addr1=0, l=1,
> mr=0x56069f0d2420) at ../exec.c:3176

So this is a vCPU thread.  The question is, why is the reconnect timer
not on the same AioContext?  If it were, aio_poll would execute it.

Paolo




[PATCH v7 03/15] block/nvme: Let nvme_create_queue_pair() fail gracefully

2020-09-03 Thread Philippe Mathieu-Daudé
As nvme_create_queue_pair() is allowed to fail, replace the
alloc() calls by try_alloc() to avoid aborting QEMU.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 4d4f728159f..ca8b039f4f0 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -213,14 +213,22 @@ static NVMeQueuePair 
*nvme_create_queue_pair(BlockDriverState *bs,
 int i, r;
 BDRVNVMeState *s = bs->opaque;
 Error *local_err = NULL;
-NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
+NVMeQueuePair *q;
 uint64_t prp_list_iova;
 
+q = g_try_new0(NVMeQueuePair, 1);
+if (!q) {
+return NULL;
+}
+q->prp_list_pages = qemu_try_blockalign0(bs,
+  s->page_size * NVME_NUM_REQS);
+if (!q->prp_list_pages) {
+goto fail;
+}
 qemu_mutex_init(>lock);
 q->s = s;
 q->index = idx;
 qemu_co_queue_init(>free_req_queue);
-q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_NUM_REQS);
 q->completion_bh = aio_bh_new(bdrv_get_aio_context(bs),
   nvme_process_completion_bh, q);
 r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
-- 
2.26.2




[PATCH v7 05/15] block/nvme: Improve error message when IO queue creation failed

2020-09-03 Thread Philippe Mathieu-Daudé
Do not use the same error message for different failures.
Display a different error whether it is the CQ or the SQ.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 488d4ddb3b8..91dba4ec5db 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -648,7 +648,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 .cdw11 = cpu_to_le32(0x3),
 };
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
-error_setg(errp, "Failed to create io queue [%d]", n);
+error_setg(errp, "Failed to create CQ io queue [%d]", n);
 nvme_free_queue_pair(q);
 return false;
 }
@@ -659,7 +659,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 .cdw11 = cpu_to_le32(0x1 | (n << 16)),
 };
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
-error_setg(errp, "Failed to create io queue [%d]", n);
+error_setg(errp, "Failed to create SQ io queue [%d]", n);
 nvme_free_queue_pair(q);
 return false;
 }
-- 
2.26.2




[PATCH v7 06/15] block/nvme: Use common error path in nvme_add_io_queue()

2020-09-03 Thread Philippe Mathieu-Daudé
Rearrange nvme_add_io_queue() by using a common error path.
This will be proven useful in few commits where we add IRQ
notification to the IO queues.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 91dba4ec5db..909a565184d 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -649,8 +649,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 };
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to create CQ io queue [%d]", n);
-nvme_free_queue_pair(q);
-return false;
+goto out_error;
 }
 cmd = (NvmeCmd) {
 .opcode = NVME_ADM_CMD_CREATE_SQ,
@@ -660,13 +659,15 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 };
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to create SQ io queue [%d]", n);
-nvme_free_queue_pair(q);
-return false;
+goto out_error;
 }
 s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
 s->queues[n] = q;
 s->nr_queues++;
 return true;
+out_error:
+nvme_free_queue_pair(q);
+return false;
 }
 
 static bool nvme_poll_cb(void *opaque)
-- 
2.26.2




[PATCH v7 02/15] block/nvme: Avoid further processing if trace event not enabled

2020-09-03 Thread Philippe Mathieu-Daudé
Avoid further processing if TRACE_NVME_SUBMIT_COMMAND_RAW is
not enabled. This is an untested attempt at performance optimization.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/nvme.c b/block/nvme.c
index a2b006be56d..4d4f728159f 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -441,6 +441,9 @@ static void nvme_trace_command(const NvmeCmd *cmd)
 {
 int i;
 
+if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
+return;
+}
 for (i = 0; i < 8; ++i) {
 uint8_t *cmdp = (uint8_t *)cmd + i * 8;
 trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
-- 
2.26.2




[PATCH v7 09/15] block/nvme: Replace qemu_try_blockalign0 by qemu_try_blockalign/memset

2020-09-03 Thread Philippe Mathieu-Daudé
In the next commit we'll get rid of qemu_try_blockalign().
To ease review, first replace qemu_try_blockalign0() by explicit
calls to qemu_try_blockalign() and memset().

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index b0d55ecfb25..60e39b69a23 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -174,12 +174,12 @@ static void nvme_init_queue(BlockDriverState *bs, 
NVMeQueue *q,
 
 bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
 q->head = q->tail = 0;
-q->queue = qemu_try_blockalign0(bs, bytes);
-
+q->queue = qemu_try_blockalign(bs, bytes);
 if (!q->queue) {
 error_setg(errp, "Cannot allocate queue");
 return;
 }
+memset(q->queue, 0, bytes);
 r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, >iova);
 if (r) {
 error_setg(errp, "Cannot map queue");
@@ -223,11 +223,12 @@ static NVMeQueuePair 
*nvme_create_queue_pair(BlockDriverState *bs,
 if (!q) {
 return NULL;
 }
-q->prp_list_pages = qemu_try_blockalign0(bs,
+q->prp_list_pages = qemu_try_blockalign(bs,
   s->page_size * NVME_NUM_REQS);
 if (!q->prp_list_pages) {
 goto fail;
 }
+memset(q->prp_list_pages, 0, s->page_size * NVME_NUM_REQS);
 qemu_mutex_init(>lock);
 q->s = s;
 q->index = idx;
@@ -521,7 +522,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 .cdw10 = cpu_to_le32(0x1),
 };
 
-id = qemu_try_blockalign0(bs, sizeof(*id));
+id = qemu_try_blockalign(bs, sizeof(*id));
 if (!id) {
 error_setg(errp, "Cannot allocate buffer for identify response");
 goto out;
@@ -531,8 +532,9 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 error_setg(errp, "Cannot map buffer for DMA");
 goto out;
 }
-cmd.dptr.prp1 = cpu_to_le64(iova);
 
+memset(id, 0, sizeof(*id));
+cmd.dptr.prp1 = cpu_to_le64(iova);
 if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to identify controller");
 goto out;
@@ -1283,11 +1285,11 @@ static int coroutine_fn 
nvme_co_pdiscard(BlockDriverState *bs,
 
 assert(s->nr_queues > 1);
 
-buf = qemu_try_blockalign0(bs, s->page_size);
+buf = qemu_try_blockalign(bs, s->page_size);
 if (!buf) {
 return -ENOMEM;
 }
-
+memset(buf, 0, s->page_size);
 buf->nlb = cpu_to_le32(bytes >> s->blkshift);
 buf->slba = cpu_to_le64(offset >> s->blkshift);
 buf->cattr = 0;
-- 
2.26.2




[PATCH v7 00/15] block/nvme: Various cleanups required to use multiple queues

2020-09-03 Thread Philippe Mathieu-Daudé
Hi Kevin,

You already queued v6, but 2 patches from Klaus modified
block/nvme.c so the series needed a rebase... Sorry for
the extra work :/

Since v6: rebased on top of:
- c26f2173704 ("hw/block/nvme: bump spec data structures to v1.3")
- 69265150aa5 ("hw/block/nvme: be consistent about zeros vs zeroes")

Thanks,

Phil.

Philippe Mathieu-Daudé (15):
  block/nvme: Replace magic value by SCALE_MS definition
  block/nvme: Avoid further processing if trace event not enabled
  block/nvme: Let nvme_create_queue_pair() fail gracefully
  block/nvme: Define INDEX macros to ease code review
  block/nvme: Improve error message when IO queue creation failed
  block/nvme: Use common error path in nvme_add_io_queue()
  block/nvme: Rename local variable
  block/nvme: Use union of NvmeIdCtrl / NvmeIdNs structures
  block/nvme: Replace qemu_try_blockalign0 by qemu_try_blockalign/memset
  block/nvme: Replace qemu_try_blockalign(bs) by
qemu_try_memalign(pg_sz)
  block/nvme: Simplify nvme_init_queue() arguments
  block/nvme: Replace BDRV_POLL_WHILE by AIO_WAIT_WHILE
  block/nvme: Simplify nvme_create_queue_pair() arguments
  block/nvme: Extract nvme_poll_queue()
  block/nvme: Use an array of EventNotifier

 block/nvme.c | 211 ++-
 1 file changed, 125 insertions(+), 86 deletions(-)

-- 
2.26.2




[PATCH v7 04/15] block/nvme: Define INDEX macros to ease code review

2020-09-03 Thread Philippe Mathieu-Daudé
Use definitions instead of '0' or '1' indexes. Also this will
be useful when using multi-queues later.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 33 +++--
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index ca8b039f4f0..488d4ddb3b8 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -103,6 +103,9 @@ typedef volatile struct {
 
 QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
 
+#define INDEX_ADMIN 0
+#define INDEX_IO(n) (1 + n)
+
 struct BDRVNVMeState {
 AioContext *aio_context;
 QEMUVFIOState *vfio;
@@ -531,7 +534,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 }
 cmd.dptr.prp1 = cpu_to_le64(iova);
 
-if (nvme_cmd_sync(bs, s->queues[0], )) {
+if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to identify controller");
 goto out;
 }
@@ -555,7 +558,7 @@ static void nvme_identify(BlockDriverState *bs, int 
namespace, Error **errp)
 
 cmd.cdw10 = 0;
 cmd.nsid = cpu_to_le32(namespace);
-if (nvme_cmd_sync(bs, s->queues[0], )) {
+if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to identify namespace");
 goto out;
 }
@@ -644,7 +647,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0x)),
 .cdw11 = cpu_to_le32(0x3),
 };
-if (nvme_cmd_sync(bs, s->queues[0], )) {
+if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to create io queue [%d]", n);
 nvme_free_queue_pair(q);
 return false;
@@ -655,7 +658,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
**errp)
 .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0x)),
 .cdw11 = cpu_to_le32(0x1 | (n << 16)),
 };
-if (nvme_cmd_sync(bs, s->queues[0], )) {
+if (nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], )) {
 error_setg(errp, "Failed to create io queue [%d]", n);
 nvme_free_queue_pair(q);
 return false;
@@ -739,16 +742,18 @@ static int nvme_init(BlockDriverState *bs, const char 
*device, int namespace,
 
 /* Set up admin queue. */
 s->queues = g_new(NVMeQueuePair *, 1);
-s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
-if (!s->queues[0]) {
+s->queues[INDEX_ADMIN] = nvme_create_queue_pair(bs, 0,
+  NVME_QUEUE_SIZE,
+  errp);
+if (!s->queues[INDEX_ADMIN]) {
 ret = -EINVAL;
 goto out;
 }
 s->nr_queues = 1;
 QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
 s->regs->aqa = cpu_to_le32((NVME_QUEUE_SIZE << 16) | NVME_QUEUE_SIZE);
-s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
-s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
+s->regs->asq = cpu_to_le64(s->queues[INDEX_ADMIN]->sq.iova);
+s->regs->acq = cpu_to_le64(s->queues[INDEX_ADMIN]->cq.iova);
 
 /* After setting up all control registers we can enable device now. */
 s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
@@ -839,7 +844,7 @@ static int nvme_enable_disable_write_cache(BlockDriverState 
*bs, bool enable,
 .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
 };
 
-ret = nvme_cmd_sync(bs, s->queues[0], );
+ret = nvme_cmd_sync(bs, s->queues[INDEX_ADMIN], );
 if (ret) {
 error_setg(errp, "Failed to configure NVMe write cache");
 }
@@ -1056,7 +1061,7 @@ static coroutine_fn int 
nvme_co_prw_aligned(BlockDriverState *bs,
 {
 int r;
 BDRVNVMeState *s = bs->opaque;
-NVMeQueuePair *ioq = s->queues[1];
+NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
 NVMeRequest *req;
 
 uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0x) |
@@ -1171,7 +1176,7 @@ static coroutine_fn int nvme_co_pwritev(BlockDriverState 
*bs,
 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
 {
 BDRVNVMeState *s = bs->opaque;
-NVMeQueuePair *ioq = s->queues[1];
+NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
 NVMeRequest *req;
 NvmeCmd cmd = {
 .opcode = NVME_CMD_FLUSH,
@@ -1202,7 +1207,7 @@ static coroutine_fn int 
nvme_co_pwrite_zeroes(BlockDriverState *bs,
   BdrvRequestFlags flags)
 {
 BDRVNVMeState *s = bs->opaque;
-NVMeQueuePair *ioq = s->queues[1];
+NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
 NVMeRequest *req;
 
 uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0x;
@@ -1255,7 +1260,7 @@ static int coroutine_fn nvme_co_pdiscard(BlockDriverState 
*bs,
  int bytes)
 {
 BDRVNVMeState *s = bs->opaque;
-NVMeQueuePair *ioq = s->queues[1];
+NVMeQueuePair *ioq = 

[PATCH v7 01/15] block/nvme: Replace magic value by SCALE_MS definition

2020-09-03 Thread Philippe Mathieu-Daudé
Use self-explicit SCALE_MS definition instead of magic value.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Signed-off-by: Philippe Mathieu-Daudé 
---
 block/nvme.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/nvme.c b/block/nvme.c
index 05485fdd118..a2b006be56d 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -715,7 +715,7 @@ static int nvme_init(BlockDriverState *bs, const char 
*device, int namespace,
 /* Reset device to get a clean state. */
 s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
 /* Wait for CSTS.RDY = 0. */
-deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 
100ULL;
+deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
 while (le32_to_cpu(s->regs->csts) & 0x1) {
 if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
 error_setg(errp, "Timeout while waiting for device to reset (%"
-- 
2.26.2




Re: [RFC PATCH 11/12] hw/pci: Only allow PCI slave devices to write to direct memory

2020-09-03 Thread Paolo Bonzini
On 03/09/20 13:08, Philippe Mathieu-Daudé wrote:
> Do not allow PCI slaves to write to indirect memory
> regions such as MMIO.
> 
> This fixes LP#1886362 and LP#1888606.

What is a "PCI slave"?  Which devices would still be allowed to write?

I'm worried that there are cases of MMIO reads that would be broken.
They are certainly niche these days, but they should still work; the
most "famous" one is perhaps the old BASIC

   DEF SEG=
   BLOAD "picture.pic", 0

Paolo




[PATCH v1 5/8] qemu-iotests: move check-block back to Makefiles

2020-09-03 Thread Alex Bennée
From: Paolo Bonzini 

check-block has its own test harness, unlike every other test.  If
we capture its output, as is in general nicer to do without V=1,
there will be no sign of progress.  So for lack of a better option
just move the invocation of the test back to Makefile rules.

As a side effect, this will also fix "make check" in --disable-tools
builds, as they were trying to run qemu-iotests without having
made qemu-img before.

Signed-off-by: Paolo Bonzini 
Signed-off-by: Alex Bennée 
Message-Id: <20200902080046.10412-1-pbonz...@redhat.com>
---
 meson.build|  2 --
 tests/Makefile.include | 15 ---
 tests/qemu-iotests/meson.build |  4 
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/meson.build b/meson.build
index 9b5076452b2..599306f59ad 100644
--- a/meson.build
+++ b/meson.build
@@ -1100,11 +1100,9 @@ if have_tools
  dependencies: [authz, block, crypto, io, qom, qemuutil], install: 
true)
   qemu_io = executable('qemu-io', files('qemu-io.c'),
  dependencies: [block, qemuutil], install: true)
-  qemu_block_tools += [qemu_img, qemu_io]
   if targetos != 'windows'
 qemu_nbd = executable('qemu-nbd', files('qemu-nbd.c'),
dependencies: [block, qemuutil], install: true)
-qemu_block_tools += [qemu_nbd]
   endif
 
   subdir('storage-daemon')
diff --git a/tests/Makefile.include b/tests/Makefile.include
index 9ac8f5b86a6..08301f5bc9b 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -468,7 +468,6 @@ check-tcg: $(RUN_TCG_TARGET_RULES)
 .PHONY: clean-tcg
 clean-tcg: $(CLEAN_TCG_TARGET_RULES)
 
-
 # Python venv for running tests
 
 .PHONY: check-venv check-acceptance
@@ -523,8 +522,18 @@ check-acceptance: check-venv $(TESTS_RESULTS_DIR) 
get-vm-images
 # Consolidated targets
 
 .PHONY: check-block check-unit check check-clean get-vm-images
-check-block:
-check-build: build-unit
+check:
+
+ifeq ($(CONFIG_TOOLS)$(CONFIG_POSIX),yy)
+QEMU_IOTESTS_HELPERS-$(CONFIG_LINUX) = 
tests/qemu-iotests/socket_scm_helper$(EXESUF)
+check: check-block
+check-block: $(SRC_PATH)/tests/check-block.sh qemu-img$(EXESUF) \
+   qemu-io$(EXESUF) qemu-nbd$(EXESUF) $(QEMU_IOTESTS_HELPERS-y) \
+   $(patsubst %-softmmu,qemu-system-%,$(filter 
%-softmmu,$(TARGET_DIRS)))
+   @$<
+endif
+
+check-build: build-unit $(QEMU_IOTESTS_HELPERS-y)
 
 check-clean:
rm -rf $(check-unit-y) tests/*.o tests/*/*.o $(QEMU_IOTESTS_HELPERS-y)
diff --git a/tests/qemu-iotests/meson.build b/tests/qemu-iotests/meson.build
index 3de09fb8fab..60470936b45 100644
--- a/tests/qemu-iotests/meson.build
+++ b/tests/qemu-iotests/meson.build
@@ -4,7 +4,3 @@ if 'CONFIG_LINUX' in config_host
 else
 socket_scm_helper = []
 endif
-test('qemu-iotests', sh, args: [files('../check-block.sh')],
- depends: [qemu_block_tools, emulators, socket_scm_helper],
- suite: 'block', timeout: 1)
-
-- 
2.20.1




[RFC PATCH 10/12] exec/memattrs: Introduce MemTxAttrs::direct_access field

2020-09-03 Thread Philippe Mathieu-Daudé
Add the 'direct_access' bit to the memory attributes to restrict
bus master access to ROM/RAM.
Have read/write accessors return MEMTX_BUS_ERROR if an access is
restricted and the region is not ROM/RAM ('direct').
Add corresponding trace events.

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/exec/memattrs.h | 3 +++
 exec.c  | 8 
 trace-events| 1 +
 3 files changed, 12 insertions(+)

diff --git a/include/exec/memattrs.h b/include/exec/memattrs.h
index 95f2d20d55b..c27cf639a96 100644
--- a/include/exec/memattrs.h
+++ b/include/exec/memattrs.h
@@ -35,6 +35,8 @@ typedef struct MemTxAttrs {
 unsigned int secure:1;
 /* Memory access is usermode (unprivileged) */
 unsigned int user:1;
+/* Bus master restricted to ROM/RAM slaves */
+unsigned int direct_access:1;
 /* Requester ID (for MSI for example) */
 unsigned int requester_id:16;
 /* Invert endianness for this page */
@@ -66,6 +68,7 @@ typedef struct MemTxAttrs {
 #define MEMTX_OK 0
 #define MEMTX_ERROR (1U << 0) /* device returned an error */
 #define MEMTX_DECODE_ERROR  (1U << 1) /* nothing at that address */
+#define MEMTX_BUS_ERROR (1U << 2) /* bus returned an error */
 typedef uint32_t MemTxResult;
 
 #endif
diff --git a/exec.c b/exec.c
index 7683afb6a8e..e6185eb04de 100644
--- a/exec.c
+++ b/exec.c
@@ -3213,6 +3213,10 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr 
addr, MemTxAttrs attrs,
 
 l = len;
 mr = flatview_translate(fv, addr, , , true, attrs);
+if (attrs.direct_access && !memory_access_is_direct(mr, true)) {
+trace_memory_access_illegal(true, addr, len, mr->name);
+return MEMTX_BUS_ERROR;
+}
 result = flatview_write_continue(fv, addr, attrs, buf, len,
  addr1, l, mr);
 
@@ -3275,6 +3279,10 @@ static MemTxResult flatview_read(FlatView *fv, hwaddr 
addr,
 
 l = len;
 mr = flatview_translate(fv, addr, , , false, attrs);
+if (attrs.direct_access && !memory_access_is_direct(mr, false)) {
+trace_memory_access_illegal(false, addr, len, mr->name);
+return MEMTX_BUS_ERROR;
+}
 return flatview_read_continue(fv, addr, attrs, buf, len,
   addr1, l, mr);
 }
diff --git a/trace-events b/trace-events
index 42107ebc697..931896735b1 100644
--- a/trace-events
+++ b/trace-events
@@ -54,6 +54,7 @@ find_ram_offset_loop(uint64_t size, uint64_t candidate, 
uint64_t offset, uint64_
 ram_block_discard_range(const char *rbname, void *hva, size_t length, bool 
need_madvise, bool need_fallocate, int ret) "%s@%p + 0x%zx: madvise: %d 
fallocate: %d ret: %d"
 memory_notdirty_write_access(uint64_t vaddr, uint64_t ram_addr, unsigned size) 
"0x%" PRIx64 " ram_addr 0x%" PRIx64 " size %u"
 memory_notdirty_set_dirty(uint64_t vaddr) "0x%" PRIx64
+memory_access_illegal(bool is_write, uint64_t addr, uint64_t len, const char 
*mr_name) "is_write:%u addr:0x%08" PRIx64 " size:0x%04" PRIx64 " region:'%s'"
 
 # memory.c
 memory_region_ops_read(int cpu_index, void *mr, uint64_t addr, uint64_t value, 
unsigned size) "cpu %d mr %p addr 0x%"PRIx64" value 0x%"PRIx64" size %u"
-- 
2.26.2




flatview_write_continue global mutex deadlock

2020-09-03 Thread Vladimir Sementsov-Ogievskiy

Hi all!

I can trigger a long I/O request with the help of the nbd reconnect-delay option, which 
makes the request wait for some time for the connection to be established again and 
retry (of course, this works if connection is already lost). So, the request 
itself may be long. And this triggers the deadlock, which seems unrelated to 
nbd itself.

So, what I do:

1. Create an image:
   qemu-img create -f qcow2 xx 100M

2. Start NBD server:
   qemu-nbd xx

3. Start vm with second nbd disk on node2, like this:

  ./build/x86_64-softmmu/qemu-system-x86_64 -nodefaults -drive 
file=/work/images/cent7.qcow2 -drive 
driver=nbd,server.type=inet,server.host=192.168.100.5,server.port=10809,reconnect-delay=60
 -vnc :0 -m 2G -enable-kvm -vga std


4. Access the vm through vnc (or some other way?), and check that NBD
   drive works:

   dd if=/dev/sdb of=/dev/null bs=1M count=10

   - the command should succeed.

5. Now, kill the nbd server, and run dd in the guest again:

   dd if=/dev/sdb of=/dev/null bs=1M count=10

Now Qemu is trying to reconnect, and dd-generated requests are waiting for the 
connection (they will wait up to 60 seconds (see reconnect-delay option above) 
and then fail). But suddenly, the VM may hang completely in a deadlock. You may need 
to increase the reconnect-delay period to catch the deadlock.

Guest os is Centos 7.3.1611, kernel 3.10.0-514.el7.x86_64

The dead lock looks as follows:

(gdb) bt
#0  0x7fb9d784f54d in __lll_lock_wait () from /lib64/libpthread.so.0
#1  0x7fb9d784ae9b in _L_lock_883 () from /lib64/libpthread.so.0
#2  0x7fb9d784ad68 in pthread_mutex_lock () from /lib64/libpthread.so.0
#3  0x56069bfb3b06 in qemu_mutex_lock_impl (mutex=0x56069c7a3fe0 , 
file=0x56069c24c79b "../util/main-loop.c", line=238) at ../util/qemu-thread-posix.c:79
#4  0x56069bd00056 in qemu_mutex_lock_iothread_impl (file=0x56069c24c79b 
"../util/main-loop.c", line=238) at ../softmmu/cpus.c:1782
#5  0x56069bfcfd6f in os_host_main_loop_wait (timeout=151823947) at 
../util/main-loop.c:238
#6  0x56069bfcfe7a in main_loop_wait (nonblocking=0) at 
../util/main-loop.c:516
#7  0x56069bdb in qemu_main_loop () at ../softmmu/vl.c:1676
#8  0x56069b95fec2 in main (argc=13, argv=0x7fffd42bff08, 
envp=0x7fffd42bff78) at ../softmmu/main.c:50
(gdb) p qemu_global_mutex
$1 = {lock = {__data = {__lock = 2, __count = 0, __owner = 215121, __nusers = 
1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0, __next = 
0x0}},
__size = "\002\000\000\000\000\000\000\000QH\003\000\001", '\000' , 
__align = 2}, file = 0x56069c1d597d "../exec.c", line = 3139, initialized = true}

exec.c:3139 is in prepare_mmio_access(), called from flatview_write_continue(). 
Let's check qemu_global_mutex owner thread:

(gdb) info thr
  Id   Target IdFrame
* 1Thread 0x7fb9f0f39e00 (LWP 215115) "qemu-system-x86" 0x7fb9d784f54d 
in __lll_lock_wait () from /lib64/libpthread.so.0
  2Thread 0x7fb9ca20e700 (LWP 215116) "qemu-system-x86" 0x7fb9d756bbf9 
in syscall () from /lib64/libc.so.6
  3Thread 0x7fb9481ff700 (LWP 215121) "qemu-system-x86" 0x7fb9d7566cff 
in ppoll () from /lib64/libc.so.6
  4Thread 0x7fb9461ff700 (LWP 215123) "qemu-system-x86" 0x7fb9d784ca35 
in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
(gdb) thr 3
[Switching to thread 3 (Thread 0x7fb9481ff700 (LWP 215121))]
#0  0x7fb9d7566cff in ppoll () from /lib64/libc.so.6
(gdb) bt
#0  0x7fb9d7566cff in ppoll () from /lib64/libc.so.6
#1  0x56069bfbd3f1 in qemu_poll_ns (fds=0x7fb9401dcdf0, nfds=1, 
timeout=542076652475) at ../util/qemu-timer.c:347
#2  0x56069bfd949f in fdmon_poll_wait (ctx=0x56069e6864c0, 
ready_list=0x7fb9481fc200, timeout=542076652475) at ../util/fdmon-poll.c:79
#3  0x56069bfcdf4c in aio_poll (ctx=0x56069e6864c0, blocking=true) at 
../util/aio-posix.c:601
#4  0x56069be80cf3 in bdrv_do_drained_begin (bs=0x56069e6c0950, 
recursive=false, parent=0x0, ignore_bds_parents=false, poll=true) at 
../block/io.c:427
#5  0x56069be80ddb in bdrv_drained_begin (bs=0x56069e6c0950) at 
../block/io.c:433
#6  0x56069bf1e5b4 in blk_drain (blk=0x56069e6adb50) at 
../block/block-backend.c:1718
#7  0x56069ba40fb5 in ide_cancel_dma_sync (s=0x56069f0d1548) at 
../hw/ide/core.c:723
#8  0x56069bb90d29 in bmdma_cmd_writeb (bm=0x56069f0d22d0, val=8) at 
../hw/ide/pci.c:298
#9  0x56069b9fa529 in bmdma_write (opaque=0x56069f0d22d0, addr=0, val=8, 
size=1) at ../hw/ide/piix.c:75
#10 0x56069bd5d9d5 in memory_region_write_accessor (mr=0x56069f0d2420, 
addr=0, value=0x7fb9481fc4d8, size=1, shift=0, mask=255, attrs=...) at 
../softmmu/memory.c:483
#11 0x56069bd5dbf3 in access_with_adjusted_size (addr=0, value=0x7fb9481fc4d8, 
size=1, access_size_min=1, access_size_max=4, access_fn=0x56069bd5d8f6 
, mr=0x56069f0d2420,
attrs=...) at ../softmmu/memory.c:544
#12 0x56069bd60bda in memory_region_dispatch_write (mr=0x56069f0d2420, 

[PATCH 07/12] dma: Let dma_memory_map() take MemTxAttrs argument

2020-09-03 Thread Philippe Mathieu-Daudé
Patch created mechanically using spatch with this script:

  @@
  expression E1, E2, E3, E4;
  @@
  - dma_memory_map(E1, E2, E3, E4)
  + dma_memory_map(E1, E2, E3, E4, MEMTXATTRS_UNSPECIFIED)

Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/pci/pci.h| 3 ++-
 include/sysemu/dma.h| 4 ++--
 dma-helpers.c   | 3 ++-
 hw/display/virtio-gpu.c | 8 ++--
 hw/hyperv/vmbus.c   | 8 +---
 hw/ide/ahci.c   | 9 ++---
 hw/usb/libhw.c  | 3 ++-
 hw/virtio/virtio.c  | 6 --
 8 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 0c3217e019c..a221dfb3b08 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -831,7 +831,8 @@ static inline void *pci_dma_map(PCIDevice *dev, dma_addr_t 
addr,
 {
 void *buf;
 
-buf = dma_memory_map(pci_get_address_space(dev), addr, plen, dir);
+buf = dma_memory_map(pci_get_address_space(dev), addr, plen, dir,
+ MEMTXATTRS_UNSPECIFIED);
 return buf;
 }
 
diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
index f0880b79e74..9d0e145d76f 100644
--- a/include/sysemu/dma.h
+++ b/include/sysemu/dma.h
@@ -134,13 +134,13 @@ int dma_memory_set(AddressSpace *as, dma_addr_t addr,
 
 static inline void *dma_memory_map(AddressSpace *as,
dma_addr_t addr, dma_addr_t *len,
-   DMADirection dir)
+   DMADirection dir, MemTxAttrs attrs)
 {
 hwaddr xlen = *len;
 void *p;
 
 p = address_space_map(as, addr, , dir == DMA_DIRECTION_FROM_DEVICE,
-  MEMTXATTRS_UNSPECIFIED);
+  attrs);
 *len = xlen;
 return p;
 }
diff --git a/dma-helpers.c b/dma-helpers.c
index 50473bb1996..dfac123680b 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -143,7 +143,8 @@ static void dma_blk_cb(void *opaque, int ret)
 while (dbs->sg_cur_index < dbs->sg->nsg) {
 cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
 cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-mem = dma_memory_map(dbs->sg->as, cur_addr, _len, dbs->dir);
+mem = dma_memory_map(dbs->sg->as, cur_addr, _len, dbs->dir,
+ MEMTXATTRS_UNSPECIFIED);
 /*
  * Make reads deterministic in icount mode. Windows sometimes issues
  * disk read requests with overlapping SGs. It leads
diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index 5f0dd7c1500..be7f5cdee46 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -648,7 +648,9 @@ int virtio_gpu_create_mapping_iov(VirtIOGPU *g,
 hwaddr len = l;
 (*iov)[i].iov_len = l;
 (*iov)[i].iov_base = dma_memory_map(VIRTIO_DEVICE(g)->dma_as,
-a, , DMA_DIRECTION_TO_DEVICE);
+a, ,
+DMA_DIRECTION_TO_DEVICE,
+MEMTXATTRS_UNSPECIFIED);
 if (addr) {
 (*addr)[i] = a;
 }
@@ -1049,7 +1051,9 @@ static int virtio_gpu_load(QEMUFile *f, void *opaque, 
size_t size,
 hwaddr len = res->iov[i].iov_len;
 res->iov[i].iov_base =
 dma_memory_map(VIRTIO_DEVICE(g)->dma_as,
-   res->addrs[i], , DMA_DIRECTION_TO_DEVICE);
+   res->addrs[i], ,
+   DMA_DIRECTION_TO_DEVICE,
+   MEMTXATTRS_UNSPECIFIED);
 
 if (!res->iov[i].iov_base || len != res->iov[i].iov_len) {
 /* Clean up the half-a-mapping we just created... */
diff --git a/hw/hyperv/vmbus.c b/hw/hyperv/vmbus.c
index 75af6b83dde..56621d72e5b 100644
--- a/hw/hyperv/vmbus.c
+++ b/hw/hyperv/vmbus.c
@@ -372,7 +372,8 @@ static ssize_t gpadl_iter_io(GpadlIter *iter, void *buf, 
uint32_t len)
 
 maddr = (iter->gpadl->gfns[idx] << TARGET_PAGE_BITS) | off_in_page;
 
-iter->map = dma_memory_map(iter->as, maddr, , iter->dir);
+iter->map = dma_memory_map(iter->as, maddr, , iter->dir,
+   MEMTXATTRS_UNSPECIFIED);
 if (mlen != pgleft) {
 dma_memory_unmap(iter->as, iter->map, mlen, iter->dir, 0);
 iter->map = NULL;
@@ -488,7 +489,8 @@ int vmbus_map_sgl(VMBusChanReq *req, DMADirection dir, 
struct iovec *iov,
 goto err;
 }
 
-iov[ret_cnt].iov_base = dma_memory_map(sgl->as, a, , dir);
+iov[ret_cnt].iov_base = dma_memory_map(sgl->as, a, , dir,
+   MEMTXATTRS_UNSPECIFIED);
 if (!l) {
 ret = -EFAULT;
 goto err;
@@ -564,7 +566,7 @@ static vmbus_ring_buffer 
*ringbuf_map_hdr(VMBusRingBufCommon *ringbuf)
 

[PATCH 03/12] dma: Let dma_memory_set() take MemTxAttrs argument

2020-09-03 Thread Philippe Mathieu-Daudé
Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/ppc/spapr_vio.h | 3 ++-
 include/sysemu/dma.h   | 3 ++-
 dma-helpers.c  | 6 +++---
 hw/nvram/fw_cfg.c  | 3 ++-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/hw/ppc/spapr_vio.h b/include/hw/ppc/spapr_vio.h
index f134f6cf574..6e5c0840248 100644
--- a/include/hw/ppc/spapr_vio.h
+++ b/include/hw/ppc/spapr_vio.h
@@ -116,7 +116,8 @@ static inline int spapr_vio_dma_write(SpaprVioDevice *dev, 
uint64_t taddr,
 static inline int spapr_vio_dma_set(SpaprVioDevice *dev, uint64_t taddr,
 uint8_t c, uint32_t size)
 {
-return (dma_memory_set(>as, taddr, c, size) != 0) ?
+return (dma_memory_set(>as, taddr,
+   c, size, MEMTXATTRS_UNSPECIFIED) != 0) ?
 H_DEST_PARM : H_SUCCESS;
 }
 
diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
index fe3999dba59..34f957cc278 100644
--- a/include/sysemu/dma.h
+++ b/include/sysemu/dma.h
@@ -123,7 +123,8 @@ static inline int dma_memory_write(AddressSpace *as, 
dma_addr_t addr,
  DMA_DIRECTION_FROM_DEVICE);
 }
 
-int dma_memory_set(AddressSpace *as, dma_addr_t addr, uint8_t c, dma_addr_t 
len);
+int dma_memory_set(AddressSpace *as, dma_addr_t addr,
+   uint8_t c, dma_addr_t len, MemTxAttrs attrs);
 
 static inline void *dma_memory_map(AddressSpace *as,
dma_addr_t addr, dma_addr_t *len,
diff --git a/dma-helpers.c b/dma-helpers.c
index 41ef24a63b6..49d66716469 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -18,7 +18,8 @@
 
 /* #define DEBUG_IOMMU */
 
-int dma_memory_set(AddressSpace *as, dma_addr_t addr, uint8_t c, dma_addr_t 
len)
+int dma_memory_set(AddressSpace *as, dma_addr_t addr,
+   uint8_t c, dma_addr_t len, MemTxAttrs attrs)
 {
 dma_barrier(as, DMA_DIRECTION_FROM_DEVICE);
 
@@ -30,8 +31,7 @@ int dma_memory_set(AddressSpace *as, dma_addr_t addr, uint8_t 
c, dma_addr_t len)
 memset(fillbuf, c, FILLBUF_SIZE);
 while (len > 0) {
 l = len < FILLBUF_SIZE ? len : FILLBUF_SIZE;
-error |= address_space_write(as, addr, MEMTXATTRS_UNSPECIFIED,
- fillbuf, l);
+error |= address_space_write(as, addr, attrs, fillbuf, l);
 len -= l;
 addr += l;
 }
diff --git a/hw/nvram/fw_cfg.c b/hw/nvram/fw_cfg.c
index f3a4728288e..a15de06a10c 100644
--- a/hw/nvram/fw_cfg.c
+++ b/hw/nvram/fw_cfg.c
@@ -397,7 +397,8 @@ static void fw_cfg_dma_transfer(FWCfgState *s)
  * tested before.
  */
 if (read) {
-if (dma_memory_set(s->dma_as, dma.address, 0, len)) {
+if (dma_memory_set(s->dma_as, dma.address, 0, len,
+   MEMTXATTRS_UNSPECIFIED)) {
 dma.control |= FW_CFG_DMA_CTL_ERROR;
 }
 }
-- 
2.26.2




[PATCH 05/12] dma: Let dma_memory_rw() take MemTxAttrs argument

2020-09-03 Thread Philippe Mathieu-Daudé
Signed-off-by: Philippe Mathieu-Daudé 
---
 include/hw/pci/pci.h |  3 ++-
 include/sysemu/dma.h | 10 +-
 dma-helpers.c|  3 ++-
 hw/intc/spapr_xive.c |  3 ++-
 hw/usb/hcd-ohci.c| 10 ++
 5 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 896cef9ad47..0c3217e019c 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -788,7 +788,8 @@ static inline AddressSpace *pci_get_address_space(PCIDevice *dev)
 static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr,
  void *buf, dma_addr_t len, DMADirection dir)
 {
-return dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
+return dma_memory_rw(pci_get_address_space(dev), addr, buf, len,
+ dir, MEMTXATTRS_UNSPECIFIED);
 }
 
 static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr,
diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
index 6068323e48f..f03edeab173 100644
--- a/include/sysemu/dma.h
+++ b/include/sysemu/dma.h
@@ -106,25 +106,25 @@ static inline int dma_memory_write_relaxed(AddressSpace *as, dma_addr_t addr,
 
 static inline int dma_memory_rw(AddressSpace *as, dma_addr_t addr,
 void *buf, dma_addr_t len,
-DMADirection dir)
+DMADirection dir, MemTxAttrs attrs)
 {
 dma_barrier(as, dir);
 
-return dma_memory_rw_relaxed(as, addr, buf, len, dir,
- MEMTXATTRS_UNSPECIFIED);
+return dma_memory_rw_relaxed(as, addr, buf, len, dir, attrs);
 }
 
 static inline int dma_memory_read(AddressSpace *as, dma_addr_t addr,
   void *buf, dma_addr_t len)
 {
-return dma_memory_rw(as, addr, buf, len, DMA_DIRECTION_TO_DEVICE);
+return dma_memory_rw(as, addr, buf, len,
+ DMA_DIRECTION_TO_DEVICE, MEMTXATTRS_UNSPECIFIED);
 }
 
 static inline int dma_memory_write(AddressSpace *as, dma_addr_t addr,
const void *buf, dma_addr_t len)
 {
 return dma_memory_rw(as, addr, (void *)buf, len,
- DMA_DIRECTION_FROM_DEVICE);
+ DMA_DIRECTION_FROM_DEVICE, MEMTXATTRS_UNSPECIFIED);
 }
 
 int dma_memory_set(AddressSpace *as, dma_addr_t addr,
diff --git a/dma-helpers.c b/dma-helpers.c
index 49d66716469..50473bb1996 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -305,7 +305,8 @@ static uint64_t dma_buf_rw(uint8_t *ptr, int32_t len, QEMUSGList *sg,
 while (len > 0) {
 ScatterGatherEntry entry = sg->sg[sg_cur_index++];
 int32_t xfer = MIN(len, entry.len);
-dma_memory_rw(sg->as, entry.base, ptr, xfer, dir);
+dma_memory_rw(sg->as, entry.base, ptr, xfer, dir,
+  MEMTXATTRS_UNSPECIFIED);
 ptr += xfer;
 len -= xfer;
 resid -= xfer;
diff --git a/hw/intc/spapr_xive.c b/hw/intc/spapr_xive.c
index 4bd0d606ba1..dbf73a8bf47 100644
--- a/hw/intc/spapr_xive.c
+++ b/hw/intc/spapr_xive.c
@@ -1666,7 +1666,8 @@ static target_ulong h_int_esb(PowerPCCPU *cpu,
 mmio_addr = xive->vc_base + xive_source_esb_mgmt(xsrc, lisn) + offset;
 
     if (dma_memory_rw(&address_space_memory, mmio_addr, &data, 8,
-  (flags & SPAPR_XIVE_ESB_STORE))) {
+  (flags & SPAPR_XIVE_ESB_STORE),
+  MEMTXATTRS_UNSPECIFIED)) {
 qemu_log_mask(LOG_GUEST_ERROR, "XIVE: failed to access ESB @0x%"
   HWADDR_PRIx "\n", mmio_addr);
 return H_HARDWARE;
diff --git a/hw/usb/hcd-ohci.c b/hw/usb/hcd-ohci.c
index 1e6e85e86a8..bac1adf439c 100644
--- a/hw/usb/hcd-ohci.c
+++ b/hw/usb/hcd-ohci.c
@@ -586,7 +586,8 @@ static int ohci_copy_td(OHCIState *ohci, struct ohci_td *td,
 if (n > len)
 n = len;
 
-if (dma_memory_rw(ohci->as, ptr + ohci->localmem_base, buf, n, dir)) {
+if (dma_memory_rw(ohci->as, ptr + ohci->localmem_base, buf,
+  n, dir, MEMTXATTRS_UNSPECIFIED)) {
 return -1;
 }
 if (n == len) {
@@ -595,7 +596,7 @@ static int ohci_copy_td(OHCIState *ohci, struct ohci_td *td,
 ptr = td->be & ~0xfffu;
 buf += n;
 if (dma_memory_rw(ohci->as, ptr + ohci->localmem_base, buf,
-  len - n, dir)) {
+  len - n, dir, MEMTXATTRS_UNSPECIFIED)) {
 return -1;
 }
 return 0;
@@ -613,7 +614,8 @@ static int ohci_copy_iso_td(OHCIState *ohci,
 if (n > len)
 n = len;
 
-if (dma_memory_rw(ohci->as, ptr + ohci->localmem_base, buf, n, dir)) {
+if (dma_memory_rw(ohci->as, ptr + ohci->localmem_base, buf,
+  n, dir, MEMTXATTRS_UNSPECIFIED)) {
 return -1;
 }
 if (n == len) {
@@ -622,7 +624,7 @@ static int ohci_copy_iso_td(OHCIState *ohci,
 ptr = end_addr & ~0xfffu;
 buf += n;
 if (dma_memory_rw(ohci->as, ptr + ohci->localmem_base, buf,

  1   2   >