[Qemu-devel] [PATCH v2 17/19] block: Kill .bdrv_co_discard()

2016-07-15 Thread Eric Blake
Now that all drivers have a byte-based .bdrv_co_pdiscard(), we
no longer need to worry about the sector-based version.  We can
also relax our minimum alignment to 1 for drivers that support it.
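
For intuition, a standalone sketch (not QEMU code; the limit values are
made up) of what dropping the BDRV_SECTOR_SIZE floor changes:

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void)
    {
        /* illustrative limits for a fully byte-based driver */
        int pdiscard_alignment = 0;   /* no discard granularity advertised */
        int request_alignment = 1;    /* byte-aligned requests supported */

        int before = MAX(512, MAX(pdiscard_alignment, request_alignment));
        int after = MAX(pdiscard_alignment, request_alignment);

        printf("old minimum align: %d, new minimum align: %d\n", before, after);
        return 0;
    }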

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int.h | 2 --
 block/io.c                | 9 ++-------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index b4d4cd2..42bbed4 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -165,8 +165,6 @@ struct BlockDriver {
  */
 int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
 int64_t offset, int count, BdrvRequestFlags flags);
-int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors);
 int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
 int64_t offset, int count);
 int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index ee87fbf..b81d1fc 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2403,14 +2403,12 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
 return 0;
 }

-if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_co_pdiscard &&
-!bs->drv->bdrv_aio_pdiscard) {
+if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
 return 0;
 }

 /* Discard is advisory, so ignore any unaligned head or tail */
-align = MAX(BDRV_SECTOR_SIZE,
-MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment));
+align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
 assert(is_power_of_2(align));
 head = MIN(count, -offset & (align - 1));
 if (head) {
@@ -2438,9 +2436,6 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,

 if (bs->drv->bdrv_co_pdiscard) {
 ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
-} else if (bs->drv->bdrv_co_discard) {
-ret = bs->drv->bdrv_co_discard(bs, offset >> BDRV_SECTOR_BITS,
-   num >> BDRV_SECTOR_BITS);
 } else {
 BlockAIOCB *acb;
 CoroutineIOCompletion co = {
-- 
2.5.5




[Qemu-devel] [PATCH v2 14/19] qcow2: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/qcow2.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index a6bca73..d620d0a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2479,15 +2479,15 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
 return ret;
 }

-static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors)
+static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
+  int64_t offset, int count)
 {
 int ret;
 BDRVQcow2State *s = bs->opaque;

 qemu_co_mutex_lock(&s->lock);
-ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
-nb_sectors, QCOW2_DISCARD_REQUEST, false);
+ret = qcow2_discard_clusters(bs, offset, count >> BDRV_SECTOR_BITS,
+ QCOW2_DISCARD_REQUEST, false);
 qemu_co_mutex_unlock(&s->lock);
 return ret;
 }
@@ -3410,7 +3410,7 @@ BlockDriver bdrv_qcow2 = {
 .bdrv_co_flush_to_os= qcow2_co_flush_to_os,

 .bdrv_co_pwrite_zeroes  = qcow2_co_pwrite_zeroes,
-.bdrv_co_discard= qcow2_co_discard,
+.bdrv_co_pdiscard   = qcow2_co_pdiscard,
 .bdrv_truncate  = qcow2_truncate,
 .bdrv_write_compressed  = qcow2_write_compressed,
 .bdrv_make_empty= qcow2_make_empty,
-- 
2.5.5




[Qemu-devel] [PATCH v2 15/19] raw_bsd: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/raw_bsd.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 68f0a91..961aa13 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -134,11 +134,10 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
 return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
 }

-static int coroutine_fn raw_co_discard(BlockDriverState *bs,
-   int64_t sector_num, int nb_sectors)
+static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
+int64_t offset, int count)
 {
-return bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS,
-nb_sectors << BDRV_SECTOR_BITS);
+return bdrv_co_pdiscard(bs->file->bs, offset, count);
 }

 static int64_t raw_getlength(BlockDriverState *bs)
@@ -244,7 +243,7 @@ BlockDriver bdrv_raw = {
 .bdrv_co_readv= &raw_co_readv,
 .bdrv_co_writev_flags = &raw_co_writev_flags,
 .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
-.bdrv_co_discard  = &raw_co_discard,
+.bdrv_co_pdiscard = &raw_co_pdiscard,
 .bdrv_co_get_block_status = &raw_co_get_block_status,
 .bdrv_truncate= &raw_truncate,
 .bdrv_getlength   = &raw_getlength,
-- 
2.5.5




[Qemu-devel] [PATCH v2 16/19] sheepdog: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/sheepdog.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index e739c56..66e1cb2 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2800,8 +2800,8 @@ static int sd_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
 }


-static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
-  int nb_sectors)
+static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
+  int count)
 {
 SheepdogAIOCB *acb;
 BDRVSheepdogState *s = bs->opaque;
@@ -2811,7 +2811,7 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
 uint32_t zero = 0;

 if (!s->discard_supported) {
-            return 0;
+        return 0;
 }

 memset(&discard_iov, 0, sizeof(discard_iov));
@@ -2820,7 +2820,10 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num,
 iov.iov_len = sizeof(zero);
 discard_iov.iov = &iov;
 discard_iov.niov = 1;
-acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors);
+assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
+assert((count & (BDRV_SECTOR_SIZE - 1)) == 0);
+acb = sd_aio_setup(bs, &discard_iov, offset >> BDRV_SECTOR_BITS,
+   count >> BDRV_SECTOR_BITS);
 acb->aiocb_type = AIOCB_DISCARD_OBJ;
 acb->aio_done_func = sd_finish_aiocb;

@@ -2954,7 +2957,7 @@ static BlockDriver bdrv_sheepdog = {
 .bdrv_co_readv  = sd_co_readv,
 .bdrv_co_writev = sd_co_writev,
 .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-.bdrv_co_discard = sd_co_discard,
+.bdrv_co_pdiscard = sd_co_pdiscard,
 .bdrv_co_get_block_status = sd_co_get_block_status,

 .bdrv_snapshot_create   = sd_snapshot_create,
@@ -2990,7 +2993,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
 .bdrv_co_readv  = sd_co_readv,
 .bdrv_co_writev = sd_co_writev,
 .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-.bdrv_co_discard = sd_co_discard,
+.bdrv_co_pdiscard = sd_co_pdiscard,
 .bdrv_co_get_block_status = sd_co_get_block_status,

 .bdrv_snapshot_create   = sd_snapshot_create,
@@ -3026,7 +3029,7 @@ static BlockDriver bdrv_sheepdog_unix = {
 .bdrv_co_readv  = sd_co_readv,
 .bdrv_co_writev = sd_co_writev,
 .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
-.bdrv_co_discard = sd_co_discard,
+.bdrv_co_pdiscard = sd_co_pdiscard,
 .bdrv_co_get_block_status = sd_co_get_block_status,

 .bdrv_snapshot_create   = sd_snapshot_create,
-- 
2.5.5




[Qemu-devel] [PATCH v2 18/19] nbd: Convert to byte-based interface

2016-07-15 Thread Eric Blake
The NBD protocol doesn't have any notion of sectors, so it is
a fairly easy conversion to use byte-based read and write.
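
For intuition, a standalone sketch (simplified stand-in for the real
struct nbd_request; field widths are assumptions) of why no sector math
is needed:

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)

    struct request {           /* simplified stand-in for struct nbd_request */
        uint64_t from;         /* byte offset on the wire */
        uint32_t len;          /* byte length on the wire */
    };

    int main(void)
    {
        uint64_t offset = 1234567, bytes = 4096;   /* no sector rounding needed */

        assert(bytes <= NBD_MAX_BUFFER_SIZE);      /* one buffer per request */
        struct request req = { .from = offset, .len = (uint32_t)bytes };
        printf("from=%" PRIu64 " len=%" PRIu32 "\n", req.from, req.len);
        return 0;
    }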

Signed-off-by: Eric Blake 
Acked-by: Paolo Bonzini 

---
v2: fix typo in commit message
---
 block/nbd-client.h  |  8 ++++----
 include/block/nbd.h |  1 -
 block/nbd-client.c  | 30 +++++++++++++++++-------------
 block/nbd.c         | 12 ++++++------
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/block/nbd-client.h b/block/nbd-client.h
index 62dec33..fa9817b 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -46,10 +46,10 @@ void nbd_client_close(BlockDriverState *bs);

 int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int nbd_client_co_flush(BlockDriverState *bs);
-int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *qiov, int flags);
-int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
-int nb_sectors, QEMUIOVector *qiov);
+int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, QEMUIOVector *qiov, int flags);
+int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov, int flags);

 void nbd_client_detach_aio_context(BlockDriverState *bs);
 void nbd_client_attach_aio_context(BlockDriverState *bs,
diff --git a/include/block/nbd.h b/include/block/nbd.h
index 503f514..cb91820 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -77,7 +77,6 @@ enum {

 /* Maximum size of a single READ/WRITE data buffer */
 #define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)
-#define NBD_MAX_SECTORS (NBD_MAX_BUFFER_SIZE / BDRV_SECTOR_SIZE)

 /* Maximum size of an export name. The NBD spec requires 256 and
  * suggests that servers support up to 4096, but we stick to only the
diff --git a/block/nbd-client.c b/block/nbd-client.c
index d22feea..2cf3237 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -218,17 +218,20 @@ static void nbd_coroutine_end(NbdClientSession *s,
 }
 }

-int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
-int nb_sectors, QEMUIOVector *qiov)
+int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
 NbdClientSession *client = nbd_get_client_session(bs);
-struct nbd_request request = { .type = NBD_CMD_READ };
+struct nbd_request request = {
+.type = NBD_CMD_READ,
+.from = offset,
+.len = bytes,
+};
 struct nbd_reply reply;
 ssize_t ret;

-assert(nb_sectors <= NBD_MAX_SECTORS);
-request.from = sector_num * 512;
-request.len = nb_sectors * 512;
+assert(bytes <= NBD_MAX_BUFFER_SIZE);
+assert(!flags);

 nbd_coroutine_start(client, &request);
 ret = nbd_co_send_request(bs, &request, NULL);
@@ -239,14 +242,17 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
 }
 nbd_coroutine_end(client, &request);
 return -reply.error;
-
 }

-int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *qiov, int flags)
+int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, QEMUIOVector *qiov, int flags)
 {
 NbdClientSession *client = nbd_get_client_session(bs);
-struct nbd_request request = { .type = NBD_CMD_WRITE };
+struct nbd_request request = {
+.type = NBD_CMD_WRITE,
+.from = offset,
+.len = bytes,
+};
 struct nbd_reply reply;
 ssize_t ret;

@@ -255,9 +261,7 @@ int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
 request.type |= NBD_CMD_FLAG_FUA;
 }

-assert(nb_sectors <= NBD_MAX_SECTORS);
-request.from = sector_num * 512;
-request.len = nb_sectors * 512;
+assert(bytes <= NBD_MAX_BUFFER_SIZE);

 nbd_coroutine_start(client, &request);
 ret = nbd_co_send_request(bs, &request, qiov);
diff --git a/block/nbd.c b/block/nbd.c
index 42cae0e..8d57220 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -438,8 +438,8 @@ static BlockDriver bdrv_nbd = {
 .instance_size  = sizeof(BDRVNBDState),
 .bdrv_parse_filename= nbd_parse_filename,
 .bdrv_file_open = nbd_open,
-.bdrv_co_readv  = nbd_client_co_readv,
-.bdrv_co_writev_flags   = nbd_client_co_writev,
+.bdrv_co_preadv = nbd_client_co_preadv,
+.bdrv_co_pwritev= nbd_client_co_pwritev,
 .bdrv_close = nbd_close,
 .bdrv_co_flush_to_os= nbd_co_flush,
 .bdrv_co_pdiscard   = nbd_client_co_pdiscard,
@@ -456,8 +456,8 @@ static BlockDriver bdrv_nbd_tcp = {
 .instance_size  = sizeof(BDRVNBDState),
 .bdrv_parse_filename= nbd_parse_filename,
 .bdrv_file_open = nbd_open

[Qemu-devel] [PATCH v2 13/19] nbd: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

While at it, call directly into nbd-client.c instead of having
a pointless trivial wrapper in nbd.c.
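
The wrapper removal is the generic pattern sketched below (stand-in
types, not the real BlockDriver table):

    #include <stdio.h>

    typedef int (*pdiscard_fn)(long offset, int count);

    static int client_pdiscard(long offset, int count)
    {
        printf("discard %d bytes at %ld\n", count, offset);
        return 0;
    }

    /* Before: a one-line forwarder sat between the table and the function:
     * static int wrapper(long o, int c) { return client_pdiscard(o, c); }
     * After: the table points at the implementation directly. */
    struct driver {
        pdiscard_fn co_pdiscard;
    } drv = { .co_pdiscard = client_pdiscard };

    int main(void)
    {
        return drv.co_pdiscard(0, 512);
    }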

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/nbd-client.h |  3 +--
 block/nbd-client.c | 11 ++++++-----
 block/nbd.c        | 12 +++---------
 3 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/block/nbd-client.h b/block/nbd-client.h
index c618dad..62dec33 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -44,8 +44,7 @@ int nbd_client_init(BlockDriverState *bs,
 Error **errp);
 void nbd_client_close(BlockDriverState *bs);

-int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
-  int nb_sectors);
+int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int nbd_client_co_flush(BlockDriverState *bs);
 int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
  int nb_sectors, QEMUIOVector *qiov, int flags);
diff --git a/block/nbd-client.c b/block/nbd-client.c
index f184844..d22feea 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -295,19 +295,20 @@ int nbd_client_co_flush(BlockDriverState *bs)
 return -reply.error;
 }

-int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num,
-  int nb_sectors)
+int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
 NbdClientSession *client = nbd_get_client_session(bs);
-struct nbd_request request = { .type = NBD_CMD_TRIM };
+struct nbd_request request = {
+.type = NBD_CMD_TRIM,
+.from = offset,
+.len = count,
+};
 struct nbd_reply reply;
 ssize_t ret;

 if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) {
 return 0;
 }
-request.from = sector_num * 512;
-request.len = nb_sectors * 512;

 nbd_coroutine_start(client, &request);
 ret = nbd_co_send_request(bs, &request, NULL);
diff --git a/block/nbd.c b/block/nbd.c
index 8a13078..42cae0e 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -360,12 +360,6 @@ static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE;
 }

-static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
-  int nb_sectors)
-{
-return nbd_client_co_discard(bs, sector_num, nb_sectors);
-}
-
 static void nbd_close(BlockDriverState *bs)
 {
 nbd_client_close(bs);
@@ -448,7 +442,7 @@ static BlockDriver bdrv_nbd = {
 .bdrv_co_writev_flags   = nbd_client_co_writev,
 .bdrv_close = nbd_close,
 .bdrv_co_flush_to_os= nbd_co_flush,
-.bdrv_co_discard= nbd_co_discard,
+.bdrv_co_pdiscard   = nbd_client_co_pdiscard,
 .bdrv_refresh_limits= nbd_refresh_limits,
 .bdrv_getlength = nbd_getlength,
 .bdrv_detach_aio_context= nbd_detach_aio_context,
@@ -466,7 +460,7 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_co_writev_flags   = nbd_client_co_writev,
 .bdrv_close = nbd_close,
 .bdrv_co_flush_to_os= nbd_co_flush,
-.bdrv_co_discard= nbd_co_discard,
+.bdrv_co_pdiscard   = nbd_client_co_pdiscard,
 .bdrv_refresh_limits= nbd_refresh_limits,
 .bdrv_getlength = nbd_getlength,
 .bdrv_detach_aio_context= nbd_detach_aio_context,
@@ -484,7 +478,7 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_co_writev_flags   = nbd_client_co_writev,
 .bdrv_close = nbd_close,
 .bdrv_co_flush_to_os= nbd_co_flush,
-.bdrv_co_discard= nbd_co_discard,
+.bdrv_co_pdiscard   = nbd_client_co_pdiscard,
 .bdrv_refresh_limits= nbd_refresh_limits,
 .bdrv_getlength = nbd_getlength,
 .bdrv_detach_aio_context= nbd_detach_aio_context,
-- 
2.5.5




[Qemu-devel] [PATCH v2 01/19] block: Convert bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards byte-based interfaces everywhere.  Replace
the sector-based bdrv_co_discard() with a new byte-based
bdrv_co_pdiscard(), which silently ignores any unaligned head
or tail.  Driver callbacks will be converted in followup patches.

By calculating the alignment outside of the loop, and clamping
the max discard to an aligned value, we can simplify the actions
done within the loop.
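
For intuition, a standalone sketch of the head/tail clamping arithmetic
(not QEMU code; the archived message is truncated below, so the
tail-alignment step shown here is an assumption in the same spirit as
the head clamp):

    #include <assert.h>
    #include <inttypes.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        int64_t offset = 1000, count = 100000, align = 512;

        assert((align & (align - 1)) == 0);          /* power of 2 */
        int64_t head = MIN(count, -offset & (align - 1));
        offset += head;                              /* skip unaligned head */
        count -= head;
        count &= ~(align - 1);                       /* drop unaligned tail */

        printf("aligned: offset=%" PRId64 " count=%" PRId64 "\n", offset, count);
        return 0;
    }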

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---

Yes, this patch is yet one more place that will need to be fixed up
before 2.7, in order to support iscsi devices that advertise a 15M
opt_discard and max_discard.  I plan on submitting that as a followup
series (as a bug fix, it qualifies for inclusion even post-hard-freeze,
whereas this series, though posted before soft freeze, probably won't
qualify once hard freeze hits).

 include/block/block.h |  2 +-
 block/blkreplay.c |  3 ++-
 block/block-backend.c |  3 ++-
 block/io.c| 67 +++
 block/raw_bsd.c   |  3 ++-
 5 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 616d8b9..4f5cebf 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -342,7 +342,7 @@ void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
 void bdrv_drain_all(void);

 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
-int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
+int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
 int bdrv_has_zero_init(BlockDriverState *bs);
 bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs);
diff --git a/block/blkreplay.c b/block/blkreplay.c
index 3368c8c..c69e5a5 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -118,7 +118,8 @@ static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors)
 {
 uint64_t reqid = request_id++;
-int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors);
+int ret = bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS,
+   nb_sectors << BDRV_SECTOR_BITS);
 block_request_create(reqid, bs, qemu_coroutine_self());
 qemu_coroutine_yield();

diff --git a/block/block-backend.c b/block/block-backend.c
index f9cea1b..d982cf9 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1113,7 +1113,8 @@ int blk_co_discard(BlockBackend *blk, int64_t sector_num, 
int nb_sectors)
 return ret;
 }

-return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors);
+return bdrv_co_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS,
+nb_sectors << BDRV_SECTOR_BITS);
 }

 int blk_co_flush(BlockBackend *blk)
diff --git a/block/io.c b/block/io.c
index 86db77e..4e04df2 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2198,7 +2198,8 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
 BlockAIOCBCoroutine *acb = opaque;
 BlockDriverState *bs = acb->common.bs;

-acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors);
+acb->req.error = bdrv_co_pdiscard(bs, acb->req.sector << BDRV_SECTOR_BITS,
+  acb->req.nb_sectors << BDRV_SECTOR_BITS);
 bdrv_co_complete(acb);
 }

@@ -2378,20 +2379,22 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 {
 DiscardCo *rwco = opaque;

-rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
+rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->sector_num << BDRV_SECTOR_BITS,
+ rwco->nb_sectors << BDRV_SECTOR_BITS);
 }

-int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors)
+int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
+  int count)
 {
 BdrvTrackedRequest req;
-int max_discard, ret;
+int max_pdiscard, ret;
+int head, align;

 if (!bs->drv) {
 return -ENOMEDIUM;
 }

-ret = bdrv_check_request(bs, sector_num, nb_sectors);
+ret = bdrv_check_byte_request(bs, offset, count);
 if (ret < 0) {
 return ret;
 } else if (bs->read_only) {
@@ -2408,45 +2411,45 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
 return 0;
 }

-tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS,
-  nb_sectors << BDRV_SECTOR_BITS, BDRV_TRACKED_DISCARD);
+/* Discard is advisory, so ignore any unaligned head or tail */
+align = MAX(BDRV_SECTOR_SIZE,
+MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment));
+assert(is_power_of_2(align));
+head = MIN(count, -offset & (align - 1));
+if (head) {
+count -= head;
+offset += head;
+}
+count = 

[Qemu-devel] [PATCH v2 19/19] raw_bsd: Convert to byte-based interface

2016-07-15 Thread Eric Blake
Since the raw format driver is just passing things through, we can
do byte-based read and write if the underlying protocol does
likewise.

There's one tricky part - if we probed the image format, we document
that we restrict operations on the initial sector.  It's easiest to
keep this guarantee by enforcing read-modify-write on sub-sector
operations (yes, this partially reverts commit ad82be2f).
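
A toy illustration of the read-modify-write that a 512-byte
request_alignment forces for sub-sector writes (an in-memory stand-in,
not the block layer):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SECTOR 512

    static uint8_t disk[4 * SECTOR];   /* toy backing store */

    /* Write [offset, offset+len) by rewriting the whole sectors around it. */
    static void rmw_write(size_t offset, const void *buf, size_t len)
    {
        size_t start = offset & ~(size_t)(SECTOR - 1);
        size_t end = (offset + len + SECTOR - 1) & ~(size_t)(SECTOR - 1);
        uint8_t tmp[4 * SECTOR];

        memcpy(tmp, disk + start, end - start);      /* read whole sectors */
        memcpy(tmp + (offset - start), buf, len);    /* modify the middle */
        memcpy(disk + start, tmp, end - start);      /* write them back */
    }

    int main(void)
    {
        rmw_write(10, "QFI", 3);   /* sub-sector write lands inside sector 0 */
        printf("%.3s\n", (char *)&disk[10]);
        return 0;
    }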

Signed-off-by: Eric Blake 

---
v2: Rather than reject sub-sector write to sector 0, enforce a
RMW by setting request_alignment [Paolo]
---
 block/raw_bsd.c | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 961aa13..588d408 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -50,33 +50,30 @@ static int raw_reopen_prepare(BDRVReopenState *reopen_state,
 return 0;
 }

-static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *qiov)
+static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, QEMUIOVector *qiov,
+  int flags)
 {
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov);
+return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }

-static int coroutine_fn
-raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-QEMUIOVector *qiov, int flags)
+static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
+   uint64_t bytes, QEMUIOVector *qiov,
+   int flags)
 {
 void *buf = NULL;
 BlockDriver *drv;
 QEMUIOVector local_qiov;
 int ret;

-if (bs->probed && sector_num == 0) {
-/* As long as these conditions are true, we can't get partial writes to
- * the probe buffer and can just directly check the request. */
+if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
+/* Handling partial writes would be a pain - so we just
+ * require that guests have 512-byte request alignment if
+ * probing occurred */
 QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512);
 QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512);
-
-if (nb_sectors == 0) {
-/* qemu_iovec_to_buf() would fail, but we want to return success
- * instead of -EINVAL in this case. */
-return 0;
-}
+assert(offset == 0 && bytes >= BLOCK_PROBE_BUF_SIZE);

 buf = qemu_try_blockalign(bs->file->bs, 512);
 if (!buf) {
@@ -105,8 +102,7 @@ raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 }

 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-ret = bdrv_co_pwritev(bs->file, sector_num * BDRV_SECTOR_SIZE,
-  nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);
+ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);

 fail:
 if (qiov == &local_qiov) {
@@ -150,6 +146,16 @@ static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 return bdrv_get_info(bs->file->bs, bdi);
 }

+static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+if (bs->probed) {
+/* To make it easier to protect the first sector, any probed
+ * image is restricted to read-modify-write on sub-sector
+ * operations. */
+bs->bl.request_alignment = BDRV_SECTOR_SIZE;
+}
+}
+
 static int raw_truncate(BlockDriverState *bs, int64_t offset)
 {
 return bdrv_truncate(bs->file->bs, offset);
@@ -240,8 +246,8 @@ BlockDriver bdrv_raw = {
 .bdrv_open= &raw_open,
 .bdrv_close   = &raw_close,
 .bdrv_create  = &raw_create,
-.bdrv_co_readv= &raw_co_readv,
-.bdrv_co_writev_flags = &raw_co_writev_flags,
+.bdrv_co_preadv   = &raw_co_preadv,
+.bdrv_co_pwritev  = &raw_co_pwritev,
 .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
 .bdrv_co_pdiscard = &raw_co_pdiscard,
 .bdrv_co_get_block_status = &raw_co_get_block_status,
@@ -249,6 +255,7 @@ BlockDriver bdrv_raw = {
 .bdrv_getlength   = &raw_getlength,
 .has_variable_length  = true,
 .bdrv_get_info= &raw_get_info,
+.bdrv_refresh_limits  = &raw_refresh_limits,
 .bdrv_probe_blocksizes = &raw_probe_blocksizes,
 .bdrv_probe_geometry  = &raw_probe_geometry,
 .bdrv_media_changed   = &raw_media_changed,
-- 
2.5.5




[Qemu-devel] [PATCH v2 12/19] iscsi: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

Unlike write_zeroes, where we can be handed unaligned requests
and must fail gracefully with -ENOTSUP for a fallback, we are
guaranteed that discard requests are always aligned because the
block layer already ignored unaligned head/tail.
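
A standalone sketch of why the guarantee helps (made-up block size, not
libiscsi code): bytes convert to LBAs with plain division once alignment
is assured:

    #include <assert.h>
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t block_size = 4096;            /* illustrative LUN block size */
        int64_t offset = 65536, count = 16384; /* pre-aligned by the block layer */

        assert(offset % block_size == 0 && count % block_size == 0);
        uint64_t lba = offset / block_size;
        uint32_t num = count / block_size;
        printf("UNMAP lba=%" PRIu64 " num=%" PRIu32 "\n", lba, num);
        return 0;
    }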

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/iscsi.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index bdc7ade..da509df 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -923,29 +923,26 @@ iscsi_getlength(BlockDriverState *bs)
 }

 static int
-coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num,
-   int nb_sectors)
+coroutine_fn iscsi_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
 IscsiLun *iscsilun = bs->opaque;
 struct IscsiTask iTask;
 struct unmap_list list;

-if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-return -EINVAL;
-}
+assert(is_byte_request_lun_aligned(offset, count, iscsilun));

 if (!iscsilun->lbp.lbpu) {
 /* UNMAP is not supported by the target */
 return 0;
 }

-list.lba = sector_qemu2lun(sector_num, iscsilun);
-list.num = sector_qemu2lun(nb_sectors, iscsilun);
+list.lba = offset / iscsilun->block_size;
+list.num = count / iscsilun->block_size;

 iscsi_co_init_iscsitask(iscsilun, &iTask);
 retry:
 if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1,
- iscsi_co_generic_cb, &iTask) == NULL) {
+ iscsi_co_generic_cb, &iTask) == NULL) {
 return -ENOMEM;
 }

@@ -975,7 +972,8 @@ retry:
 return iTask.err_code;
 }

-iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
+iscsi_allocationmap_clear(iscsilun, offset >> BDRV_SECTOR_BITS,
+  count >> BDRV_SECTOR_BITS);

 return 0;
 }
@@ -1862,7 +1860,7 @@ static BlockDriver bdrv_iscsi = {
 .bdrv_refresh_limits = iscsi_refresh_limits,

 .bdrv_co_get_block_status = iscsi_co_get_block_status,
-.bdrv_co_discard  = iscsi_co_discard,
+.bdrv_co_pdiscard  = iscsi_co_pdiscard,
 .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
 .bdrv_co_readv = iscsi_co_readv,
 .bdrv_co_writev_flags  = iscsi_co_writev_flags,
-- 
2.5.5




[Qemu-devel] [PATCH v2 10/19] blkreplay: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/blkreplay.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/blkreplay.c b/block/blkreplay.c
index c69e5a5..30f9d5f 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -114,12 +114,11 @@ static int coroutine_fn blkreplay_co_pwrite_zeroes(BlockDriverState *bs,
 return ret;
 }

-static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors)
+static int coroutine_fn blkreplay_co_pdiscard(BlockDriverState *bs,
+  int64_t offset, int count)
 {
 uint64_t reqid = request_id++;
-int ret = bdrv_co_pdiscard(bs->file->bs, sector_num << BDRV_SECTOR_BITS,
-   nb_sectors << BDRV_SECTOR_BITS);
+int ret = bdrv_co_pdiscard(bs->file->bs, offset, count);
 block_request_create(reqid, bs, qemu_coroutine_self());
 qemu_coroutine_yield();

@@ -149,7 +148,7 @@ static BlockDriver bdrv_blkreplay = {
 .bdrv_co_pwritev= blkreplay_co_pwritev,

 .bdrv_co_pwrite_zeroes  = blkreplay_co_pwrite_zeroes,
-.bdrv_co_discard= blkreplay_co_discard,
+.bdrv_co_pdiscard   = blkreplay_co_pdiscard,
 .bdrv_co_flush  = blkreplay_co_flush,
 };

-- 
2.5.5




[Qemu-devel] [PATCH v2 08/19] block: Convert .bdrv_aio_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards byte-based interfaces everywhere.  Replace
the sector-based driver callback .bdrv_aio_discard() with a new
byte-based .bdrv_aio_pdiscard().  Only raw-posix and RBD drivers
are affected, so it was not worth splitting into multiple patches.

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int.h |  4 ++--
 block/io.c                |  7 +++----
 block/raw-posix.c         | 18 ++++++++----------
 block/rbd.c               | 15 +++++++--------
 4 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 8054146..0cbe250 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -142,8 +142,8 @@ struct BlockDriver {
 BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
 BlockCompletionFunc *cb, void *opaque);
-BlockAIOCB *(*bdrv_aio_discard)(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors,
+BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
+int64_t offset, int count,
 BlockCompletionFunc *cb, void *opaque);

 int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
diff --git a/block/io.c b/block/io.c
index 3babbdc..40d8444 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2403,7 +2403,7 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
 return 0;
 }

-if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) {
+if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_pdiscard) {
 return 0;
 }

@@ -2444,9 +2444,8 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
 .coroutine = qemu_coroutine_self(),
 };

-acb = bs->drv->bdrv_aio_discard(bs, offset >> BDRV_SECTOR_BITS,
-num >> BDRV_SECTOR_BITS,
-bdrv_co_io_em_complete, &co);
+acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
+ bdrv_co_io_em_complete, &co);
 if (acb == NULL) {
 ret = -EIO;
 goto out;
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6ed329d..2c98cab 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1865,14 +1865,13 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
 return ret | BDRV_BLOCK_OFFSET_VALID | start;
 }

-static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors,
+static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
+int64_t offset, int count,
 BlockCompletionFunc *cb, void *opaque)
 {
 BDRVRawState *s = bs->opaque;

-return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL,
-   nb_sectors << BDRV_SECTOR_BITS,
+return paio_submit(bs, s->fd, offset, NULL, count,
cb, opaque, QEMU_AIO_DISCARD);
 }

@@ -1944,7 +1943,7 @@ BlockDriver bdrv_file = {
 .bdrv_co_preadv = raw_co_preadv,
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_aio_flush = raw_aio_flush,
-.bdrv_aio_discard = raw_aio_discard,
+.bdrv_aio_pdiscard = raw_aio_pdiscard,
 .bdrv_refresh_limits = raw_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
@@ -2286,8 +2285,8 @@ static int fd_open(BlockDriverState *bs)
 return -EIO;
 }

-static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors,
+static coroutine_fn BlockAIOCB *hdev_aio_pdiscard(BlockDriverState *bs,
+int64_t offset, int count,
 BlockCompletionFunc *cb, void *opaque)
 {
 BDRVRawState *s = bs->opaque;
@@ -2295,8 +2294,7 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
 if (fd_open(bs) < 0) {
 return NULL;
 }
-return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL,
-   nb_sectors << BDRV_SECTOR_BITS,
+return paio_submit(bs, s->fd, offset, NULL, count,
cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }

@@ -2391,7 +2389,7 @@ static BlockDriver bdrv_host_device = {
 .bdrv_co_preadv = raw_co_preadv,
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_aio_flush= raw_aio_flush,
-.bdrv_aio_discard   = hdev_aio_discard,
+.bdrv_aio_pdiscard   = hdev_aio_pdiscard,
 .bdrv_refresh_limits = raw_refresh_limits,
 .bdrv_io_plug = raw_aio_plug,
 .bdrv_io_unplug = raw_aio_unplug,
diff --git a/block/rbd.c b/block/rbd.c
index 01cbb63..0106fea 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -930,14 +930,13 @@ static int qemu_rbd_snap_list(BlockDriverState *bs,
 }

 #ifdef LIBRBD_SUPPORTS_DISCARD
-static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
-int64_t sector_num,
-

[Qemu-devel] [PATCH v2 05/19] block: Convert BB interface to byte-based discards

2016-07-15 Thread Eric Blake
Change sector-based blk_discard(), blk_co_discard(), and
blk_aio_discard() to instead be byte-based blk_pdiscard(),
blk_co_pdiscard(), and blk_aio_pdiscard().  NBD gets a lot
simpler now that ignoring the unaligned portion of a
byte-based discard request is handled under the hood by
the block layer.
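
The simplification follows this generic pattern (a sketch with a
stand-in for blk_co_pdiscard(), not the actual nbd/server.c code):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for blk_co_pdiscard(): takes bytes directly. */
    static int blk_pdiscard_stub(int64_t offset, int count)
    {
        printf("discard offset=%" PRId64 " count=%d\n", offset, count);
        return 0;
    }

    int main(void)
    {
        /* Before: callers computed sector_num/nb_sectors and dealt with the
         * unaligned remainder themselves.  After: pass the bytes through;
         * the block layer ignores the unaligned head/tail of an advisory
         * discard on its own. */
        uint64_t client_offset = 1000, client_len = 3000;  /* unaligned is fine */
        return blk_pdiscard_stub((int64_t)client_offset, (int)client_len);
    }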

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 

---
v2: tweak commit message for grep'ability
---
 include/sysemu/block-backend.h |  9 -
 block/block-backend.c  | 25 +++--
 block/mirror.c |  5 +++--
 hw/block/xen_disk.c|  7 ---
 hw/ide/core.c  |  6 --
 hw/scsi/scsi-disk.c|  8 
 nbd/server.c   | 19 +--
 qemu-io-cmds.c |  3 +--
 8 files changed, 36 insertions(+), 46 deletions(-)

diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 3c3e82f..2da4905 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -139,15 +139,14 @@ BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
 BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *blk_aio_flush(BlockBackend *blk,
   BlockCompletionFunc *cb, void *opaque);
-BlockAIOCB *blk_aio_discard(BlockBackend *blk,
-int64_t sector_num, int nb_sectors,
-BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk, int64_t offset, int count,
+ BlockCompletionFunc *cb, void *opaque);
 void blk_aio_cancel(BlockAIOCB *acb);
 void blk_aio_cancel_async(BlockAIOCB *acb);
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
 BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
   BlockCompletionFunc *cb, void *opaque);
-int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors);
+int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count);
 int blk_co_flush(BlockBackend *blk);
 int blk_flush(BlockBackend *blk);
 int blk_flush_all(void);
@@ -207,7 +206,7 @@ int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
 int blk_write_compressed(BlockBackend *blk, int64_t sector_num,
  const uint8_t *buf, int nb_sectors);
 int blk_truncate(BlockBackend *blk, int64_t offset);
-int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors);
+int blk_pdiscard(BlockBackend *blk, int64_t offset, int count);
 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
  int64_t pos, int size);
 int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size);
diff --git a/block/block-backend.c b/block/block-backend.c
index 8b16b95..effa038 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1065,17 +1065,16 @@ BlockAIOCB *blk_aio_flush(BlockBackend *blk,
 return bdrv_aio_flush(blk_bs(blk), cb, opaque);
 }

-BlockAIOCB *blk_aio_discard(BlockBackend *blk,
-int64_t sector_num, int nb_sectors,
-BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
+ int64_t offset, int count,
+ BlockCompletionFunc *cb, void *opaque)
 {
-int ret = blk_check_request(blk, sector_num, nb_sectors);
+int ret = blk_check_byte_request(blk, offset, count);
 if (ret < 0) {
 return blk_abort_aio_request(blk, cb, opaque, ret);
 }

-return bdrv_aio_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS,
- nb_sectors << BDRV_SECTOR_BITS, cb, opaque);
+return bdrv_aio_pdiscard(blk_bs(blk), offset, count, cb, opaque);
 }

 void blk_aio_cancel(BlockAIOCB *acb)
@@ -1107,15 +1106,14 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
 return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque);
 }

-int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-int ret = blk_check_request(blk, sector_num, nb_sectors);
+int ret = blk_check_byte_request(blk, offset, count);
 if (ret < 0) {
 return ret;
 }

-return bdrv_co_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS,
-nb_sectors << BDRV_SECTOR_BITS);
+return bdrv_co_pdiscard(blk_bs(blk), offset, count);
 }

 int blk_co_flush(BlockBackend *blk)
@@ -1506,15 +1504,14 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
 return bdrv_truncate(blk_bs(blk), offset);
 }

-int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
+int blk_pdiscard(BlockBackend *blk, int64_t offset, int count)
 {
-int ret = blk_check_request(blk, sector_num, nb_sectors);
+int ret = blk_check_byte_request(blk, offset, count);
 if (ret < 

[Qemu-devel] [PATCH v2 09/19] block: Add .bdrv_co_pdiscard() driver callback

2016-07-15 Thread Eric Blake
There's enough drivers with a sector-based callback that it will
be easier to switch one at a time.  This patch adds a byte-based
callback, and then after all drivers are swapped, we'll drop the
sector-based callback.

[checkpatch doesn't like the space after coroutine_fn in
block_int.h, but it's consistent with the rest of the file]
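
A condensed sketch of the transition-period dispatch (stand-in types;
mirrors the hunk below):

    #include <stdio.h>

    #define BDRV_SECTOR_BITS 9

    struct drv {
        int (*co_pdiscard)(long offset, int count);         /* new, byte-based */
        int (*co_discard)(long sector_num, int nb_sectors); /* old, sector-based */
    };

    static int do_pdiscard(struct drv *d, long offset, int num)
    {
        if (d->co_pdiscard) {
            return d->co_pdiscard(offset, num);
        } else if (d->co_discard) {
            return d->co_discard(offset >> BDRV_SECTOR_BITS,
                                 num >> BDRV_SECTOR_BITS);
        }
        return 0;   /* discard is advisory: no callback, nothing to do */
    }

    static int old_cb(long s, int n) { printf("sectors %ld+%d\n", s, n); return 0; }

    int main(void)
    {
        struct drv d = { .co_discard = old_cb };   /* driver not yet converted */
        return do_pdiscard(&d, 4096, 8192);
    }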

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block_int.h | 2 ++
 block/io.c                | 7 +++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 0cbe250..b4d4cd2 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -167,6 +167,8 @@ struct BlockDriver {
 int64_t offset, int count, BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors);
+int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
+int64_t offset, int count);
 int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, int *pnum,
 BlockDriverState **file);
diff --git a/block/io.c b/block/io.c
index 40d8444..ee87fbf 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2403,7 +2403,8 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
 return 0;
 }

-if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_pdiscard) {
+if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_co_pdiscard &&
+!bs->drv->bdrv_aio_pdiscard) {
 return 0;
 }

@@ -2435,7 +2436,9 @@ int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
 int ret;
 int num = MIN(count, max_pdiscard);

-if (bs->drv->bdrv_co_discard) {
+if (bs->drv->bdrv_co_pdiscard) {
+ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
+} else if (bs->drv->bdrv_co_discard) {
 ret = bs->drv->bdrv_co_discard(bs, offset >> BDRV_SECTOR_BITS,
num >> BDRV_SECTOR_BITS);
 } else {
-- 
2.5.5




[Qemu-devel] [PATCH v2 03/19] block: Switch BlockRequest to byte-based

2016-07-15 Thread Eric Blake
BlockRequest is the internal struct used by bdrv_aio_*.  At the
moment, all such calls were sector-based, but we will eventually
convert to byte-based; start by changing the internal variables
to be byte-based.  No change to behavior, although the read and
write code can now go byte-based through more of the stack.
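
One consequence, sketched with toy iovec types (not the QEMU ones): the
iovec's own size now supplies the byte length, so nb_sectors need not be
carried separately:

    #include <stddef.h>
    #include <stdio.h>

    struct iov { void *base; size_t len; };
    struct qiov { struct iov *iov; int niov; size_t size; /* sum of iov lens */ };

    int main(void)
    {
        char a[512], b[1024];
        struct iov v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        struct qiov q = { v, 2, sizeof(a) + sizeof(b) };

        /* bdrv_co_preadv(child, offset, q.size, &q, flags): the length is
         * derived from the iovec instead of a separate nb_sectors field. */
        printf("request length: %zu bytes\n", q.size);
        return 0;
    }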

Signed-off-by: Eric Blake 

---
v2: Don't pass out-of-scope local qiov through aio, rebase to master
---
 block/io.c | 62 ++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

diff --git a/block/io.c b/block/io.c
index 2b4dc6e..478aade 100644
--- a/block/io.c
+++ b/block/io.c
@@ -33,14 +33,13 @@

 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write);
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+  int64_t offset,
+  QEMUIOVector *qiov,
+  BdrvRequestFlags flags,
+  BlockCompletionFunc *cb,
+  void *opaque,
+  bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 int64_t offset, int count, BdrvRequestFlags flags);
@@ -2014,8 +2013,9 @@ BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
 {
 trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);

-return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
- cb, opaque, false);
+assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+  0, cb, opaque, false);
 }

 BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
@@ -2024,8 +2024,9 @@ BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
 {
 trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);

-return bdrv_co_aio_rw_vector(child, sector_num, qiov, nb_sectors, 0,
- cb, opaque, true);
+assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
+return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
+  0, cb, opaque, true);
 }

 void bdrv_aio_cancel(BlockAIOCB *acb)
@@ -2061,8 +2062,8 @@ typedef struct BlockRequest {
 union {
 /* Used during read, write, trim */
 struct {
-int64_t sector;
-int nb_sectors;
+int64_t offset;
+int bytes;
 int flags;
 QEMUIOVector *qiov;
 };
@@ -2126,24 +2127,23 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque)
 BlockAIOCBCoroutine *acb = opaque;

 if (!acb->is_write) {
-acb->req.error = bdrv_co_do_readv(acb->child, acb->req.sector,
-acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
+acb->req.qiov->size, acb->req.qiov, acb->req.flags);
 } else {
-acb->req.error = bdrv_co_do_writev(acb->child, acb->req.sector,
-acb->req.nb_sectors, acb->req.qiov, acb->req.flags);
+acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
+acb->req.qiov->size, acb->req.qiov, acb->req.flags);
 }

 bdrv_co_complete(acb);
 }

-static BlockAIOCB *bdrv_co_aio_rw_vector(BdrvChild *child,
- int64_t sector_num,
- QEMUIOVector *qiov,
- int nb_sectors,
- BdrvRequestFlags flags,
- BlockCompletionFunc *cb,
- void *opaque,
- bool is_write)
+static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
+  int64_t offset,
+  QEMUIOVector *qiov,
+  BdrvRequestFlags flags,
+  BlockCompletionFunc *cb,
+  void *opaque,
+  bool is_write)
 {
 Coroutine *co;
 BlockAIOCBCoroutine *acb;
@@ -2152,8 

[Qemu-devel] [PATCH v2 06/19] raw-posix: Switch paio_submit() to byte-based

2016-07-15 Thread Eric Blake
The only remaining uses of paio_submit() were flush (with no
offset or count) and discard (which we are switching to byte-based);
furthermore, the similarly named paio_submit_co() is already
byte-based.
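
A standalone sketch (stand-in for QEMUIOVector) of the invariant a
byte-based submit path can now assert, with flush passing no iovec:

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    struct qiov { size_t size; };   /* stand-in for QEMUIOVector */

    static void submit(size_t offset, struct qiov *qiov, size_t count)
    {
        if (qiov) {
            assert(qiov->size == count);   /* bytes now, no *512 slippage */
        }
        printf("submit offset=%zu nbytes=%zu\n", offset, count);
    }

    int main(void)
    {
        struct qiov q = { .size = 4096 };
        submit(0, &q, 4096);      /* read/write style */
        submit(0, NULL, 8192);    /* discard style: no iovec */
        return 0;
    }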

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/raw-posix.c  | 14 ++++++++------
 block/raw-win32.c  | 19 +++++++++++--------
 block/trace-events |  2 +-
 3 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index d1c3bd8..6ed329d 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1287,7 +1287,7 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 }

 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
-int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+int64_t offset, QEMUIOVector *qiov, int count,
 BlockCompletionFunc *cb, void *opaque, int type)
 {
 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
@@ -1297,8 +1297,8 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
 acb->aio_type = type;
 acb->aio_fildes = fd;

-acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE;
-acb->aio_offset = sector_num * BDRV_SECTOR_SIZE;
+acb->aio_nbytes = count;
+acb->aio_offset = offset;

 if (qiov) {
 acb->aio_iov = qiov->iov;
@@ -1306,7 +1306,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
 assert(qiov->size == acb->aio_nbytes);
 }

-trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+trace_paio_submit(acb, opaque, offset, count, type);
 pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
@@ -1871,7 +1871,8 @@ static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs,
 {
 BDRVRawState *s = bs->opaque;

-return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL,
+   nb_sectors << BDRV_SECTOR_BITS,
cb, opaque, QEMU_AIO_DISCARD);
 }

@@ -2294,7 +2295,8 @@ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs,
 if (fd_open(bs) < 0) {
 return NULL;
 }
-return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors,
+return paio_submit(bs, s->fd, sector_num << BDRV_SECTOR_BITS, NULL,
+   nb_sectors << BDRV_SECTOR_BITS,
cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }

diff --git a/block/raw-win32.c b/block/raw-win32.c
index 62edb1a..3ff53d6 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -142,7 +142,7 @@ static int aio_worker(void *arg)
 }

 static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
-int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+int64_t offset, QEMUIOVector *qiov, int count,
 BlockCompletionFunc *cb, void *opaque, int type)
 {
 RawWin32AIOData *acb = g_new(RawWin32AIOData, 1);
@@ -155,11 +155,12 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile,
 if (qiov) {
 acb->aio_iov = qiov->iov;
 acb->aio_niov = qiov->niov;
+assert(qiov->size == count);
 }
-acb->aio_nbytes = nb_sectors * 512;
-acb->aio_offset = sector_num * 512;
+acb->aio_nbytes = count;
+acb->aio_offset = offset;

-trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
+trace_paio_submit(acb, opaque, offset, count, type);
 pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
 return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque);
 }
@@ -378,9 +379,10 @@ static BlockAIOCB *raw_aio_readv(BlockDriverState *bs,
 BDRVRawState *s = bs->opaque;
 if (s->aio) {
 return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-nb_sectors, cb, opaque, QEMU_AIO_READ); 
+nb_sectors, cb, opaque, QEMU_AIO_READ);
 } else {
-return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
+   nb_sectors << BDRV_SECTOR_BITS,
cb, opaque, QEMU_AIO_READ);
 }
 }
@@ -392,9 +394,10 @@ static BlockAIOCB *raw_aio_writev(BlockDriverState *bs,
 BDRVRawState *s = bs->opaque;
 if (s->aio) {
 return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
-nb_sectors, cb, opaque, QEMU_AIO_WRITE); 
+nb_sectors, cb, opaque, QEMU_AIO_WRITE);
 } else {
-return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+return paio_submit(bs, s->hfile, sector_num << BDRV_SECTOR_BITS, qiov,
+   nb_sectors << BDRV_SECTOR_BITS,
cb, opaque, QEMU_AIO_WRITE);
 }
 }
diff --git a/block/trace-events b/block/trace-events
index 90d618a..978ef4f 100644
--- a/

[Qemu-devel] [PATCH v2 04/19] block: Convert bdrv_aio_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards byte-based interfaces everywhere.  Replace
the sector-based bdrv_aio_discard() with a new byte-based
bdrv_aio_pdiscard(), which silently ignores any unaligned head
or tail.  Driver callbacks will be converted in followup patches.
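
The AIO flavor differs from the coroutine one mainly in its
completion-callback shape; a minimal stand-in sketch (synchronous here
for simplicity, unlike the real coroutine-backed version):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef void completion_fn(void *opaque, int ret);

    /* Toy byte-based AIO discard: "completes" immediately with success. */
    static void aio_pdiscard(int64_t offset, int count,
                             completion_fn *cb, void *opaque)
    {
        printf("discard %d bytes at %" PRId64 "\n", count, offset);
        cb(opaque, 0);
    }

    static void done(void *opaque, int ret)
    {
        *(int *)opaque = ret;
    }

    int main(void)
    {
        int ret = -1;
        aio_pdiscard(512, 1024, done, &ret);
        return ret;
    }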

Signed-off-by: Eric Blake 

---
v2: rebase to master and to qiov change in previous patch
---
 include/block/block.h |  6 +++---
 block/block-backend.c |  3 ++-
 block/io.c            | 15 +++++++--------
 block/trace-events    |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 94cabbb..11c162d 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -316,9 +316,9 @@ BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t 
sector_num,
 BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
BlockCompletionFunc *cb, void *opaque);
-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
- int64_t sector_num, int nb_sectors,
- BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs,
+  int64_t offset, int count,
+  BlockCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockAIOCB *acb);
 void bdrv_aio_cancel_async(BlockAIOCB *acb);

diff --git a/block/block-backend.c b/block/block-backend.c
index 83b6407..8b16b95 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1074,7 +1074,8 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk,
 return blk_abort_aio_request(blk, cb, opaque, ret);
 }

-return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque);
+return bdrv_aio_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS, cb, opaque);
 }

 void blk_aio_cancel(BlockAIOCB *acb)
diff --git a/block/io.c b/block/io.c
index 478aade..3babbdc 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2192,7 +2192,7 @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
 return &acb->common;
 }

-static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque)
 {
 BlockAIOCBCoroutine *acb = opaque;
 BlockDriverState *bs = acb->common.bs;
@@ -2201,21 +2201,20 @@ static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque)
 bdrv_co_complete(acb);
 }

-BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors,
-BlockCompletionFunc *cb, void *opaque)
+BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
+  BlockCompletionFunc *cb, void *opaque)
 {
 Coroutine *co;
 BlockAIOCBCoroutine *acb;

-trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque);
+trace_bdrv_aio_pdiscard(bs, offset, count, opaque);

 acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
 acb->need_bh = true;
 acb->req.error = -EINPROGRESS;
-acb->req.offset = sector_num << BDRV_SECTOR_BITS;
-acb->req.bytes = nb_sectors << BDRV_SECTOR_BITS;
-co = qemu_coroutine_create(bdrv_aio_discard_co_entry, acb);
+acb->req.offset = offset;
+acb->req.bytes = count;
+co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
 qemu_coroutine_enter(co);

 bdrv_co_maybe_schedule_bh(acb);
diff --git a/block/trace-events b/block/trace-events
index 354967e..90d618a 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -9,7 +9,7 @@ blk_co_preadv(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags
 blk_co_pwritev(void *blk, void *bs, int64_t offset, unsigned int bytes, int flags) "blk %p bs %p offset %"PRId64" bytes %u flags %x"

 # block/io.c
-bdrv_aio_discard(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
+bdrv_aio_pdiscard(void *bs, int64_t offset, int count, void *opaque) "bs %p offset %"PRId64" count %d opaque %p"
 bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
 bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
-- 
2.5.5




[Qemu-devel] [PATCH v2 02/19] block: Convert bdrv_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards byte-based interfaces everywhere.  Replace
the sector-based bdrv_discard() with a new byte-based
bdrv_pdiscard(), which silently ignores any unaligned head
or tail.
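
The synchronous bdrv_pdiscard() keeps the usual wrapper shape: pack the
byte arguments into a struct, run the coroutine entry, and poll until
the sentinel is overwritten. A stripped-down sketch, with a plain call
standing in for the coroutine:

    #include <stdio.h>

    #define NOT_DONE 0x7fffffff   /* sentinel: result not yet written */

    typedef struct DiscardCo {
        long offset;
        int count;
        int ret;
    } DiscardCo;

    static void pdiscard_co_entry(void *opaque)
    {
        DiscardCo *rwco = opaque;
        /* stand-in for rwco->ret = bdrv_co_pdiscard(bs, offset, count); */
        rwco->ret = 0;
    }

    int main(void)
    {
        DiscardCo rwco = { .offset = 4096, .count = 8192, .ret = NOT_DONE };

        pdiscard_co_entry(&rwco);     /* "fast path": call the entry directly */
        while (rwco.ret == NOT_DONE) {
            /* real code: aio_poll(aio_context, true); */
        }
        printf("ret=%d\n", rwco.ret);
        return 0;
    }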

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 

---
v2: rebase to master, trivial enough to keep R-b
---
 include/block/block.h  |  2 +-
 block/block-backend.c  |  3 ++-
 block/io.c             | 19 +++++++++++----------
 block/qcow2-refcount.c |  4 +---
 4 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 4f5cebf..94cabbb 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -341,7 +341,7 @@ void bdrv_drain(BlockDriverState *bs);
 void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
 void bdrv_drain_all(void);

-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int count);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
 int bdrv_has_zero_init(BlockDriverState *bs);
diff --git a/block/block-backend.c b/block/block-backend.c
index d982cf9..83b6407 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1512,7 +1512,8 @@ int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors)
 return ret;
 }

-return bdrv_discard(blk_bs(blk), sector_num, nb_sectors);
+return bdrv_pdiscard(blk_bs(blk), sector_num << BDRV_SECTOR_BITS,
+ nb_sectors << BDRV_SECTOR_BITS);
 }

 int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
diff --git a/block/io.c b/block/io.c
index 4e04df2..2b4dc6e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2371,16 +2371,15 @@ int bdrv_flush(BlockDriverState *bs)

 typedef struct DiscardCo {
 BlockDriverState *bs;
-int64_t sector_num;
-int nb_sectors;
+int64_t offset;
+int count;
 int ret;
 } DiscardCo;
-static void coroutine_fn bdrv_discard_co_entry(void *opaque)
+static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
 {
 DiscardCo *rwco = opaque;

-rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->sector_num << BDRV_SECTOR_BITS,
- rwco->nb_sectors << BDRV_SECTOR_BITS);
+rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
 }

 int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
@@ -2474,23 +2473,23 @@ out:
 return ret;
 }

-int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
 {
 Coroutine *co;
 DiscardCo rwco = {
 .bs = bs,
-.sector_num = sector_num,
-.nb_sectors = nb_sectors,
+.offset = offset,
+.count = count,
 .ret = NOT_DONE,
 };

 if (qemu_in_coroutine()) {
 /* Fast-path if already in coroutine context */
-bdrv_discard_co_entry(&rwco);
+bdrv_pdiscard_co_entry(&rwco);
 } else {
 AioContext *aio_context = bdrv_get_aio_context(bs);

-co = qemu_coroutine_create(bdrv_discard_co_entry, &rwco);
+co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
 qemu_coroutine_enter(co);
 while (rwco.ret == NOT_DONE) {
 aio_poll(aio_context, true);
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 49b6ce6..cbfb3fe 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -615,9 +615,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret)

 /* Discard is optional, ignore the return value */
 if (ret >= 0) {
-bdrv_discard(bs->file->bs,
- d->offset >> BDRV_SECTOR_BITS,
- d->bytes >> BDRV_SECTOR_BITS);
+bdrv_pdiscard(bs->file->bs, d->offset, d->bytes);
 }

 g_free(d);
-- 
2.5.5




[Qemu-devel] [PATCH v2 07/19] rbd: Switch rbd_start_aio() to byte-based

2016-07-15 Thread Eric Blake
The internal function converts to byte-based before calling into
RBD code; hoist the conversion to the callers so that callers
can then be switched to byte-based themselves.
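
The hoisting itself is mechanical; a compressed before/after sketch
(stand-in function, not the rbd code):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BDRV_SECTOR_BITS 9

    /* After: the helper takes bytes; it no longer multiplies internally. */
    static void start_io(int64_t off, int64_t size)
    {
        printf("io at %" PRId64 ", %" PRId64 " bytes\n", off, size);
    }

    int main(void)
    {
        int64_t sector_num = 8, nb_sectors = 16;

        /* Before: start_io(sector_num, nb_sectors) converted inside.
         * After: each caller shifts, and can later drop the shift entirely. */
        start_io(sector_num << BDRV_SECTOR_BITS, nb_sectors << BDRV_SECTOR_BITS);
        return 0;
    }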

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/rbd.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 0a5840d..01cbb63 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -649,9 +649,9 @@ static int rbd_aio_flush_wrapper(rbd_image_t image,
 }

 static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
- int64_t sector_num,
+ int64_t off,
  QEMUIOVector *qiov,
- int nb_sectors,
+ int64_t size,
  BlockCompletionFunc *cb,
  void *opaque,
  RBDAIOCmd cmd)
@@ -659,7 +659,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
 RBDAIOCB *acb;
 RADOSCB *rcb = NULL;
 rbd_completion_t c;
-int64_t off, size;
 char *buf;
 int r;

@@ -668,6 +667,7 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,
 acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque);
 acb->cmd = cmd;
 acb->qiov = qiov;
+assert(!qiov || qiov->size == size);
 if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) {
 acb->bounce = NULL;
 } else {
@@ -687,9 +687,6 @@ static BlockAIOCB *rbd_start_aio(BlockDriverState *bs,

 buf = acb->bounce;

-off = sector_num * BDRV_SECTOR_SIZE;
-size = nb_sectors * BDRV_SECTOR_SIZE;
-
 rcb = g_new(RADOSCB, 1);
 rcb->acb = acb;
 rcb->buf = buf;
@@ -739,7 +736,8 @@ static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs,
   BlockCompletionFunc *cb,
   void *opaque)
 {
-return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
+return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
+ nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
  RBD_AIO_READ);
 }

@@ -750,7 +748,8 @@ static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs,
BlockCompletionFunc *cb,
void *opaque)
 {
-return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque,
+return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, qiov,
+ nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
  RBD_AIO_WRITE);
 }

@@ -937,7 +936,8 @@ static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs,
 BlockCompletionFunc *cb,
 void *opaque)
 {
-return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque,
+return rbd_start_aio(bs, sector_num << BDRV_SECTOR_BITS, NULL,
+ nb_sectors << BDRV_SECTOR_BITS, cb, opaque,
  RBD_AIO_DISCARD);
 }
 #endif
-- 
2.5.5
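
A side note on the new assert in rbd_start_aio(): once size is a byte count, it
has to agree exactly with the iovec payload for data-carrying requests. A toy
model of that invariant (QEMUIOVector reduced to one field here, purely for
illustration):

#include <assert.h>
#include <stddef.h>

typedef struct {
    size_t size;   /* stand-in for the sum of all iovec lengths, in bytes */
} QEMUIOVector;

static void start_aio(const QEMUIOVector *qiov, size_t size)
{
    /* same shape as the new assert: data requests must agree with qiov */
    assert(!qiov || qiov->size == size);
}

int main(void)
{
    QEMUIOVector qiov = { .size = 4096 };

    start_aio(&qiov, 4096);   /* read/write: sizes agree */
    start_aio(NULL, 4096);    /* discard/flush: no data payload */
    return 0;
}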




[Qemu-devel] [PATCH for-2.7 v2 00/19] byte-based block discard

2016-07-15 Thread Eric Blake
Allow NBD to pass a byte-aligned discard request over the wire.

Prerequisite: Kevin's block branch merged with current qemu.git master,
plus my work on auto-fragmenting (v3 at the moment):
https://lists.gnu.org/archive/html/qemu-devel/2016-07/msg03550.html

Also available as a tag at:
git fetch git://repo.or.cz/qemu/ericb.git nbd-discard-v2

This is a merge of two series both posted before soft freeze, both
of which had initial positive review with only a few changes needed:
byte-based block discard
https://lists.gnu.org/archive/html/qemu-devel/2016-06/msg06491.html
Switch raw NBD to byte-based
https://lists.gnu.org/archive/html/qemu-devel/2016-06/msg07030.html

I'm still hoping for one other series to make it into 2.7, based
on being posted prior to soft freeze:
nbd: efficient write zeroes
https://lists.gnu.org/archive/html/qemu-devel/2016-06/msg07463.html

Changes since v1:
- rebase to latest master (several contextual conflicts)
- fix bug with aio_pdiscard using a stack variable qiov after
its scope ends
- tweak raw_bsd to remain sector-based if probing occurred

001/19:[] [--] 'block: Convert bdrv_co_discard() to byte-based'
002/19:[0004] [FC] 'block: Convert bdrv_discard() to byte-based'
003/19:[0034] [FC] 'block: Switch BlockRequest to byte-based'
004/19:[0008] [FC] 'block: Convert bdrv_aio_discard() to byte-based'
005/19:[] [--] 'block: Convert BB interface to byte-based discards'
006/19:[] [--] 'raw-posix: Switch paio_submit() to byte-based'
007/19:[] [--] 'rbd: Switch rbd_start_aio() to byte-based'
008/19:[] [--] 'block: Convert .bdrv_aio_discard() to byte-based'
009/19:[] [--] 'block: Add .bdrv_co_pdiscard() driver callback'
010/19:[] [-C] 'blkreplay: Switch .bdrv_co_discard() to byte-based'
011/19:[] [--] 'gluster: Switch .bdrv_co_discard() to byte-based'
012/19:[] [--] 'iscsi: Switch .bdrv_co_discard() to byte-based'
013/19:[] [--] 'nbd: Switch .bdrv_co_discard() to byte-based'
014/19:[] [--] 'qcow2: Switch .bdrv_co_discard() to byte-based'
015/19:[] [-C] 'raw_bsd: Switch .bdrv_co_discard() to byte-based'
016/19:[] [--] 'sheepdog: Switch .bdrv_co_discard() to byte-based'
017/19:[] [--] 'block: Kill .bdrv_co_discard()'
018/19:[] [--] 'nbd: Convert to byte-based interface'
019/19:[0028] [FC] 'raw_bsd: Convert to byte-based interface'

Eric Blake (19):
  block: Convert bdrv_co_discard() to byte-based
  block: Convert bdrv_discard() to byte-based
  block: Switch BlockRequest to byte-based
  block: Convert bdrv_aio_discard() to byte-based
  block: Convert BB interface to byte-based discards
  raw-posix: Switch paio_submit() to byte-based
  rbd: Switch rbd_start_aio() to byte-based
  block: Convert .bdrv_aio_discard() to byte-based
  block: Add .bdrv_co_pdiscard() driver callback
  blkreplay: Switch .bdrv_co_discard() to byte-based
  gluster: Switch .bdrv_co_discard() to byte-based
  iscsi: Switch .bdrv_co_discard() to byte-based
  nbd: Switch .bdrv_co_discard() to byte-based
  qcow2: Switch .bdrv_co_discard() to byte-based
  raw_bsd: Switch .bdrv_co_discard() to byte-based
  sheepdog: Switch .bdrv_co_discard() to byte-based
  block: Kill .bdrv_co_discard()
  nbd: Convert to byte-based interface
  raw_bsd: Convert to byte-based interface

 block/nbd-client.h |  11 ++-
 include/block/block.h  |  10 +--
 include/block/block_int.h  |   8 +--
 include/block/nbd.h|   1 -
 include/sysemu/block-backend.h |   9 ++-
 block/blkreplay.c  |   8 +--
 block/block-backend.c  |  22 +++---
 block/gluster.c|  14 ++--
 block/io.c | 154 -
 block/iscsi.c  |  18 +++--
 block/mirror.c |   5 +-
 block/nbd-client.c |  41 ++-
 block/nbd.c|  24 +++
 block/qcow2-refcount.c |   4 +-
 block/qcow2.c  |  10 +--
 block/raw-posix.c  |  24 +++
 block/raw-win32.c  |  19 ++---
 block/raw_bsd.c|  53 --
 block/rbd.c|  29 
 block/sheepdog.c   |  17 +++--
 hw/block/xen_disk.c|   7 +-
 hw/ide/core.c  |   6 +-
 hw/scsi/scsi-disk.c|   8 +--
 nbd/server.c   |  19 ++---
 qemu-io-cmds.c |   3 +-
 block/trace-events |   4 +-
 26 files changed, 260 insertions(+), 268 deletions(-)

-- 
2.5.5




[Qemu-devel] [PATCH v2 11/19] gluster: Switch .bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
Another step towards killing off sector-based block APIs.

Signed-off-by: Eric Blake 
Reviewed-by: Stefan Hajnoczi 
---
 block/gluster.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/block/gluster.c b/block/gluster.c
index 406c1e6..ef3b0de 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -724,14 +724,12 @@ error:
 }

 #ifdef CONFIG_GLUSTERFS_DISCARD
-static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors)
+static coroutine_fn int qemu_gluster_co_pdiscard(BlockDriverState *bs,
+ int64_t offset, int size)
 {
 int ret;
 GlusterAIOCB acb;
 BDRVGlusterState *s = bs->opaque;
-size_t size = nb_sectors * BDRV_SECTOR_SIZE;
-off_t offset = sector_num * BDRV_SECTOR_SIZE;

 acb.size = 0;
 acb.ret = 0;
@@ -976,7 +974,7 @@ static BlockDriver bdrv_gluster = {
 .bdrv_co_flush_to_disk= qemu_gluster_co_flush_to_disk,
 .bdrv_has_zero_init   = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-.bdrv_co_discard  = qemu_gluster_co_discard,
+.bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
 .bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
@@ -1004,7 +1002,7 @@ static BlockDriver bdrv_gluster_tcp = {
 .bdrv_co_flush_to_disk= qemu_gluster_co_flush_to_disk,
 .bdrv_has_zero_init   = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-.bdrv_co_discard  = qemu_gluster_co_discard,
+.bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
 .bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
@@ -1032,7 +1030,7 @@ static BlockDriver bdrv_gluster_unix = {
 .bdrv_co_flush_to_disk= qemu_gluster_co_flush_to_disk,
 .bdrv_has_zero_init   = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-.bdrv_co_discard  = qemu_gluster_co_discard,
+.bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
 .bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
@@ -1060,7 +1058,7 @@ static BlockDriver bdrv_gluster_rdma = {
 .bdrv_co_flush_to_disk= qemu_gluster_co_flush_to_disk,
 .bdrv_has_zero_init   = qemu_gluster_has_zero_init,
 #ifdef CONFIG_GLUSTERFS_DISCARD
-.bdrv_co_discard  = qemu_gluster_co_discard,
+.bdrv_co_pdiscard = qemu_gluster_co_pdiscard,
 #endif
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
 .bdrv_co_pwrite_zeroes= qemu_gluster_co_pwrite_zeroes,
-- 
2.5.5




[Qemu-devel] [Bug 1603580] [NEW] [gdbstub] qemu is killed when using remote debugger with qemu -S -s

2016-07-15 Thread Alon Bar-Lev
Public bug reported:

Hello,

REPRODUCE

$ qemu-system-x86_64 -s -S -nographic

QEMU: Terminated via GDBStub

$ gdb
(gdb) target remote :1234
(gdb) load /bin/ls
(gdb) target exec
A program is being debugged already. Kill it? (y or n) y
No executable file now.

EXPECTED

Enable a program to be executed without terminating QEMU.

DISCUSSION

This was already discussed in [1] and reverted in [2]; however, no solution
is provided.

It worked perfectly in the past, I guess because of [1] and before [3].

Opening a bug for this, as the discussion on the mailing list did not attract
anyone and the functionality is required. If there is another gdb sequence to
achieve the same result, it would be great to get it documented.

Thanks,

[1] 
http://git.qemu.org/?p=qemu.git;a=commitdiff;h=00e94dbc7fd0110b0555d59592b004333adfb4b8
[2] 
http://git.qemu.org/?p=qemu.git;a=commitdiff;h=ce0274f730eacbd24c706523ddbbabb6b95d0659
[3] 
http://git.qemu.org/?p=qemu.git;a=commitdiff;h=7d03f82f81e0e6c106ca0d2445a0fc49dc9ddc7b

** Affects: qemu
 Importance: Undecided
 Status: New

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1603580

Title:
  [gdbstub] qemu is killed when using remote debugger with qemu -S -s

Status in QEMU:
  New


To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1603580/+subscriptions



[Qemu-devel] [Bug 1490611] Re: Using qemu >=2.2.1 to convert raw->VHD (fixed) adds extra padding to the result file, which Microsoft Azure rejects as invalid

2016-07-15 Thread Nish Aravamudan
** Changed in: qemu (Ubuntu Xenial)
   Status: New => In Progress

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1490611

Title:
  Using qemu >=2.2.1 to convert raw->VHD (fixed) adds extra padding to
  the result file, which Microsoft Azure rejects as invalid

Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  Fix Released
Status in qemu source package in Xenial:
  In Progress

Bug description:
  Starting with a raw disk image, using "qemu-img convert" to convert
  from raw to VHD results in the output VHD file's virtual size being
  aligned to the nearest 516096 bytes (16 heads x 63 sectors per head x
  512 bytes per sector), instead of preserving the input file's size as
  the output VHD's virtual disk size.

  Microsoft Azure requires that disk images (VHDs) submitted for upload
  have virtual sizes aligned to a megabyte boundary. (Ex. 4096MB,
  4097MB, 4098MB, etc. are OK, 4096.5MB is rejected with an error.) This
  is reflected in Microsoft's documentation: https://azure.microsoft.com
  /en-us/documentation/articles/virtual-machines-linux-create-upload-
  vhd-generic/

  This is reproducible with the following set of commands (including the
  Azure command line tools from https://github.com/Azure/azure-xplat-
  cli). For the following example, I used qemu version 2.2.1:

  $ dd if=/dev/zero of=source-disk.img bs=1M count=4096

  $ stat source-disk.img 
File: ‘source-disk.img’
Size: 4294967296  Blocks: 798656 IO Block: 4096   regular file
  Device: fc01h/64513dInode: 13247963Links: 1
  Access: (0644/-rw-r--r--)  Uid: ( 1000/  smkent)   Gid: ( 1000/  smkent)
  Access: 2015-08-18 09:48:02.613988480 -0700
  Modify: 2015-08-18 09:48:02.825985646 -0700
  Change: 2015-08-18 09:48:02.825985646 -0700
   Birth: -

  $ qemu-img convert -f raw -o subformat=fixed -O vpc source-disk.img
  dest-disk.vhd

  $ stat dest-disk.vhd 
File: ‘dest-disk.vhd’
Size: 4296499712  Blocks: 535216 IO Block: 4096   regular file
  Device: fc01h/64513dInode: 13247964Links: 1
  Access: (0644/-rw-r--r--)  Uid: ( 1000/  smkent)   Gid: ( 1000/  smkent)
  Access: 2015-08-18 09:50:22.252077624 -0700
  Modify: 2015-08-18 09:49:24.424868868 -0700
  Change: 2015-08-18 09:49:24.424868868 -0700
   Birth: -

  $ azure vm image create testimage1 dest-disk.vhd -o linux -l "West US"
  info:Executing command vm image create
  + Retrieving storage accounts 
 
  info:VHD size : 4097 MB
  info:Uploading 4195800.5 KB
  Requested:100.0% Completed:100.0% Running:   0 Time: 1m 0s Speed:  6744 KB/s 
  info:https://[redacted].blob.core.windows.net/vm-images/dest-disk.vhd was uploaded successfully
  error:   The VHD https://[redacted].blob.core.windows.net/vm-images/dest-disk.vhd has an unsupported virtual size of 4296499200 bytes.  The size must be a whole number (in MBs).
  info:Error information has been recorded to /home/smkent/.azure/azure.err
  error:   vm image create command failed

  I also ran the above commands using qemu 2.4.0, which resulted in the
  same error as the conversion behavior is the same.

  However, qemu 2.1.1 and earlier (including qemu 2.0.0 installed by
  Ubuntu 14.04) do not pad the virtual disk size during conversion.
  Using qemu-img convert from qemu versions <=2.1.1 results in a VHD
  that is exactly the size of the raw input file plus 512 bytes (for the
  VHD footer). Those qemu versions do not attempt to realign the disk.
  As a result, Azure accepts VHD files created using those versions of
  qemu-img convert for upload.

  Is there a reason why newer qemu realigns the converted VHD file? It
  would be useful if an option were added to disable this feature, as
  current versions of qemu cannot be used to create VHD files for Azure
  using Microsoft's official instructions.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1490611/+subscriptions
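
For what it's worth, the quoted numbers check out: 16 x 63 x 512 = 516096, the
rejected virtual size is a whole number of such cylinders, the 512 extra bytes
in the file are the VHD footer, and the size is indeed not MiB-aligned. A quick
standalone check (not qemu code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t cyl = 16ULL * 63 * 512;      /* 516096-byte cylinder */
    const uint64_t virt = 4296499200ULL;        /* size Azure rejected */
    const uint64_t file = 4296499712ULL;        /* stat of dest-disk.vhd */

    printf("cylinder: %" PRIu64 " bytes\n", cyl);
    printf("virtual size: %" PRIu64 " cylinders, remainder %" PRIu64 "\n",
           virt / cyl, virt % cyl);               /* 8325 cylinders, 0 */
    printf("footer: %" PRIu64 " bytes\n", file - virt);          /* 512 */
    printf("MiB remainder: %" PRIu64 "\n", virt % (1024 * 1024)); /* 483328 */
    return 0;
}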



Re: [Qemu-devel] [RFC 2/2] linux-user: Fix cpu_index generation

2016-07-15 Thread Greg Kurz
On Thu, 14 Jul 2016 21:59:45 +1000
David Gibson  wrote:

> On Thu, Jul 14, 2016 at 03:50:56PM +0530, Bharata B Rao wrote:
> > On Thu, Jul 14, 2016 at 3:24 PM, Peter Maydell  
> > wrote:  
> > > On 14 July 2016 at 08:57, David Gibson  
> > > wrote:  
> > >> With CONFIG_USER_ONLY, generation of cpu_index values is done differently
> > >> than for full system targets.  This method turns out to be broken, since
> > >> it can fairly easily result in duplicate cpu_index values for
> > >> simultaneously active cpus (i.e. threads in the emulated process).
> > >>
> > >> Consider this sequence:
> > >> Create thread 1
> > >> Create thread 2
> > >> Exit thread 1
> > >> Create thread 3
> > >>
> > >> With the current logic thread 1 will get cpu_index 1, thread 2 will get
> > >> cpu_index 2 and thread 3 will also get cpu_index 2 (because there are 2
> > >> threads in the cpus list at the point of its creation).
> > >>
> > >> We mostly get away with this because cpu_index values aren't that 
> > >> important
> > >> for userspace emulation.  Still, it can't be good, so this patch fixes it
> > >> by making CONFIG_USER_ONLY use the same bitmap based allocation that full
> > >> system targets already use.
> > >>
> > >> Signed-off-by: David Gibson 
> > >> ---
> > >>  exec.c | 19 ---
> > >>  1 file changed, 19 deletions(-)
> > >>
> > >> diff --git a/exec.c b/exec.c
> > >> index 011babd..e410dab 100644
> > >> --- a/exec.c
> > >> +++ b/exec.c
> > >> @@ -596,7 +596,6 @@ AddressSpace *cpu_get_address_space(CPUState *cpu, 
> > >> int asidx)
> > >>  }
> > >>  #endif
> > >>
> > >> -#ifndef CONFIG_USER_ONLY
> > >>  static DECLARE_BITMAP(cpu_index_map, MAX_CPUMASK_BITS);
> > >>
> > >>  static int cpu_get_free_index(Error **errp)
> > >> @@ -617,24 +616,6 @@ static void cpu_release_index(CPUState *cpu)
> > >>  {
> > >>  bitmap_clear(cpu_index_map, cpu->cpu_index, 1);
> > >>  }
> > >> -#else
> > >> -
> > >> -static int cpu_get_free_index(Error **errp)
> > >> -{
> > >> -CPUState *some_cpu;
> > >> -int cpu_index = 0;
> > >> -
> > >> -CPU_FOREACH(some_cpu) {
> > >> -cpu_index++;
> > >> -}
> > >> -return cpu_index;
> > >> -}
> > >> -
> > >> -static void cpu_release_index(CPUState *cpu)
> > >> -{
> > >> -return;
> > >> -}
> > >> -#endif  
> > >
> > > Won't this change impose a maximum limit of 256 simultaneous
> > > threads? That seems a little low for comfort.  
> > 
> > This was the reason why the bitmap logic wasn't applied to
> > CONFIG_USER_ONLY when it was introduced.
> > 
> > https://lists.gnu.org/archive/html/qemu-devel/2015-05/msg01980.html  
> 
> Ah.. good point.
> 
> Hrm, ok, my next idea would be to just (globally) sequentially
> allocate cpu_index values for CONFIG_USER, and never try to re-use
> them.  Does that seem reasonable?
> 

Isn't it only deferring the problem to later?

Maybe it is possible to define MAX_CPUMASK_BITS to a much higher
value for CONFIG_USER only instead?

> > But then we didn't have actual removal, but we do now.  
> 
> You mean patch 1/2 in this set?  Or something else?
> 
> Even so, 256 does seem a bit low for the number of simultaneously active
> threads - there are some big hairy multi-threaded programs out there.
> 
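
To make the failure mode above concrete, a toy model of the count-based
allocator (not QEMU code; whether numbering starts at 0 or 1 does not change
the collision):

#include <stdio.h>

static int live_cpus;

static int cpu_get_free_index_buggy(void)
{
    return live_cpus;   /* models the CPU_FOREACH() counting loop */
}

int main(void)
{
    int t1 = cpu_get_free_index_buggy(); live_cpus++;  /* thread 1 */
    int t2 = cpu_get_free_index_buggy(); live_cpus++;  /* thread 2 */
    live_cpus--;                                       /* thread 1 exits */
    int t3 = cpu_get_free_index_buggy(); live_cpus++;  /* thread 3 */

    printf("t1=%d t2=%d t3=%d\n", t1, t2, t3);  /* t2 == t3: duplicate */
    return 0;
}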





Re: [Qemu-devel] [PATCH 01/17] block: Convert bdrv_co_discard() to byte-based

2016-07-15 Thread Eric Blake
On 06/22/2016 09:50 AM, Eric Blake wrote:
> Another step towards byte-based interfaces everywhere.  Replace
> the sector-based bdrv_co_discard() with a new byte-based
> bdrv_co_pdiscard(), which silently ignores any unaligned head
> or tail.  Driver callbacks will be converted in followup patches.
> 
> By calculating the alignment outside of the loop, and clamping
> the max discard to an aligned value, we can simplify the actions
> done within the loop.
> 
> Signed-off-by: Eric Blake 
> ---

> +++ b/block/io.c

> +int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
> +  int count)

> -tracked_request_begin(&req, bs, sector_num << BDRV_SECTOR_BITS,
> -  nb_sectors << BDRV_SECTOR_BITS, 
> BDRV_TRACKED_DISCARD);
> +/* Discard is advisory, so ignore any unaligned head or tail */
> +align = MAX(BDRV_SECTOR_SIZE,
> +MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment));
> +assert(is_power_of_2(align));

And this further ingrains the already existing problem with iscsi
devices that advertise 15M for discard alignment.  Is it more important
for me to fix that regression first and then base this series on top of
that, or is it acceptable for me to repost this series and then work on
the regression fix?  I'd prefer the latter course of action, as a
regression fix is good material for post-hardfreeze.

-- 
Eric Blake   eblake redhat com   +1-919-301-3266
Libvirt virtualization library http://libvirt.org
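
For reference, 15M fails the power-of-two test, so such a device would hit the
assert above. A standalone check (is_power_of_2() rewritten here; QEMU's helper
is equivalent):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_power_of_2(uint64_t value)
{
    return value && !(value & (value - 1));
}

int main(void)
{
    uint64_t align = 15 * 1024 * 1024;   /* 15 MiB = 15728640 */
    printf("15M is a power of two? %s\n",
           is_power_of_2(align) ? "yes" : "no");   /* no */
    return 0;
}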





Re: [Qemu-devel] QOM: best way for parents to pass information to children? (was Re: [PATCH RFC 07/16] qom/cpu: make nr-cores, nr-threads real properties)

2016-07-15 Thread Eduardo Habkost
On Fri, Jul 15, 2016 at 08:38:35PM +0200, Igor Mammedov wrote:
> On Fri, 15 Jul 2016 14:43:53 -0300
> Eduardo Habkost  wrote:
> > On Fri, Jul 15, 2016 at 06:30:41PM +0200, Andreas Färber wrote:
> > > Am 15.07.2016 um 18:10 schrieb Eduardo Habkost:
> > > > On Fri, Jul 15, 2016 at 11:11:38AM +0200, Igor Mammedov wrote:
> > > >> On Fri, 15 Jul 2016 08:35:30 +0200
> > > >> Andrew Jones  wrote:
> > > >>> On Thu, Jul 14, 2016 at 05:07:43PM -0300, Eduardo Habkost wrote:
> > > 
> > >  First of all, sorry for the horrible delay in replying to this
> > >  thread.
> > > 
> > >  On Wed, Jun 15, 2016 at 10:56:20AM +1000, David Gibson wrote:  
> > > > On Tue, Jun 14, 2016 at 08:19:49AM +0200, Andrew Jones
> > > > wrote:  
> > > >> On Tue, Jun 14, 2016 at 12:12:16PM +1000, David Gibson
> > > >> wrote:  
> > > >>> On Sun, Jun 12, 2016 at 03:48:10PM +0200, Andrew Jones
> > > >>> wrote:  
> > > > [...]
> > > >> +static Property cpu_common_properties[] = {
> > > >> +DEFINE_PROP_INT32("nr-cores", CPUState, nr_cores,
> > > >> 1),
> > > >> +DEFINE_PROP_INT32("nr-threads", CPUState,
> > > >> nr_threads, 1),
> > > >> +DEFINE_PROP_END_OF_LIST()
> > > >> +};  
> > > >
> > > > Are you aware of the current CPU hotplug discussion that
> > > > is going on?  
> > > 
> > >  I'm aware of it going on, but haven't been following it.
> > >    
> > > > I'm not very involved there, but I think some of these
> > > > reworks also move "nr_threads" into the CPU state
> > > > already, e.g. see:  
> > > 
> > >  nr_threads (and nr_cores) are already state in CPUState.
> > >  This patch just exposes that state via properties.
> > >    
> > > >
> > > > https://github.com/dgibson/qemu/commit/9d07719784ecbeebea71
> > > >
> > > > ... so you might want to check these patches first to see
> > > > whether you can base your rework on them?  
> > > 
> > >  Every cpu, and thus every machine, uses CPUState for its
> > >  cpus. I'm not sure every machine will want to use that new
> > >  abstract core class though. If they did, then we could
> > >  indeed use nr_threads from there instead (and remove it
> > >  from CPUState), but we'd still need nr_cores from the
> > >  abstract cpu package class (CPUState).  
> > > >>>
> > > >>> Hmm.  Since the CPUState object represents just a single
> > > >>> thread, it seems weird to me that it would have nr_threads
> > > >>> and nr_cores information.  
> > > 
> > >  Agreed it is weird, and I think we should try to move it away
> > >  from CPUState, not make it part of the TYPE_CPU interface.
> > >  nr_threads belongs to the actual container of the Thread
> > >  objects, and nr_cores in the actual container of the Core
> > >  objects.
> > > 
> > >  The problem is how to implement that in a non-intrusive way
> > >  that would require changing the object hierarchy of all
> > >  architectures.
> > > 
> > >    
> > > >>>
> > > >>> Exposing those as properties makes that much worse, because
> > > >>> it's now ABI, rather than internal detail we can clean up
> > > >>> at some future time.  
> > > >>
> > > >> CPUState is supposed to be "State of one CPU core or
> > > >> thread", which justifies having nr_threads state, as it may
> > > >> be describing a core.  
> > > >
> > > > Um.. does it ever actually represent a (multithread) core in
> > > > practice? It would need to have duplicated register state for
> > > > every thread were that the case.  
> > > 
> > >  AFAIK, CPUState is still always thread state. Or has this
> > >  changed in some architectures, already?
> > >    
> > > >   
> > > >> I guess there's no justification for having nr_cores in
> > > >> there though. I agree adding the Core class is a good idea,
> > > >> assuming it will get used by all machines, and CPUState then
> > > >> gets changed to a Thread class. The question then, though,
> > > >> is do we also create a Socket class that contains nr_cores?  
> > > >
> > > > That was roughly our intention with the way the cross
> > > > platform hotplug stuff is evolving.  But the intention was
> > > > that the Socket objects would only need to be constructed for
> > > > machine types where it makes sense.  So for example on the
> > > > paravirt pseries platform, we'll only have Core objects,
> > > > because the socket distinction isn't really meaningful.
> > > >   
> > > >> And how will a Thread method get that information when it
> > > >> needs to emulate, e.g. CPUID, that requires it? It's a bit
> > > >> messy, so I'm open to all suggestions on it.  
> > > >
> > > >>

Re: [Qemu-devel] [PATCH] cpu-exec: Move down some declarations in cpu_exec()

2016-07-15 Thread Stefan Weil
On 15.07.2016 at 21:31, Sergey Fedorov wrote:
> From: Sergey Fedorov 
> 
> This will fix a compiler warning with -Wclobbered:
> 
> http://lists.nongnu.org/archive/html/qemu-devel/2016-07/msg03347.html
> 
> Reported-by: Stefan Weil 
> Signed-off-by: Sergey Fedorov 
> Signed-off-by: Sergey Fedorov 
> ---
>  cpu-exec.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/cpu-exec.c b/cpu-exec.c
> index b840e1d2dd41..5d9710a1eaf2 100644
> --- a/cpu-exec.c
> +++ b/cpu-exec.c
> @@ -608,17 +608,16 @@ int cpu_exec(CPUState *cpu)
>  init_delay_params(&sc, cpu);
>  
>  for(;;) {
> -TranslationBlock *tb, *last_tb;
> -int tb_exit = 0;
> -
>  /* prepare setjmp context for exception handling */
>  if (sigsetjmp(cpu->jmp_env, 0) == 0) {
> +TranslationBlock *tb, *last_tb = NULL;
> +int tb_exit = 0;
> +
>  /* if an exception is pending, we execute it here */
>  if (cpu_handle_exception(cpu, &ret)) {
>  break;
>  }
>  
> -last_tb = NULL; /* forget the last executed TB after exception */
>  cpu->tb_flushed = false; /* reset before first TB lookup */
>  for(;;) {
>  cpu_handle_interrupt(cpu, &last_tb);
> 

Reviewed-by: Stefan Weil 

Thanks.
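
The -Wclobbered issue being fixed is easy to reproduce in isolation: a local
that lives across sigsetjmp() and is modified before a longjmp has
indeterminate value afterwards, while one declared inside the
sigsetjmp() == 0 branch is safe. A toy of the fixed shape (not QEMU code):

#include <setjmp.h>
#include <stdio.h>

static sigjmp_buf env;

int main(void)
{
    if (sigsetjmp(env, 0) == 0) {
        int tb_exit = 1;   /* lives only inside this branch: cannot be
                            * clobbered by the longjmp below */
        printf("first pass, tb_exit=%d\n", tb_exit);
        siglongjmp(env, 1);
    } else {
        printf("back after siglongjmp\n");
    }
    return 0;
}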






[Qemu-devel] [PATCH v6 09/10] tests: Add test code for hbitmap serialization

2016-07-15 Thread John Snow
From: Fam Zheng 

Acked-by: John Snow 
Signed-off-by: Fam Zheng 
Signed-off-by: John Snow 
---
 tests/test-hbitmap.c | 139 +++
 1 file changed, 139 insertions(+)

diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index e3abde1..18e8114 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -11,6 +11,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/hbitmap.h"
+#include "qemu/bitmap.h"
 #include "block/block.h"
 
 #define LOG_BITS_PER_LONG  (BITS_PER_LONG == 32 ? 5 : 6)
@@ -737,6 +738,16 @@ static void test_hbitmap_meta_one(TestHBitmapData *data, const void *unused)
 }
 }
 
+static void test_hbitmap_serialize_granularity(TestHBitmapData *data,
+   const void *unused)
+{
+int r;
+
+hbitmap_test_init(data, L3 * 2, 3);
+r = hbitmap_serialization_granularity(data->hb);
+g_assert_cmpint(r, ==, 64 << 3);
+}
+
 static void test_hbitmap_meta_zero(TestHBitmapData *data, const void *unused)
 {
 hbitmap_test_init_meta(data, 0, 0, 1);
@@ -744,6 +755,125 @@ static void test_hbitmap_meta_zero(TestHBitmapData *data, const void *unused)
 hbitmap_check_meta(data, 0, 0);
 }
 
+static void hbitmap_test_serialize_range(TestHBitmapData *data,
+ uint8_t *buf, size_t buf_size,
+ uint64_t pos, uint64_t count)
+{
+size_t i;
+
+assert(hbitmap_granularity(data->hb) == 0);
+hbitmap_reset_all(data->hb);
+memset(buf, 0, buf_size);
+if (count) {
+hbitmap_set(data->hb, pos, count);
+}
+hbitmap_serialize_part(data->hb, buf, 0, data->size);
+for (i = 0; i < data->size; i++) {
+int is_set = test_bit(i, (unsigned long *)buf);
+if (i >= pos && i < pos + count) {
+g_assert(is_set);
+} else {
+g_assert(!is_set);
+}
+}
+hbitmap_reset_all(data->hb);
+hbitmap_deserialize_part(data->hb, buf, 0, data->size, true);
+
+for (i = 0; i < data->size; i++) {
+int is_set = hbitmap_get(data->hb, i);
+if (i >= pos && i < pos + count) {
+g_assert(is_set);
+} else {
+g_assert(!is_set);
+}
+}
+}
+
+static void test_hbitmap_serialize_basic(TestHBitmapData *data,
+ const void *unused)
+{
+int i, j;
+size_t buf_size;
+uint8_t *buf;
+uint64_t positions[] = { 0, 1, L1 - 1, L1, L2 - 1, L2, L2 + 1, L3 - 1 };
+int num_positions = sizeof(positions) / sizeof(positions[0]);
+
+hbitmap_test_init(data, L3, 0);
+buf_size = hbitmap_serialization_size(data->hb, 0, data->size);
+buf = g_malloc0(buf_size);
+
+for (i = 0; i < num_positions; i++) {
+for (j = 0; j < num_positions; j++) {
+hbitmap_test_serialize_range(data, buf, buf_size,
+ positions[i],
+ MIN(positions[j], L3 - positions[i]));
+}
+}
+
+g_free(buf);
+}
+
+static void test_hbitmap_serialize_part(TestHBitmapData *data,
+const void *unused)
+{
+int i, j, k;
+size_t buf_size;
+uint8_t *buf;
+uint64_t positions[] = { 0, 1, L1 - 1, L1, L2 - 1, L2, L2 + 1, L3 - 1 };
+int num_positions = sizeof(positions) / sizeof(positions[0]);
+
+hbitmap_test_init(data, L3, 0);
+buf_size = L2;
+buf = g_malloc0(buf_size);
+
+for (i = 0; i < num_positions; i++) {
+hbitmap_set(data->hb, positions[i], 1);
+}
+
+for (i = 0; i < data->size; i += buf_size) {
+hbitmap_serialize_part(data->hb, buf, i, buf_size);
+for (j = 0; j < buf_size; j++) {
+bool should_set = false;
+for (k = 0; k < num_positions; k++) {
+if (positions[k] == j + i) {
+should_set = true;
+break;
+}
+}
+g_assert_cmpint(should_set, ==, test_bit(j, (unsigned long *)buf));
+}
+}
+
+g_free(buf);
+}
+
+static void test_hbitmap_serialize_zeroes(TestHBitmapData *data,
+  const void *unused)
+{
+int i;
+HBitmapIter iter;
+int64_t next;
+uint64_t positions[] = { 0, L1, L2, L3 - L1};
+int num_positions = sizeof(positions) / sizeof(positions[0]);
+
+hbitmap_test_init(data, L3, 0);
+
+for (i = 0; i < num_positions; i++) {
+hbitmap_set(data->hb, positions[i], L1);
+}
+
+for (i = 0; i < num_positions; i++) {
+hbitmap_deserialize_zeroes(data->hb, positions[i], L1, true);
+hbitmap_iter_init(&iter, data->hb, 0);
+next = hbitmap_iter_next(&iter);
+if (i == num_positions - 1) {
+g_assert_cmpint(next, ==, -1);
+} else {
+g_assert_cmpint(next, ==, positions[i + 1]);
+}
+}
+}
+
 static void hbitmap_test_add(const char

[Qemu-devel] [PATCH v6 10/10] block: More operations for meta dirty bitmap

2016-07-15 Thread John Snow
From: Fam Zheng 

Callers can create an iterator of the meta bitmap with
bdrv_dirty_meta_iter_new(), then use the bdrv_dirty_iter_* operations on
it. Meta iterators are also counted by bitmap->active_iterators.

Also add a couple of functions to retrieve granularity and count.

Signed-off-by: Fam Zheng 
Reviewed-by: Max Reitz 
Signed-off-by: John Snow 
---
 block/dirty-bitmap.c | 19 +++
 include/block/dirty-bitmap.h |  3 +++
 2 files changed, 22 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 8113090..d94de7b 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -393,6 +393,11 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
 return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
 }
 
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap)
+{
+return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->meta);
+}
+
 BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
  uint64_t first_sector)
 {
@@ -403,6 +408,15 @@ BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
 return iter;
 }
 
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap)
+{
+BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1);
+hbitmap_iter_init(&iter->hbi, bitmap->meta, 0);
+iter->bitmap = bitmap;
+bitmap->active_iterators++;
+return iter;
+}
+
 void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
 {
 if (!iter) {
@@ -514,3 +528,8 @@ int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
 {
 return hbitmap_count(bitmap->bitmap);
 }
+
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap)
+{
+return hbitmap_count(bitmap->meta);
+}
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 40a09c0..3cbed02 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -30,6 +30,7 @@ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
 uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap);
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap);
 const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
@@ -47,12 +48,14 @@ int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
 void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
   BdrvDirtyBitmap *bitmap, int64_t sector,
   int nb_sectors);
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap);
 BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
  uint64_t first_sector);
 void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
 int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
 void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
 
 uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
-- 
2.7.4
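
A sketch of a hypothetical consumer of the new calls (walk_meta_changes() is
illustrative, not part of the series):

/* assumes "block/dirty-bitmap.h" is in scope */
static void walk_meta_changes(BdrvDirtyBitmap *bitmap)
{
    BdrvDirtyBitmapIter *iter = bdrv_dirty_meta_iter_new(bitmap);
    int64_t sector;

    while ((sector = bdrv_dirty_iter_next(iter)) != -1) {
        /* the dirty status around 'sector' has changed since the last
         * bdrv_dirty_bitmap_reset_meta() call */
    }
    bdrv_dirty_iter_free(iter);
}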




[Qemu-devel] [PATCH v6 08/10] block: BdrvDirtyBitmap serialization interface

2016-07-15 Thread John Snow
From: Vladimir Sementsov-Ogievskiy 

Several functions to provide necessary access to BdrvDirtyBitmap for
block-migration.c

Signed-off-by: Vladimir Sementsov-Ogievskiy 
[Add the "finish" parameters. - Fam]
Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 

Signed-off-by: John Snow 
---
 block/dirty-bitmap.c | 37 +
 include/block/dirty-bitmap.h | 14 ++
 2 files changed, 51 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 39e072a..8113090 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -453,6 +453,43 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
 hbitmap_free(tmp);
 }
 
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count)
+{
+return hbitmap_serialization_size(bitmap->bitmap, start, count);
+}
+
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap)
+{
+return hbitmap_serialization_granularity(bitmap->bitmap);
+}
+
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+  uint8_t *buf, uint64_t start,
+  uint64_t count)
+{
+hbitmap_serialize_part(bitmap->bitmap, buf, start, count);
+}
+
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+uint8_t *buf, uint64_t start,
+uint64_t count, bool finish)
+{
+hbitmap_deserialize_part(bitmap->bitmap, buf, start, count, finish);
+}
+
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count,
+  bool finish)
+{
+hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish);
+}
+
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap)
+{
+hbitmap_deserialize_finish(bitmap->bitmap);
+}
+
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
 int nr_sectors)
 {
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index caa4d82..40a09c0 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -55,4 +55,18 @@ void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
 
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count);
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+  uint8_t *buf, uint64_t start,
+  uint64_t count);
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+uint8_t *buf, uint64_t start,
+uint64_t count, bool finish);
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count,
+  bool finish);
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap);
+
 #endif
-- 
2.7.4
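
A sketch of the intended call sequence on each side (send_bitmap_chunk() and
recv_bitmap_chunk() are hypothetical migration-style callers, not part of the
patch):

/* assumes "block/dirty-bitmap.h" is in scope */
static void send_bitmap_chunk(BdrvDirtyBitmap *bitmap, uint8_t *buf,
                              uint64_t start, uint64_t count)
{
    /* buf must hold bdrv_dirty_bitmap_serialization_size(bitmap, start,
     * count) bytes; start and count respect
     * bdrv_dirty_bitmap_serialization_align(bitmap) */
    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start, count);
    /* ... transmit start, count and buf ... */
}

static void recv_bitmap_chunk(BdrvDirtyBitmap *bitmap, uint8_t *buf,
                              uint64_t start, uint64_t count, bool last)
{
    /* finish=true only with the final chunk, which repairs the bitmap */
    bdrv_dirty_bitmap_deserialize_part(bitmap, buf, start, count, last);
}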




[Qemu-devel] [PATCH v6 01/10] block: Hide HBitmap in block dirty bitmap interface

2016-07-15 Thread John Snow
From: Fam Zheng 

HBitmap is an implementation detail of the block dirty bitmap that should be
hidden from users. Introduce a BdrvDirtyBitmapIter to encapsulate the
underlying HBitmapIter.

A small difference in the interface: before, an HBitmapIter was initialized in
place; now the new BdrvDirtyBitmapIter must be dynamically allocated because
the structure definition is private to block/dirty-bitmap.c.

Two current users are converted too.

Signed-off-by: Fam Zheng 
Reviewed-by: Max Reitz 
Signed-off-by: John Snow 
---
 block/backup.c   | 14 --
 block/dirty-bitmap.c | 39 +--
 block/mirror.c   | 24 +---
 include/block/dirty-bitmap.h |  7 +--
 include/qemu/typedefs.h  |  1 +
 5 files changed, 60 insertions(+), 25 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 2c05323..fc0fccf 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -325,14 +325,14 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
 int64_t end;
 int64_t last_cluster = -1;
 int64_t sectors_per_cluster = cluster_size_sectors(job);
-HBitmapIter hbi;
+BdrvDirtyBitmapIter *dbi;
 
 granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
 clusters_per_iter = MAX((granularity / job->cluster_size), 1);
-bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
+dbi = bdrv_dirty_iter_new(job->sync_bitmap, 0);
 
 /* Find the next dirty sector(s) */
-while ((sector = hbitmap_iter_next(&hbi)) != -1) {
+while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {
 cluster = sector / sectors_per_cluster;
 
 /* Fake progress updates for any clusters we skipped */
@@ -344,7 +344,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
 for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
 do {
 if (yield_and_check(job)) {
-return ret;
+goto out;
 }
 ret = backup_do_cow(job, cluster * sectors_per_cluster,
 sectors_per_cluster, &error_is_read,
@@ -352,7 +352,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
 if ((ret < 0) &&
 backup_error_action(job, error_is_read, -ret) ==
 BLOCK_ERROR_ACTION_REPORT) {
-return ret;
+goto out;
 }
 } while (ret < 0);
 }
@@ -360,7 +360,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
 /* If the bitmap granularity is smaller than the backup granularity,
  * we need to advance the iterator pointer to the next cluster. */
 if (granularity < job->cluster_size) {
-bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
+bdrv_set_dirty_iter(dbi, cluster * sectors_per_cluster);
 }
 
 last_cluster = cluster - 1;
@@ -372,6 +372,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
 job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
 }
 
+out:
+bdrv_dirty_iter_free(dbi);
 return ret;
 }
 
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 4902ca5..628b77c 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -42,9 +42,15 @@ struct BdrvDirtyBitmap {
 char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap (Number of sectors) */
 bool disabled;  /* Bitmap is read-only */
+int active_iterators;   /* How many iterators are active */
 QLIST_ENTRY(BdrvDirtyBitmap) list;
 };
 
+struct BdrvDirtyBitmapIter {
+HBitmapIter hbi;
+BdrvDirtyBitmap *bitmap;
+};
+
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
 {
 BdrvDirtyBitmap *bm;
@@ -212,6 +218,7 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
 
 QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
 assert(!bdrv_dirty_bitmap_frozen(bitmap));
+assert(!bitmap->active_iterators);
 hbitmap_truncate(bitmap->bitmap, size);
 bitmap->size = size;
 }
@@ -224,6 +231,7 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
 BdrvDirtyBitmap *bm, *next;
 QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
 if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
+assert(!bm->active_iterators);
 assert(!bdrv_dirty_bitmap_frozen(bm));
 QLIST_REMOVE(bm, list);
 hbitmap_free(bm->bitmap);
@@ -320,9 +328,29 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
 return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
 }
 
-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBi

[Qemu-devel] [PATCH v6 05/10] block: Add two dirty bitmap getters

2016-07-15 Thread John Snow
From: Fam Zheng 

For dirty bitmap users to get the size and the name of a
BdrvDirtyBitmap.

Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
Reviewed-by: Max Reitz 
Signed-off-by: John Snow 
---
 block/dirty-bitmap.c | 10 ++
 include/block/dirty-bitmap.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 9c53c56..a71c9b7 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -154,6 +154,16 @@ void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
 hbitmap_reset(bitmap->meta, sector, nb_sectors);
 }
 
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap)
+{
+return bitmap->size;
+}
+
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap)
+{
+return bitmap->name;
+}
+
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 {
 return bitmap->successor;
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 50e0fca..caa4d82 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -32,6 +32,8 @@ uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap);
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap);
 DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap);
 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
int64_t sector);
-- 
2.7.4




[Qemu-devel] [PATCH v6 03/10] tests: Add test code for meta bitmap

2016-07-15 Thread John Snow
From: Fam Zheng 

Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
Reviewed-by: Max Reitz 
Signed-off-by: John Snow 
---
 tests/test-hbitmap.c | 116 +++
 1 file changed, 116 insertions(+)

diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index c0e9895..e3abde1 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -11,6 +11,7 @@
 
 #include "qemu/osdep.h"
 #include "qemu/hbitmap.h"
+#include "block/block.h"
 
 #define LOG_BITS_PER_LONG  (BITS_PER_LONG == 32 ? 5 : 6)
 
@@ -20,6 +21,7 @@
 
 typedef struct TestHBitmapData {
 HBitmap   *hb;
+HBitmap   *meta;
 unsigned long *bits;
 size_t size;
 size_t old_size;
@@ -91,6 +93,14 @@ static void hbitmap_test_init(TestHBitmapData *data,
 }
 }
 
+static void hbitmap_test_init_meta(TestHBitmapData *data,
+   uint64_t size, int granularity,
+   int meta_chunk)
+{
+hbitmap_test_init(data, size, granularity);
+data->meta = hbitmap_create_meta(data->hb, meta_chunk);
+}
+
 static inline size_t hbitmap_test_array_size(size_t bits)
 {
 size_t n = DIV_ROUND_UP(bits, BITS_PER_LONG);
@@ -133,6 +143,9 @@ static void hbitmap_test_teardown(TestHBitmapData *data,
   const void *unused)
 {
 if (data->hb) {
+if (data->meta) {
+hbitmap_free_meta(data->hb);
+}
 hbitmap_free(data->hb);
 data->hb = NULL;
 }
@@ -634,6 +647,103 @@ static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data,
 hbitmap_test_truncate(data, size, -diff, 0);
 }
 
+static void hbitmap_check_meta(TestHBitmapData *data,
+   int64_t start, int count)
+{
+int64_t i;
+
+for (i = 0; i < data->size; i++) {
+if (i >= start && i < start + count) {
+g_assert(hbitmap_get(data->meta, i));
+} else {
+g_assert(!hbitmap_get(data->meta, i));
+}
+}
+}
+
+static void hbitmap_test_meta(TestHBitmapData *data,
+  int64_t start, int count,
+  int64_t check_start, int check_count)
+{
+hbitmap_reset_all(data->hb);
+hbitmap_reset_all(data->meta);
+
+/* Test "unset" -> "unset" will not update meta. */
+hbitmap_reset(data->hb, start, count);
+hbitmap_check_meta(data, 0, 0);
+
+/* Test "unset" -> "set" will update meta */
+hbitmap_set(data->hb, start, count);
+hbitmap_check_meta(data, check_start, check_count);
+
+/* Test "set" -> "set" will not update meta */
+hbitmap_reset_all(data->meta);
+hbitmap_set(data->hb, start, count);
+hbitmap_check_meta(data, 0, 0);
+
+/* Test "set" -> "unset" will update meta */
+hbitmap_reset_all(data->meta);
+hbitmap_reset(data->hb, start, count);
+hbitmap_check_meta(data, check_start, check_count);
+}
+
+static void hbitmap_test_meta_do(TestHBitmapData *data, int chunk_size)
+{
+uint64_t size = chunk_size * 100;
+hbitmap_test_init_meta(data, size, 0, chunk_size);
+
+hbitmap_test_meta(data, 0, 1, 0, chunk_size);
+hbitmap_test_meta(data, 0, chunk_size, 0, chunk_size);
+hbitmap_test_meta(data, chunk_size - 1, 1, 0, chunk_size);
+hbitmap_test_meta(data, chunk_size - 1, 2, 0, chunk_size * 2);
+hbitmap_test_meta(data, chunk_size - 1, chunk_size + 1, 0, chunk_size * 2);
+hbitmap_test_meta(data, chunk_size - 1, chunk_size + 2, 0, chunk_size * 3);
+hbitmap_test_meta(data, 7 * chunk_size - 1, chunk_size + 2,
+  6 * chunk_size, chunk_size * 3);
+hbitmap_test_meta(data, size - 1, 1, size - chunk_size, chunk_size);
+hbitmap_test_meta(data, 0, size, 0, size);
+}
+
+static void test_hbitmap_meta_byte(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_meta_do(data, BITS_PER_BYTE);
+}
+
+static void test_hbitmap_meta_word(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_meta_do(data, BITS_PER_LONG);
+}
+
+static void test_hbitmap_meta_sector(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_meta_do(data, BDRV_SECTOR_SIZE * BITS_PER_BYTE);
+}
+
+/**
+ * Create an HBitmap and test set/unset.
+ */
+static void test_hbitmap_meta_one(TestHBitmapData *data, const void *unused)
+{
+int i;
+int64_t offsets[] = {
+0, 1, L1 - 1, L1, L1 + 1, L2 - 1, L2, L2 + 1, L3 - 1, L3, L3 + 1
+};
+
+hbitmap_test_init_meta(data, L3 * 2, 0, 1);
+for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+hbitmap_test_meta(data, offsets[i], 1, offsets[i], 1);
+hbitmap_test_meta(data, offsets[i], L1, offsets[i], L1);
+hbitmap_test_meta(data, offsets[i], L2, offsets[i], L2);
+}
+}
+
+static void test_hbitmap_meta_zero(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_init_meta(data, 0, 0, 1);
+
+hbitmap_check_meta(data, 0, 0);
+}
+
 static void hbitmap_test_add(const char *testpath,
 

[Qemu-devel] [PATCH v6 07/10] hbitmap: serialization

2016-07-15 Thread John Snow
From: Vladimir Sementsov-Ogievskiy 

Functions to serialize / deserialize (restore) an HBitmap. The HBitmap should
be saved as a linear sequence of bits independently of endianness and of the
bitmap array element (unsigned long) size; therefore little endian is chosen.

These functions are appropriate for dirty bitmap migration, and the bitmap can
be restored in several steps. For performance, every step writes only the last
level of the bitmap. All other levels are restored by
hbitmap_deserialize_finish() as the last step of restoring, so the HBitmap is
inconsistent while being restored.
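
A sketch of the resulting restore sequence (ChunkSource and the chunk_*
helpers are hypothetical placeholders for a migration stream):

/* assumes "qemu/hbitmap.h" and <assert.h> are in scope */
static void restore_hbitmap(HBitmap *hb, ChunkSource *src)
{
    uint64_t gran = hbitmap_serialization_granularity(hb);
    uint64_t start = 0;

    while (chunk_available(src)) {
        /* a multiple of gran for every chunk except the last one */
        uint64_t count = chunk_bits(src);

        assert((start & (gran - 1)) == 0);
        hbitmap_deserialize_part(hb, chunk_data(src), start, count, false);
        start += count;
    }
    hbitmap_deserialize_finish(hb);   /* hb is inconsistent until here */
}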

Signed-off-by: Vladimir Sementsov-Ogievskiy 
[Fix left shift operand to 1UL; add "finish" parameter. - Fam]
Signed-off-by: Fam Zheng 
Reviewed-by: Max Reitz 

Signed-off-by: John Snow 
---
 include/qemu/hbitmap.h |  79 
 util/hbitmap.c | 137 +
 2 files changed, 216 insertions(+)

diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 1725919..eb46475 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -146,6 +146,85 @@ void hbitmap_reset_all(HBitmap *hb);
 bool hbitmap_get(const HBitmap *hb, uint64_t item);
 
 /**
+ * hbitmap_serialization_granularity:
+ * @hb: HBitmap to operate on.
+ *
+ * Granularity of serialization chunks, used by other serialization functions.
+ * For every chunk:
+ * 1. Chunk start should be aligned to this granularity.
+ * 2. Chunk size should be aligned too, except for last chunk (for which
+ *  start + count == hb->size)
+ */
+uint64_t hbitmap_serialization_granularity(const HBitmap *hb);
+
+/**
+ * hbitmap_serialization_size:
+ * @hb: HBitmap to operate on.
+ * @start: Starting bit
+ * @count: Number of bits
+ *
+ * Return number of bytes hbitmap_(de)serialize_part needs
+ */
+uint64_t hbitmap_serialization_size(const HBitmap *hb,
+uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_serialize_part
+ * @hb: HBitmap to operate on.
+ * @buf: Buffer to store serialized bitmap.
+ * @start: First bit to store.
+ * @count: Number of bits to store.
+ *
+ * Stores HBitmap data corresponding to given region. The format of saved data
+ * is linear sequence of bits, so it can be used by hbitmap_deserialize_part
+ * independently of endianness and size of HBitmap level array elements
+ */
+void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf,
+uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_deserialize_part
+ * @hb: HBitmap to operate on.
+ * @buf: Buffer to restore bitmap data from.
+ * @start: First bit to restore.
+ * @count: Number of bits to restore.
+ * @finish: Whether to call hbitmap_deserialize_finish automatically.
+ *
+ * Restores HBitmap data corresponding to given region. The format is the same
+ * as for hbitmap_serialize_part.
+ *
+ * If @finish is false, caller must call hbitmap_deserialize_finish before using
+ * the bitmap.
+ */
+void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf,
+  uint64_t start, uint64_t count,
+  bool finish);
+
+/**
+ * hbitmap_deserialize_zeroes
+ * @hb: HBitmap to operate on.
+ * @start: First bit to restore.
+ * @count: Number of bits to restore.
+ * @finish: Whether to call hbitmap_deserialize_finish automatically.
+ *
+ * Fills the bitmap with zeroes.
+ *
+ * If @finish is false, caller must call hbitmap_deserialize_finish before using
+ * the bitmap.
+ */
+void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count,
+bool finish);
+
+/**
+ * hbitmap_deserialize_finish
+ * @hb: HBitmap to operate on.
+ *
+ * Repair HBitmap after calling hbitmap_deserialize_part. Actually, all HBitmap
+ * layers are restored here.
+ */
+void hbitmap_deserialize_finish(HBitmap *hb);
+
+/**
  * hbitmap_free:
  * @hb: HBitmap to operate on.
  *
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 5186500..fd4d327 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -398,6 +398,143 @@ bool hbitmap_get(const HBitmap *hb, uint64_t item)
 return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
 }
 
+uint64_t hbitmap_serialization_granularity(const HBitmap *hb)
+{
+/* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit
+ * hosts. */
+return 64 << hb->granularity;
+}
+
+/* Start should be aligned to serialization granularity, chunk size should be
+ * aligned to serialization granularity too, except for last chunk.
+ */
+static void serialization_chunk(const HBitmap *hb,
+uint64_t start, uint64_t count,
+unsigned long **first_el, size_t *el_count)
+{
+uint64_t last = start + count - 1;
+uint64_t gran = hbitmap_serialization_granularity(hb);
+
+assert((start & (gran - 1)) == 0);
+assert((last >> hb->granularity) < hb->size);
+if ((last >> hb->granularity) != hb->size - 1) {
+a

[Qemu-devel] [PATCH v6 06/10] block: Assert that bdrv_release_dirty_bitmap succeeded

2016-07-15 Thread John Snow
From: Fam Zheng 

We use a loop over bs->dirty_bitmaps to make sure the caller is
only releasing a bitmap owned by bs. Let's also assert that in this case
the caller is releasing a bitmap that does exist.

Signed-off-by: Fam Zheng 
Reviewed-by: Max Reitz 
Signed-off-by: John Snow 
---
 block/dirty-bitmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index a71c9b7..39e072a 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -305,6 +305,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
 }
 }
 }
+if (bitmap) {
+abort();
+}
 }
 
 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
-- 
2.7.4




[Qemu-devel] [PATCH v6 00/10] Dirty bitmap changes for migration/persistence work

2016-07-15 Thread John Snow
v6: Rebase.

02: Added documentation changes as suggested by Max.

v5: Rebase: first 5 patches from last revision are already merged.

Addressed Max's comments:

01: - "block.c" -> "block/dirty-bitmap.c" in commit message.
- "an BdrvDirtyBitmapIter" -> "an BdrvDirtyBitmapIter" in code comment.
- hbitmap_next => next_dirty as variable name.
- bdrv_dirty_iter_free()/bdrv_dirty_iter_new() pairs =>
  bdrv_set_dirty_iter.

02: Move the assert fix into 01.

04: Truncate the meta bitmap (done by hbitmap_truncate).

06: Add Max's r-b.

07: I left the memcpy vs cpu_to_le32/64w as is to pick up Max's r-b. That
could be improved on top if wanted.

10: Add Max's r-b.



For convenience, this branch is available at:
https://github.com/jnsnow/qemu.git branch meta-bitmap
https://github.com/jnsnow/qemu/tree/meta-bitmap

This version is tagged meta-bitmap-v6:
https://github.com/jnsnow/qemu/releases/tag/meta-bitmap-v6

Fam Zheng (8):
  block: Hide HBitmap in block dirty bitmap interface
  HBitmap: Introduce "meta" bitmap to track bit changes
  tests: Add test code for meta bitmap
  block: Support meta dirty bitmap
  block: Add two dirty bitmap getters
  block: Assert that bdrv_release_dirty_bitmap succeeded
  tests: Add test code for hbitmap serialization
  block: More operations for meta dirty bitmap

Vladimir Sementsov-Ogievskiy (2):
  hbitmap: serialization
  block: BdrvDirtyBitmap serialization interface

 block/backup.c   |  14 ++-
 block/dirty-bitmap.c | 160 ++-
 block/mirror.c   |  24 ++--
 include/block/dirty-bitmap.h |  35 +-
 include/qemu/hbitmap.h   | 100 +
 include/qemu/typedefs.h  |   1 +
 tests/test-hbitmap.c | 255 +++
 util/hbitmap.c   | 207 ---
 8 files changed, 756 insertions(+), 40 deletions(-)

-- 
2.7.4




[Qemu-devel] [PATCH v6 02/10] HBitmap: Introduce "meta" bitmap to track bit changes

2016-07-15 Thread John Snow
From: Fam Zheng 

Upon each bit toggle, the corresponding bit in the meta bitmap will be
set.
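
Hypothetical usage of the new API (sizes arbitrary; not part of the patch):

/* assumes "qemu/hbitmap.h" and <assert.h> are in scope */
static void meta_demo(void)
{
    HBitmap *hb = hbitmap_alloc(1 << 20, 0);
    HBitmap *meta = hbitmap_create_meta(hb, 64);  /* 1 meta bit / 64 bits */

    hbitmap_set(hb, 0, 1);           /* toggles bit 0 of hb ... */
    assert(hbitmap_get(meta, 0));    /* ... so its meta bit is now set */

    hbitmap_reset(meta, 0, 64);      /* acknowledge the tracked change */
    hbitmap_free_meta(hb);
    hbitmap_free(hb);
}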

Signed-off-by: Fam Zheng 
[Amended text inline. --js]

Signed-off-by: John Snow 
---
 include/qemu/hbitmap.h | 21 +++
 util/hbitmap.c | 70 +++---
 2 files changed, 76 insertions(+), 15 deletions(-)

diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 8ab721e..1725919 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -178,6 +178,27 @@ void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first);
  */
 unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi);
 
+/* hbitmap_create_meta:
+ * Create a "meta" hbitmap to track dirtiness of the bits in this HBitmap.
+ * The caller owns the created bitmap and must call hbitmap_free_meta(hb) to
+ * free it.
+ *
+ * Currently, we only guarantee that if a bit in the hbitmap is changed it
+ * will be reflected in the meta bitmap, but we do not yet guarantee the
+ * opposite.
+ *
+ * @hb: The HBitmap to operate on.
+ * @chunk_size: How many bits in @hb does one bit in the meta track.
+ */
+HBitmap *hbitmap_create_meta(HBitmap *hb, int chunk_size);
+
+/* hbitmap_free_meta:
+ * Free the meta bitmap of @hb.
+ *
+ * @hb: The HBitmap whose meta bitmap should be freed.
+ */
+void hbitmap_free_meta(HBitmap *hb);
+
 /**
  * hbitmap_iter_next:
  * @hbi: HBitmapIter to operate on.
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 99fd2ba..5186500 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -78,6 +78,9 @@ struct HBitmap {
  */
 int granularity;
 
+/* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */
+HBitmap *meta;
+
 /* A number of progressively less coarse bitmaps (i.e. level 0 is the
  * coarsest).  Each bit in level N represents a word in level N+1 that
  * has a set bit, except the last level where each bit represents the
@@ -209,25 +212,27 @@ static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
 }
 
 /* Setting starts at the last layer and propagates up if an element
- * changes from zero to non-zero.
+ * changes.
  */
 static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last)
 {
 unsigned long mask;
-bool changed;
+unsigned long old;
 
 assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
 assert(start <= last);
 
 mask = 2UL << (last & (BITS_PER_LONG - 1));
 mask -= 1UL << (start & (BITS_PER_LONG - 1));
-changed = (*elem == 0);
+old = *elem;
 *elem |= mask;
-return changed;
+return old != *elem;
 }
 
-/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
-static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_set_between(HBitmap *hb, int level, uint64_t start,
+   uint64_t last)
 {
 size_t pos = start >> BITS_PER_LEVEL;
 size_t lastpos = last >> BITS_PER_LEVEL;
@@ -256,23 +261,29 @@ static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last
 if (level > 0 && changed) {
 hb_set_between(hb, level - 1, pos, lastpos);
 }
+return changed;
 }
 
 void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
 {
 /* Compute range in the last layer.  */
+uint64_t first, n;
 uint64_t last = start + count - 1;
 
 trace_hbitmap_set(hb, start, count,
   start >> hb->granularity, last >> hb->granularity);
 
-start >>= hb->granularity;
+first = start >> hb->granularity;
 last >>= hb->granularity;
+assert(last < hb->size);
-count = last - start + 1;
-assert(last < hb->size);
+n = last - first + 1;
 
-hb->count += count - hb_count_between(hb, start, last);
-hb_set_between(hb, HBITMAP_LEVELS - 1, start, last);
+hb->count += n - hb_count_between(hb, first, last);
+if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+hb->meta) {
+hbitmap_set(hb->meta, start, count);
+}
 }
 
 /* Resetting works the other way round: propagate up if the new
@@ -293,8 +304,10 @@ static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t l
 return blanked;
 }
 
-/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
-static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t 
last)
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_reset_between(HBitmap *hb, int level, uint64_t start,
+ uint64_t last)
 {
 size_t pos = start >> BITS_PER_LEVEL;
 size_t lastpos = last >> BITS_PER_LEVEL;
@@ -337,22 +350,29 @@ static void hb_reset_between(HBitmap *hb, int level, 
uint64_t start, uint64_t la
 if (level 

[Qemu-devel] [PATCH v6 04/10] block: Support meta dirty bitmap

2016-07-15 Thread John Snow
From: Fam Zheng 

The added group of operations enables tracking of the changed bits in
the dirty bitmap.
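
As a usage sketch (hypothetical caller; the chunk size and the
'bs'/'sector'/'nb_sectors' values are assumptions for illustration, not
part of this patch):

    BdrvDirtyBitmap *bitmap;

    bitmap = bdrv_create_dirty_bitmap(bs, 65536, "bitmap0", &error_abort);

    /* One meta bit per 512 bytes of dirty bitmap data. */
    bdrv_create_meta_dirty_bitmap(bitmap, 512);

    /* ... writes dirty the bitmap, which in turn dirties the meta ... */

    if (bdrv_dirty_bitmap_get_meta(bs, bitmap, sector, nb_sectors)) {
        /* This chunk of bitmap data changed; consume it, then clear
         * the corresponding meta bits. */
        bdrv_dirty_bitmap_reset_meta(bs, bitmap, sector, nb_sectors);
    }

    bdrv_release_meta_dirty_bitmap(bitmap);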

Signed-off-by: Fam Zheng 
Reviewed-by: Max Reitz 
Signed-off-by: John Snow 
---
 block/dirty-bitmap.c | 52 
 include/block/dirty-bitmap.h |  9 
 2 files changed, 61 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 628b77c..9c53c56 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -38,6 +38,7 @@
  */
 struct BdrvDirtyBitmap {
 HBitmap *bitmap;/* Dirty sector bitmap implementation */
+HBitmap *meta;  /* Meta dirty bitmap */
 BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
 char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap (Number of sectors) */
@@ -103,6 +104,56 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState 
*bs,
 return bitmap;
 }
 
+/* bdrv_create_meta_dirty_bitmap
+ *
+ * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. I.e.
+ * when a dirty status bit in @bitmap is changed (either from reset to set or
+ * the other way around), its respective meta dirty bitmap bit will be marked
+ * dirty as well.
+ *
+ * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap.
+ * @chunk_size: how many bytes of bitmap data each bit in the meta bitmap
+ * tracks.
+ */
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+   int chunk_size)
+{
+assert(!bitmap->meta);
+bitmap->meta = hbitmap_create_meta(bitmap->bitmap,
+   chunk_size * BITS_PER_BYTE);
+}
+
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+assert(bitmap->meta);
+hbitmap_free_meta(bitmap->bitmap);
+bitmap->meta = NULL;
+}
+
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+   BdrvDirtyBitmap *bitmap, int64_t sector,
+   int nb_sectors)
+{
+uint64_t i;
+int gran = bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
+
+/* To optimize: we could make hbitmap check the range internally at a
+ * coarser level, or at least do it word by word. */
+for (i = sector; i < sector + nb_sectors; i += gran) {
+if (hbitmap_get(bitmap->meta, i)) {
+return true;
+}
+}
+return false;
+}
+
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+  BdrvDirtyBitmap *bitmap, int64_t sector,
+  int nb_sectors)
+{
+hbitmap_reset(bitmap->meta, sector, nb_sectors);
+}
+
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 {
 return bitmap->successor;
@@ -233,6 +284,7 @@ static void 
bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
 if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
 assert(!bm->active_iterators);
 assert(!bdrv_dirty_bitmap_frozen(bm));
+assert(!bm->meta);
 QLIST_REMOVE(bm, list);
 hbitmap_free(bm->bitmap);
 g_free(bm->name);
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 2ea601b..50e0fca 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -8,6 +8,9 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
   uint32_t granularity,
   const char *name,
   Error **errp);
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+   int chunk_size);
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
BdrvDirtyBitmap *bitmap,
Error **errp);
@@ -36,6 +39,12 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
int64_t cur_sector, int nr_sectors);
 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
  int64_t cur_sector, int nr_sectors);
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+   BdrvDirtyBitmap *bitmap, int64_t sector,
+   int nb_sectors);
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+  BdrvDirtyBitmap *bitmap, int64_t sector,
+  int nb_sectors);
 BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
  uint64_t first_sector);
 void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
-- 
2.7.4




Re: [Qemu-devel] [PULL 0/4] tlb fixes for self-modifying code

2016-07-15 Thread Hulin, Patrick - 0559 - MITLL
Hi all,

 

Just noticed this patch and wanted to leave a quick comment. The original
issue wasn't with cross-page writes - it was with cross-TB writes.
Cross-page writes become an issue once you reverse the order of the loop, so
that part of the patch is necessary. But someone might want to leave a note
in the code that there's still an issue when doing an unaligned write off
the front of a TB. The patch fixes the behavior of vanilla Windows 7 x64,
which is definitely an improvement, but a few real applications still
trigger the remaining issue (Photoshop is one, I believe).

 

-Patrick





Re: [Qemu-devel] [PATCH 4/5] cpu-exec: Move TB execution stuff out of cpu_exec()

2016-07-15 Thread Sergey Fedorov
On 15/07/16 09:45, Stefan Weil wrote:
> Hi,
>
> Am 11.05.2016 um 12:21 schrieb Sergey Fedorov:
> [...]
>>   int cpu_exec(CPUState *cpu)
>> @@ -516,8 +576,6 @@ int cpu_exec(CPUState *cpu)
>>  CPUArchState *env = &x86_cpu->env;
>>  #endif
>>  int ret;
>> -TranslationBlock *tb, *last_tb;
>> -int tb_exit = 0;
> Here tb_exit was set to 0 only once, ...
>
>>  SyncClocks sc;
>>  
>>  /* replay_interrupt may need current_cpu */
>> @@ -544,6 +602,9 @@ int cpu_exec(CPUState *cpu)
>>  init_delay_params(&sc, cpu);
>>  
>>  for(;;) {
>> +TranslationBlock *tb, *last_tb;
>> +int tb_exit = 0;
> ... while now it is zeroed in each iteration of the for loop.
> I'm not sure whether the new code is still correct.

That is okay because 'tb_exit' only makes sense when "last_tb != NULL".
But we always reset 'last_tb' in this loop:

last_tb = NULL; /* forget the last executed TB after exception */

>
> If it is, ...
>
>> +
>>  /* prepare setjmp context for exception handling */
>>  if (sigsetjmp(cpu->jmp_env, 0) == 0) {
> ... the declaration of tb_exit could also be done here, after the sigsetjmp.
> That would fix a compiler warning which I get when compiling with
> -Wclobbered:
>
>
> cpu-exec.c:603:13: warning: variable ‘tb_exit’ might be clobbered by
> ‘longjmp’ or ‘vfork’ [-Wclobbered]

I've sent the patch to fix this:

Message-Id: <20160715193123.28113-1-sergey.fedo...@linaro.org>

Thanks,
Sergey




[Qemu-devel] [PATCH] cpu-exec: Move down some declarations in cpu_exec()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

This will fix a compiler warning with -Wclobbered:

http://lists.nongnu.org/archive/html/qemu-devel/2016-07/msg03347.html
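
For readers unfamiliar with the warning: a local variable that is
modified between sigsetjmp() and siglongjmp(), and is still live after
the jump, has an indeterminate value unless declared volatile. A
standalone illustration (not QEMU code):

    #include <setjmp.h>
    #include <stdio.h>

    static sigjmp_buf env;

    int main(void)
    {
        int n = 0;          /* lives across sigsetjmp(): may be clobbered */

        if (sigsetjmp(env, 0) == 0) {
            n = 1;          /* modified between setjmp and longjmp */
            siglongjmp(env, 1);
        }
        /* Here n is indeterminate unless it is volatile.  Declaring it
         * inside the if-branch instead, as this patch does for 'tb_exit',
         * means no such value is ever read after the jump. */
        printf("%d\n", n);
        return 0;
    }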

Reported-by: Stefan Weil 
Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
---
 cpu-exec.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index b840e1d2dd41..5d9710a1eaf2 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -608,17 +608,16 @@ int cpu_exec(CPUState *cpu)
 init_delay_params(&sc, cpu);
 
 for(;;) {
-TranslationBlock *tb, *last_tb;
-int tb_exit = 0;
-
 /* prepare setjmp context for exception handling */
 if (sigsetjmp(cpu->jmp_env, 0) == 0) {
+TranslationBlock *tb, *last_tb = NULL;
+int tb_exit = 0;
+
 /* if an exception is pending, we execute it here */
 if (cpu_handle_exception(cpu, &ret)) {
 break;
 }
 
-last_tb = NULL; /* forget the last executed TB after exception */
 cpu->tb_flushed = false; /* reset before first TB lookup */
 for(;;) {
 cpu_handle_interrupt(cpu, &last_tb);
-- 
2.9.1




Re: [Qemu-devel] [PATCH v3 2/2] trace: [bsd-user] Commandline arguments to control tracing

2016-07-15 Thread Eric Blake
On 07/15/2016 11:08 AM, Lluís Vilanova wrote:
> Signed-off-by: Lluís Vilanova 
> ---
>  bsd-user/main.c |   16 
>  1 file changed, 16 insertions(+)
> 

> @@ -754,6 +760,8 @@ int main(int argc, char **argv)
>  
>  cpu_model = NULL;
>  
> +qemu_add_opts(&qemu_trace_opts);
> +
>  optind = 1;
>  for(;;) {

Optional, but since you are in the area, you could add the missing space
after 'for'.

>  if (optind >= argc)
> @@ -840,6 +848,9 @@ int main(int argc, char **argv)
>  singlestep = 1;
>  } else if (!strcmp(r, "strace")) {
>  do_strace = 1;
> +} else if (!strcmp(r, "trace")) {
> +g_free(trace_file);
> +trace_file = trace_opt_parse(optarg);
>  } else
>  {

and fix the 'else {' to be on one line.


-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org





[Qemu-devel] [PATCH v4 04/12] cpus: Wrap mutex used to protect CPU work

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

This will be useful for enabling CPU work in user-mode emulation.
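
The point of the wrapper is that each flavor of the binary can hand
run_on_cpu() the lock it actually sleeps on. A sketch of where this is
heading (the user-mode branch is an assumption based on the later
linux-user patches, not part of this diff):

    QemuMutex *qemu_get_cpu_work_mutex(void)
    {
    #ifdef CONFIG_USER_ONLY
        return &exclusive_lock;      /* user-mode emulation has no BQL */
    #else
        return &qemu_global_mutex;   /* system emulation: the BQL */
    #endif
    }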

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 cpus.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpus.c b/cpus.c
index 04687c85bcd4..f80ed2aeefdd 100644
--- a/cpus.c
+++ b/cpus.c
@@ -910,6 +910,11 @@ void qemu_init_cpu_loop(void)
 qemu_thread_get_self(&io_thread);
 }
 
+static QemuMutex *qemu_get_cpu_work_mutex(void)
+{
+return &qemu_global_mutex;
+}
+
 static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
 {
 qemu_mutex_lock(&cpu->work_mutex);
@@ -943,7 +948,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void 
*data)
 while (!atomic_mb_read(&wi.done)) {
 CPUState *self_cpu = current_cpu;
 
-qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
+qemu_cond_wait(&qemu_work_cond, qemu_get_cpu_work_mutex());
 current_cpu = self_cpu;
 }
 }
-- 
2.9.1




[Qemu-devel] [PATCH v4 12/12] tcg: Make tb_flush() thread safe

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Use async_safe_run_on_cpu() to make tb_flush() thread safe.

It can happen that multiple threads schedule safe work items to flush the
translation buffer. To keep statistics and debugging output sane, always
check whether the translation buffer has already been flushed.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 

---
Changes in v4:
 - check if flush has already been done by raced CPU work
Changes in v3:
 - 'tb_flushed' removed
Changes in v2:
 - stale comment about unsafe tb_flush() removed
---
 cpu-exec.c| 14 +-
 include/qom/cpu.h |  2 --
 translate-all.c   | 17 +++--
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index b840e1d2dd41..0b7614ffcc9b 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -203,20 +203,16 @@ static void cpu_exec_nocache(CPUState *cpu, int 
max_cycles,
  TranslationBlock *orig_tb, bool ignore_icount)
 {
 TranslationBlock *tb;
-bool old_tb_flushed;
 
 /* Should never happen.
We only end up here when an existing TB is too long.  */
 if (max_cycles > CF_COUNT_MASK)
 max_cycles = CF_COUNT_MASK;
 
-old_tb_flushed = cpu->tb_flushed;
-cpu->tb_flushed = false;
 tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
  max_cycles | CF_NOCACHE
  | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
-tb->orig_tb = cpu->tb_flushed ? NULL : orig_tb;
-cpu->tb_flushed |= old_tb_flushed;
+tb->orig_tb = orig_tb;
 /* execute the generated code */
 trace_exec_tb_nocache(tb, tb->pc);
 cpu_tb_exec(cpu, tb);
@@ -338,13 +334,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
  tb->flags != flags)) {
 tb = tb_find_slow(cpu, pc, cs_base, flags);
 }
-if (cpu->tb_flushed) {
-/* Ensure that no TB jump will be modified as the
- * translation buffer has been flushed.
- */
-*last_tb = NULL;
-cpu->tb_flushed = false;
-}
 #ifndef CONFIG_USER_ONLY
 /* We don't take care of direct jumps when address mapping changes in
  * system emulation. So it's not safe to make a direct jump to a TB
@@ -619,7 +608,6 @@ int cpu_exec(CPUState *cpu)
 }
 
 last_tb = NULL; /* forget the last executed TB after exception */
-cpu->tb_flushed = false; /* reset before first TB lookup */
 for(;;) {
 cpu_handle_interrupt(cpu, &last_tb);
 tb = tb_find_fast(cpu, &last_tb, tb_exit);
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index ab67bf2ba19f..9af4420d2bb5 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -258,7 +258,6 @@ struct qemu_work_item {
  * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU
  * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
  *   CPU and return to its top level loop.
- * @tb_flushed: Indicates the translation buffer has been flushed.
  * @singlestep_enabled: Flags for single-stepping.
  * @icount_extra: Instructions until next timer event.
  * @icount_decr: Number of cycles left, with interrupt flag in high bit.
@@ -310,7 +309,6 @@ struct CPUState {
 bool unplug;
 bool crash_occurred;
 bool exit_request;
-bool tb_flushed;
 uint32_t interrupt_request;
 int singlestep_enabled;
 int64_t icount_extra;
diff --git a/translate-all.c b/translate-all.c
index 0d47c1c0cf82..030273ee7b13 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -831,9 +831,11 @@ static void page_flush_tb(void)
 }
 
 /* flush all the translation blocks */
-/* XXX: tb_flush is currently not thread safe */
-void tb_flush(CPUState *cpu)
+static void do_tb_flush(CPUState *cpu, void *data)
 {
+if (tcg_ctx.tb_ctx.nb_tbs == 0) {
+return;
+}
 #if defined(DEBUG_FLUSH)
 printf("qemu: flush code_size=%ld nb_tbs=%d avg_tb_size=%ld\n",
(unsigned long)(tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer),
@@ -849,7 +851,6 @@ void tb_flush(CPUState *cpu)
 
 CPU_FOREACH(cpu) {
 memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
-cpu->tb_flushed = true;
 }
 
 qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
@@ -861,6 +862,11 @@ void tb_flush(CPUState *cpu)
 tcg_ctx.tb_ctx.tb_flush_count++;
 }
 
+void tb_flush(CPUState *cpu)
+{
+async_safe_run_on_cpu(cpu, do_tb_flush, NULL);
+}
+
 #ifdef DEBUG_TB_CHECK
 
 static void
@@ -1163,9 +1169,8 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
  buffer_overflow:
 /* flush must be done */
 tb_flush(cpu);
-/* cannot fail at this point */
-tb = tb_alloc(pc);
-assert(tb != NULL);
+mmap_unlock();
+cpu_loop_exit(cpu);
 }
 
 gen_code_buf = tcg_ctx.code_gen_ptr;
-- 
2.9.1




[Qemu-devel] [PATCH v4 07/12] linux-user: Rework exclusive operation mechanism

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

A single variable 'pending_cpus' was used both for counting currently
running CPUs and for signalling the pending exclusive operation request.

To prepare for supporting operations which require a quiescent state,
like translation buffer flush, it is useful to keep a counter of
currently running CPUs always up to date.

Use a separate variable 'tcg_pending_threads' to count currently
running CPUs and a separate variable 'exclusive_pending' to indicate
that there's an exclusive operation pending.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 

---
Changes in v2:
 - Rename 'tcg_pending_cpus' to 'tcg_pending_threads'
---
 linux-user/main.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index bdbda693cc5f..5ff0b20bad89 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -112,7 +112,8 @@ static QemuMutex cpu_list_mutex;
 static QemuMutex exclusive_lock;
 static QemuCond exclusive_cond;
 static QemuCond exclusive_resume;
-static int pending_cpus;
+static bool exclusive_pending;
+static int tcg_pending_threads;
 
 void qemu_init_cpu_loop(void)
 {
@@ -142,7 +143,8 @@ void fork_end(int child)
 QTAILQ_REMOVE(&cpus, cpu, node);
 }
 }
-pending_cpus = 0;
+tcg_pending_threads = 0;
+exclusive_pending = false;
 qemu_mutex_init(&exclusive_lock);
 qemu_mutex_init(&cpu_list_mutex);
 qemu_cond_init(&exclusive_cond);
@@ -159,7 +161,7 @@ void fork_end(int child)
must be held.  */
 static inline void exclusive_idle(void)
 {
-while (pending_cpus) {
+while (exclusive_pending) {
 qemu_cond_wait(&exclusive_resume, &exclusive_lock);
 }
 }
@@ -173,15 +175,14 @@ static inline void start_exclusive(void)
 qemu_mutex_lock(&exclusive_lock);
 exclusive_idle();
 
-pending_cpus = 1;
+exclusive_pending = true;
 /* Make all other cpus stop executing.  */
 CPU_FOREACH(other_cpu) {
 if (other_cpu->running) {
-pending_cpus++;
 cpu_exit(other_cpu);
 }
 }
-if (pending_cpus > 1) {
+while (tcg_pending_threads) {
 qemu_cond_wait(&exclusive_cond, &exclusive_lock);
 }
 }
@@ -189,7 +190,7 @@ static inline void start_exclusive(void)
 /* Finish an exclusive operation.  */
 static inline void __attribute__((unused)) end_exclusive(void)
 {
-pending_cpus = 0;
+exclusive_pending = false;
 qemu_cond_broadcast(&exclusive_resume);
 qemu_mutex_unlock(&exclusive_lock);
 }
@@ -200,6 +201,7 @@ static inline void cpu_exec_start(CPUState *cpu)
 qemu_mutex_lock(&exclusive_lock);
 exclusive_idle();
 cpu->running = true;
+tcg_pending_threads++;
 qemu_mutex_unlock(&exclusive_lock);
 }
 
@@ -208,11 +210,9 @@ static inline void cpu_exec_end(CPUState *cpu)
 {
 qemu_mutex_lock(&exclusive_lock);
 cpu->running = false;
-if (pending_cpus > 1) {
-pending_cpus--;
-if (pending_cpus == 1) {
-qemu_cond_signal(&exclusive_cond);
-}
+tcg_pending_threads--;
+if (!tcg_pending_threads) {
+qemu_cond_signal(&exclusive_cond);
 }
 exclusive_idle();
 qemu_mutex_unlock(&exclusive_lock);
-- 
2.9.1




[Qemu-devel] [PATCH v4 09/12] linux-user: Support CPU work queue

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Make CPU work core functions common between system and user-mode
emulation. User-mode does not have BQL, so process_queued_cpu_work() is
protected by 'exclusive_lock'.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 

---
Changes in v2:
 - 'qemu_work_cond' definition moved to cpu-exec-common.c
 - documentation comment for new public API added
---
 cpu-exec-common.c   | 85 
 cpus.c  | 86 +
 include/exec/exec-all.h | 17 ++
 linux-user/main.c   |  8 +
 4 files changed, 111 insertions(+), 85 deletions(-)

diff --git a/cpu-exec-common.c b/cpu-exec-common.c
index 0cb4ae60eff9..a233f0124559 100644
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -77,3 +77,88 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
 }
 siglongjmp(cpu->jmp_env, 1);
 }
+
+QemuCond qemu_work_cond;
+
+static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
+{
+qemu_mutex_lock(&cpu->work_mutex);
+if (cpu->queued_work_first == NULL) {
+cpu->queued_work_first = wi;
+} else {
+cpu->queued_work_last->next = wi;
+}
+cpu->queued_work_last = wi;
+wi->next = NULL;
+wi->done = false;
+qemu_mutex_unlock(&cpu->work_mutex);
+
+qemu_cpu_kick(cpu);
+}
+
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+{
+struct qemu_work_item wi;
+
+if (qemu_cpu_is_self(cpu)) {
+func(cpu, data);
+return;
+}
+
+wi.func = func;
+wi.data = data;
+wi.free = false;
+
+queue_work_on_cpu(cpu, &wi);
+while (!atomic_mb_read(&wi.done)) {
+CPUState *self_cpu = current_cpu;
+
+qemu_cond_wait(&qemu_work_cond, qemu_get_cpu_work_mutex());
+current_cpu = self_cpu;
+}
+}
+
+void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
+{
+struct qemu_work_item *wi;
+
+if (qemu_cpu_is_self(cpu)) {
+func(cpu, data);
+return;
+}
+
+wi = g_malloc0(sizeof(struct qemu_work_item));
+wi->func = func;
+wi->data = data;
+wi->free = true;
+
+queue_work_on_cpu(cpu, wi);
+}
+
+void process_queued_cpu_work(CPUState *cpu)
+{
+struct qemu_work_item *wi;
+
+if (cpu->queued_work_first == NULL) {
+return;
+}
+
+qemu_mutex_lock(&cpu->work_mutex);
+while (cpu->queued_work_first != NULL) {
+wi = cpu->queued_work_first;
+cpu->queued_work_first = wi->next;
+if (!cpu->queued_work_first) {
+cpu->queued_work_last = NULL;
+}
+qemu_mutex_unlock(&cpu->work_mutex);
+wi->func(cpu, wi->data);
+qemu_mutex_lock(&cpu->work_mutex);
+if (wi->free) {
+g_free(wi);
+} else {
+atomic_mb_set(&wi->done, true);
+}
+}
+qemu_mutex_unlock(&cpu->work_mutex);
+qemu_cond_broadcast(&qemu_work_cond);
+}
diff --git a/cpus.c b/cpus.c
index 51fd8c18b4c8..282d7e399902 100644
--- a/cpus.c
+++ b/cpus.c
@@ -896,7 +896,6 @@ static QemuThread io_thread;
 static QemuCond qemu_cpu_cond;
 /* system init */
 static QemuCond qemu_pause_cond;
-static QemuCond qemu_work_cond;
 
 void qemu_init_cpu_loop(void)
 {
@@ -910,66 +909,11 @@ void qemu_init_cpu_loop(void)
 qemu_thread_get_self(&io_thread);
 }
 
-static QemuMutex *qemu_get_cpu_work_mutex(void)
+QemuMutex *qemu_get_cpu_work_mutex(void)
 {
 return &qemu_global_mutex;
 }
 
-static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
-{
-qemu_mutex_lock(&cpu->work_mutex);
-if (cpu->queued_work_first == NULL) {
-cpu->queued_work_first = wi;
-} else {
-cpu->queued_work_last->next = wi;
-}
-cpu->queued_work_last = wi;
-wi->next = NULL;
-wi->done = false;
-qemu_mutex_unlock(&cpu->work_mutex);
-
-qemu_cpu_kick(cpu);
-}
-
-void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
-{
-struct qemu_work_item wi;
-
-if (qemu_cpu_is_self(cpu)) {
-func(cpu, data);
-return;
-}
-
-wi.func = func;
-wi.data = data;
-wi.free = false;
-
-queue_work_on_cpu(cpu, &wi);
-while (!atomic_mb_read(&wi.done)) {
-CPUState *self_cpu = current_cpu;
-
-qemu_cond_wait(&qemu_work_cond, qemu_get_cpu_work_mutex());
-current_cpu = self_cpu;
-}
-}
-
-void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
-{
-struct qemu_work_item *wi;
-
-if (qemu_cpu_is_self(cpu)) {
-func(cpu, data);
-return;
-}
-
-wi = g_malloc0(sizeof(struct qemu_work_item));
-wi->func = func;
-wi->data = data;
-wi->free = true;
-
-queue_work_on_cpu(cpu, wi);
-}
-
 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
 {
 if (kvm_destroy_vcpu(cpu) < 0) {
@@ -982,34 +926,6 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
 }
 
-static void process_queued_cpu_work(CPUState *cpu)

[Qemu-devel] [PATCH v4 06/12] linux-user: Use QemuMutex and QemuCond

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Convert pthread_mutex_t and pthread_cond_t to QemuMutex and QemuCond.
This will allow making some locks and condition variables common
between user and system mode emulation.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 linux-user/main.c | 53 +++--
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/linux-user/main.c b/linux-user/main.c
index 617a179f14a4..bdbda693cc5f 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -108,17 +108,25 @@ int cpu_get_pic_interrupt(CPUX86State *env)
We don't require a full sync, only that no cpus are executing guest code.
The alternative is to map target atomic ops onto host equivalents,
which requires quite a lot of per host/target work.  */
-static pthread_mutex_t cpu_list_mutex = PTHREAD_MUTEX_INITIALIZER;
-static pthread_mutex_t exclusive_lock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_cond_t exclusive_cond = PTHREAD_COND_INITIALIZER;
-static pthread_cond_t exclusive_resume = PTHREAD_COND_INITIALIZER;
+static QemuMutex cpu_list_mutex;
+static QemuMutex exclusive_lock;
+static QemuCond exclusive_cond;
+static QemuCond exclusive_resume;
 static int pending_cpus;
 
+void qemu_init_cpu_loop(void)
+{
+qemu_mutex_init(&cpu_list_mutex);
+qemu_mutex_init(&exclusive_lock);
+qemu_cond_init(&exclusive_cond);
+qemu_cond_init(&exclusive_resume);
+}
+
 /* Make sure everything is in a consistent state for calling fork().  */
 void fork_start(void)
 {
 qemu_mutex_lock(&tcg_ctx.tb_ctx.tb_lock);
-pthread_mutex_lock(&exclusive_lock);
+qemu_mutex_lock(&exclusive_lock);
 mmap_fork_start();
 }
 
@@ -135,14 +143,14 @@ void fork_end(int child)
 }
 }
 pending_cpus = 0;
-pthread_mutex_init(&exclusive_lock, NULL);
-pthread_mutex_init(&cpu_list_mutex, NULL);
-pthread_cond_init(&exclusive_cond, NULL);
-pthread_cond_init(&exclusive_resume, NULL);
+qemu_mutex_init(&exclusive_lock);
+qemu_mutex_init(&cpu_list_mutex);
+qemu_cond_init(&exclusive_cond);
+qemu_cond_init(&exclusive_resume);
 qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
 gdbserver_fork(thread_cpu);
 } else {
-pthread_mutex_unlock(&exclusive_lock);
+qemu_mutex_unlock(&exclusive_lock);
 qemu_mutex_unlock(&tcg_ctx.tb_ctx.tb_lock);
 }
 }
@@ -152,7 +160,7 @@ void fork_end(int child)
 static inline void exclusive_idle(void)
 {
 while (pending_cpus) {
-pthread_cond_wait(&exclusive_resume, &exclusive_lock);
+qemu_cond_wait(&exclusive_resume, &exclusive_lock);
 }
 }
 
@@ -162,7 +170,7 @@ static inline void start_exclusive(void)
 {
 CPUState *other_cpu;
 
-pthread_mutex_lock(&exclusive_lock);
+qemu_mutex_lock(&exclusive_lock);
 exclusive_idle();
 
 pending_cpus = 1;
@@ -174,7 +182,7 @@ static inline void start_exclusive(void)
 }
 }
 if (pending_cpus > 1) {
-pthread_cond_wait(&exclusive_cond, &exclusive_lock);
+qemu_cond_wait(&exclusive_cond, &exclusive_lock);
 }
 }
 
@@ -182,42 +190,42 @@ static inline void start_exclusive(void)
 static inline void __attribute__((unused)) end_exclusive(void)
 {
 pending_cpus = 0;
-pthread_cond_broadcast(&exclusive_resume);
-pthread_mutex_unlock(&exclusive_lock);
+qemu_cond_broadcast(&exclusive_resume);
+qemu_mutex_unlock(&exclusive_lock);
 }
 
 /* Wait for exclusive ops to finish, and begin cpu execution.  */
 static inline void cpu_exec_start(CPUState *cpu)
 {
-pthread_mutex_lock(&exclusive_lock);
+qemu_mutex_lock(&exclusive_lock);
 exclusive_idle();
 cpu->running = true;
-pthread_mutex_unlock(&exclusive_lock);
+qemu_mutex_unlock(&exclusive_lock);
 }
 
 /* Mark cpu as not executing, and release pending exclusive ops.  */
 static inline void cpu_exec_end(CPUState *cpu)
 {
-pthread_mutex_lock(&exclusive_lock);
+qemu_mutex_lock(&exclusive_lock);
 cpu->running = false;
 if (pending_cpus > 1) {
 pending_cpus--;
 if (pending_cpus == 1) {
-pthread_cond_signal(&exclusive_cond);
+qemu_cond_signal(&exclusive_cond);
 }
 }
 exclusive_idle();
-pthread_mutex_unlock(&exclusive_lock);
+qemu_mutex_unlock(&exclusive_lock);
 }
 
 void cpu_list_lock(void)
 {
-pthread_mutex_lock(&cpu_list_mutex);
+qemu_mutex_lock(&cpu_list_mutex);
 }
 
 void cpu_list_unlock(void)
 {
-pthread_mutex_unlock(&cpu_list_mutex);
+qemu_mutex_unlock(&cpu_list_mutex);
 }
 
 
@@ -4210,6 +4218,7 @@ int main(int argc, char **argv, char **envp)
 int ret;
 int execfd;
 
+qemu_init_cpu_loop();
 module_call_init(MODULE_INIT_QOM);
 
 if ((envlist = envlist_create()) == NULL) {
-- 
2.9.1




[Qemu-devel] [PATCH v4 03/12] cpus: Move common code out of {async_, }run_on_cpu()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Move the code common between run_on_cpu() and async_run_on_cpu() into a
new function queue_work_on_cpu().

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 cpus.c | 42 ++
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/cpus.c b/cpus.c
index 049c2d04e150..04687c85bcd4 100644
--- a/cpus.c
+++ b/cpus.c
@@ -910,6 +910,22 @@ void qemu_init_cpu_loop(void)
 qemu_thread_get_self(&io_thread);
 }
 
+static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
+{
+qemu_mutex_lock(&cpu->work_mutex);
+if (cpu->queued_work_first == NULL) {
+cpu->queued_work_first = wi;
+} else {
+cpu->queued_work_last->next = wi;
+}
+cpu->queued_work_last = wi;
+wi->next = NULL;
+wi->done = false;
+qemu_mutex_unlock(&cpu->work_mutex);
+
+qemu_cpu_kick(cpu);
+}
+
 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
 struct qemu_work_item wi;
@@ -923,18 +939,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void 
*data)
 wi.data = data;
 wi.free = false;
 
-qemu_mutex_lock(&cpu->work_mutex);
-if (cpu->queued_work_first == NULL) {
-cpu->queued_work_first = &wi;
-} else {
-cpu->queued_work_last->next = &wi;
-}
-cpu->queued_work_last = &wi;
-wi.next = NULL;
-wi.done = false;
-qemu_mutex_unlock(&cpu->work_mutex);
-
-qemu_cpu_kick(cpu);
+queue_work_on_cpu(cpu, &wi);
 while (!atomic_mb_read(&wi.done)) {
 CPUState *self_cpu = current_cpu;
 
@@ -957,18 +962,7 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, 
void *data)
 wi->data = data;
 wi->free = true;
 
-qemu_mutex_lock(&cpu->work_mutex);
-if (cpu->queued_work_first == NULL) {
-cpu->queued_work_first = wi;
-} else {
-cpu->queued_work_last->next = wi;
-}
-cpu->queued_work_last = wi;
-wi->next = NULL;
-wi->done = false;
-qemu_mutex_unlock(&cpu->work_mutex);
-
-qemu_cpu_kick(cpu);
+queue_work_on_cpu(cpu, wi);
 }
 
 static void qemu_kvm_destroy_vcpu(CPUState *cpu)
-- 
2.9.1




[Qemu-devel] [PATCH v4 08/12] linux-user: Add qemu_cpu_is_self() and qemu_cpu_kick()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 linux-user/main.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/linux-user/main.c b/linux-user/main.c
index 5ff0b20bad89..a8790ac63f68 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -3785,6 +3785,16 @@ void cpu_loop(CPUTLGState *env)
 
 THREAD CPUState *thread_cpu;
 
+bool qemu_cpu_is_self(CPUState *cpu)
+{
+return thread_cpu == cpu;
+}
+
+void qemu_cpu_kick(CPUState *cpu)
+{
+cpu_exit(cpu);
+}
+
 void task_settid(TaskState *ts)
 {
 if (ts->ts_tid == 0) {
-- 
2.9.1




[Qemu-devel] [PATCH v4 10/12] bsd-user: Support CPU work queue

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

This is minimal support, because bsd-user claims to be _not_
threadsafe.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
---
 bsd-user/main.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/bsd-user/main.c b/bsd-user/main.c
index 4819b9ec6333..f738dd64d691 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -63,6 +63,19 @@ int cpu_get_pic_interrupt(CPUX86State *env)
 }
 #endif
 
+void qemu_init_cpu_loop(void)
+{
+/* We need to do this because process_queued_cpu_work() calls
+ * qemu_cond_broadcast() on it
+ */
+qemu_cond_init(&qemu_work_cond);
+}
+
+QemuMutex *qemu_get_cpu_work_mutex(void)
+{
+return NULL; /* it will never be used */
+}
+
 /* These are no-ops because we are not threadsafe.  */
 static inline void cpu_exec_start(CPUArchState *env)
 {
@@ -70,6 +83,7 @@ static inline void cpu_exec_start(CPUArchState *env)
 
 static inline void cpu_exec_end(CPUArchState *env)
 {
+process_queued_cpu_work(ENV_GET_CPU(env));
 }
 
 static inline void start_exclusive(void)
@@ -740,6 +754,7 @@ int main(int argc, char **argv)
 if (argc <= 1)
 usage();
 
+qemu_init_cpu_loop();
 module_call_init(MODULE_INIT_QOM);
 
 if ((envlist = envlist_create()) == NULL) {
-- 
2.9.1




[Qemu-devel] [PATCH v4 00/12] cpu-exec: Safe work in quiescent state

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Hi,

This is a v4 for the series [1]. There's only a small change to keep
tb_flush() statistics and debugging output sane. I also picked up
"Reviewed-by" tags.

This series is available at a public git repository:

https://github.com/sergefdrv/qemu.git safe-cpu-work-v4

Summary of changes in v4:
 - keep tb_flush() statistics and debugging output sane
Summary of changes in v3:
 - bsd-user support added
 - 'tb_flushed' removed
Summary of changes in v2:
 - atomic_dec_fetch() used to decrement 'safe_work_pending'
 - more work to use/fix passing CPUState to run_on_cpu helpers
 - instead of wrapping conditional variables access, use QemuMutex and
   QemuCond in linux-user and just wrap getting of the relevant mutex.
 - document new public API
 - Rename 'tcg_pending_cpus' to 'tcg_pending_threads'

Kind regards,
Sergey

[1] http://thread.gmane.org/gmane.comp.emulators.qemu/426554


Alex Bennée (2):
  atomic: introduce atomic_dec_fetch.
  cpus: pass CPUState to run_on_cpu helpers

Sergey Fedorov (10):
  cpus: Move common code out of {async_,}run_on_cpu()
  cpus: Wrap mutex used to protect CPU work
  cpus: Rename flush_queued_work()
  linux-user: Use QemuMutex and QemuCond
  linux-user: Rework exclusive operation mechanism
  linux-user: Add qemu_cpu_is_self() and qemu_cpu_kick()
  linux-user: Support CPU work queue
  bsd-user: Support CPU work queue
  cpu-exec-common: Introduce async_safe_run_on_cpu()
  tcg: Make tb_flush() thread safe

 bsd-user/main.c|  16 ++
 cpu-exec-common.c  | 132 +
 cpu-exec.c |  14 +
 cpus.c | 106 +++-
 hw/i386/kvm/apic.c |   3 +-
 hw/i386/kvmvapic.c |   6 +--
 hw/ppc/ppce500_spin.c  |  31 ---
 hw/ppc/spapr.c |   6 +--
 hw/ppc/spapr_hcall.c   |  17 +++---
 include/exec/exec-all.h|  31 +++
 include/qemu/atomic.h  |   4 ++
 include/qom/cpu.h  |  24 +++--
 kvm-all.c  |  21 +++-
 linux-user/main.c  |  94 
 target-i386/helper.c   |  19 +++
 target-i386/kvm.c  |   6 +--
 target-s390x/cpu.c |   4 +-
 target-s390x/cpu.h |   7 +--
 target-s390x/kvm.c |  98 -
 target-s390x/misc_helper.c |   4 +-
 translate-all.c|  17 +++---
 21 files changed, 391 insertions(+), 269 deletions(-)

-- 
2.9.1




[Qemu-devel] [PATCH v4 05/12] cpus: Rename flush_queued_work()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

To avoid possible confusion, rename flush_queued_work() to
process_queued_cpu_work().

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 cpus.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpus.c b/cpus.c
index f80ed2aeefdd..51fd8c18b4c8 100644
--- a/cpus.c
+++ b/cpus.c
@@ -982,7 +982,7 @@ static void qemu_tcg_destroy_vcpu(CPUState *cpu)
 {
 }
 
-static void flush_queued_work(CPUState *cpu)
+static void process_queued_cpu_work(CPUState *cpu)
 {
 struct qemu_work_item *wi;
 
@@ -1017,7 +1017,7 @@ static void qemu_wait_io_event_common(CPUState *cpu)
 cpu->stopped = true;
 qemu_cond_broadcast(&qemu_pause_cond);
 }
-flush_queued_work(cpu);
+process_queued_cpu_work(cpu);
 cpu->thread_kicked = false;
 }
 
-- 
2.9.1




[Qemu-devel] [PATCH v4 02/12] cpus: pass CPUState to run_on_cpu helpers

2016-07-15 Thread Sergey Fedorov
From: Alex Bennée 

CPUState is a fairly common pointer to pass to these helpers. This means
that if you need other arguments for the async_run_on_cpu case you end up
having to g_malloc a structure just to stuff additional data into the
routine. For the current users this isn't a massive deal, but for MTTCG
it gets cumbersome when the only other parameter is often an address.

This adds the typedef run_on_cpu_func for helper functions which has an
explicit CPUState * passed as the first parameter. All the users of
run_on_cpu and async_run_on_cpu have had their helpers updated to use
CPUState where available.
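
For illustration, a hypothetical helper under the new signature
(do_flush_tlb is invented for this example and is not part of the
patch):

    /* The target vCPU now arrives as the first argument, so no
     * g_malloc'd wrapper struct is needed to smuggle it in. */
    static void do_flush_tlb(CPUState *cpu, void *data)
    {
        tlb_flush(cpu, 1);
    }

    /* ... */
    async_run_on_cpu(cpu, do_flush_tlb, NULL);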

Signed-off-by: Alex Bennée 
[Sergey Fedorov:
 - eliminate more CPUState in user data;
 - remove unnecessary user data passing;
 - fix target-s390x/kvm.c and target-s390x/misc_helper.c]
Signed-off-by: Sergey Fedorov 
Acked-by: David Gibson  (ppc parts)
Reviewed-by: Christian Borntraeger  (s390 parts)

---
Changes in v2:
 - eliminate more CPUState in user data
 - remove unnecessary user data passing
 - fix target-s390x/kvm.c and target-s390x/misc_helper.c
---
 cpus.c | 15 ---
 hw/i386/kvm/apic.c |  3 +-
 hw/i386/kvmvapic.c |  6 +--
 hw/ppc/ppce500_spin.c  | 31 +--
 hw/ppc/spapr.c |  6 +--
 hw/ppc/spapr_hcall.c   | 17 
 include/qom/cpu.h  |  8 ++--
 kvm-all.c  | 21 --
 target-i386/helper.c   | 19 -
 target-i386/kvm.c  |  6 +--
 target-s390x/cpu.c |  4 +-
 target-s390x/cpu.h |  7 +---
 target-s390x/kvm.c | 98 +++---
 target-s390x/misc_helper.c |  4 +-
 14 files changed, 108 insertions(+), 137 deletions(-)

diff --git a/cpus.c b/cpus.c
index 84c3520d446f..049c2d04e150 100644
--- a/cpus.c
+++ b/cpus.c
@@ -551,9 +551,8 @@ static const VMStateDescription vmstate_timers = {
 }
 };
 
-static void cpu_throttle_thread(void *opaque)
+static void cpu_throttle_thread(CPUState *cpu, void *opaque)
 {
-CPUState *cpu = opaque;
 double pct;
 double throttle_ratio;
 long sleeptime_ns;
@@ -583,7 +582,7 @@ static void cpu_throttle_timer_tick(void *opaque)
 }
 CPU_FOREACH(cpu) {
 if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
-async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
+async_run_on_cpu(cpu, cpu_throttle_thread, NULL);
 }
 }
 
@@ -911,12 +910,12 @@ void qemu_init_cpu_loop(void)
 qemu_thread_get_self(&io_thread);
 }
 
-void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
 struct qemu_work_item wi;
 
 if (qemu_cpu_is_self(cpu)) {
-func(data);
+func(cpu, data);
 return;
 }
 
@@ -944,12 +943,12 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), 
void *data)
 }
 }
 
-void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
 {
 struct qemu_work_item *wi;
 
 if (qemu_cpu_is_self(cpu)) {
-func(data);
+func(cpu, data);
 return;
 }
 
@@ -1000,7 +999,7 @@ static void flush_queued_work(CPUState *cpu)
 cpu->queued_work_last = NULL;
 }
 qemu_mutex_unlock(&cpu->work_mutex);
-wi->func(wi->data);
+wi->func(cpu, wi->data);
 qemu_mutex_lock(&cpu->work_mutex);
 if (wi->free) {
 g_free(wi);
diff --git a/hw/i386/kvm/apic.c b/hw/i386/kvm/apic.c
index c5983c79be47..9b66e741d4b4 100644
--- a/hw/i386/kvm/apic.c
+++ b/hw/i386/kvm/apic.c
@@ -125,10 +125,9 @@ static void kvm_apic_vapic_base_update(APICCommonState *s)
 }
 }
 
-static void do_inject_external_nmi(void *data)
+static void do_inject_external_nmi(CPUState *cpu, void *data)
 {
 APICCommonState *s = data;
-CPUState *cpu = CPU(s->cpu);
 uint32_t lvt;
 int ret;
 
diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
index 3bf1ddd97612..1bc02fb2f1a1 100644
--- a/hw/i386/kvmvapic.c
+++ b/hw/i386/kvmvapic.c
@@ -483,7 +483,7 @@ typedef struct VAPICEnableTPRReporting {
 bool enable;
 } VAPICEnableTPRReporting;
 
-static void vapic_do_enable_tpr_reporting(void *data)
+static void vapic_do_enable_tpr_reporting(CPUState *cpu, void *data)
 {
 VAPICEnableTPRReporting *info = data;
 
@@ -734,10 +734,10 @@ static void vapic_realize(DeviceState *dev, Error **errp)
 nb_option_roms++;
 }
 
-static void do_vapic_enable(void *data)
+static void do_vapic_enable(CPUState *cs, void *data)
 {
 VAPICROMState *s = data;
-X86CPU *cpu = X86_CPU(first_cpu);
+X86CPU *cpu = X86_CPU(cs);
 
 static const uint8_t enabled = 1;
 cpu_physical_memory_write(s->vapic_paddr + offsetof(VAPICState, enabled),
diff --git a/hw/ppc/ppce500_spin.c b/hw/ppc/ppce500_spin.c
index 22c584eb8dd0..8e16f651ea95 100644
--- a/hw/ppc/ppce500_spin.c
+++ b/hw/ppc/ppce500_spin.c
@@ -54,11 +54,6 @@ typedef struct 

[Qemu-devel] [PATCH v4 11/12] cpu-exec-common: Introduce async_safe_run_on_cpu()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

This patch is based on the ideas found in the work of KONRAD Frederic [1],
Alex Bennée [2], and Alvise Rigo [3].

This mechanism allows an operation to be performed safely in a
quiescent state. Quiescent state means: (1) no vCPU is running and
(2) the BQL in system-mode emulation, or 'exclusive_lock' in user-mode
emulation, is held while performing the operation. This functionality
is required e.g. for performing translation buffer flush safely in
multi-threaded user-mode emulation.

The existing CPU work queue is used to schedule such safe operations.
A new 'safe' flag is added into struct qemu_work_item to designate the
special requirements of the safe work. An operation in a quiescent
state can be scheduled by using the async_safe_run_on_cpu() function,
which is actually the same as async_run_on_cpu() except that it marks
the queued work item with the 'safe' flag set to true. Given this
flag, queue_work_on_cpu() atomically increments the 'safe_work_pending'
global counter and kicks all the CPUs instead of just the target CPU,
as in the case of normal CPU work. This forces other CPUs to exit
their execution loops and wait in the wait_safe_cpu_work() function
for the safe work to finish. When a CPU drains its work queue, if it
encounters a work item marked as safe, it first waits for the other
CPUs to exit their execution loops, then calls the work item function,
and finally decrements the 'safe_work_pending' counter, signalling the
other CPUs to let them continue execution as soon as all pending safe
work items have been processed. The 'tcg_pending_threads' counter,
protected by 'exclusive_lock' in user-mode or by 'qemu_global_mutex'
in system-mode emulation, is used to determine whether any CPU is
running and to wait for it to exit the execution loop. The fairness
of all the CPU work queues is ensured by draining all the pending
safe work items before any CPU can run.

[1] http://lists.nongnu.org/archive/html/qemu-devel/2015-08/msg01128.html
[2] http://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02531.html
[3] http://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04792.html
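
A condensed sketch of the drain side described above, reconstructed
from this description rather than quoted verbatim from the patch:

    /* Inside process_queued_cpu_work(), for each dequeued item: */
    if (wi->safe) {
        /* Wait for all other vCPUs to leave their execution loops. */
        while (tcg_pending_threads) {
            qemu_cond_wait(&qemu_exclusive_cond, qemu_get_cpu_work_mutex());
        }
        wi->func(cpu, wi->data);
        /* The last safe item out wakes everyone blocked in
         * wait_safe_cpu_work(). */
        if (atomic_dec_fetch(&safe_work_pending) == 0) {
            qemu_cond_broadcast(&qemu_safe_work_cond);
        }
    } else {
        wi->func(cpu, wi->data);
    }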

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 

---
Changes in v3:
 - bsd-user supported
Changes in v2:
 - some condition variables moved to cpu-exec-common.c
 - documentation comment for new public API added
---
 bsd-user/main.c |  3 ++-
 cpu-exec-common.c   | 49 -
 cpus.c  | 20 
 include/exec/exec-all.h | 14 ++
 include/qom/cpu.h   | 14 ++
 linux-user/main.c   | 13 +++--
 6 files changed, 105 insertions(+), 8 deletions(-)

diff --git a/bsd-user/main.c b/bsd-user/main.c
index f738dd64d691..5433bca0fca6 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -66,9 +66,10 @@ int cpu_get_pic_interrupt(CPUX86State *env)
 void qemu_init_cpu_loop(void)
 {
 /* We need to do this because process_queued_cpu_work() calls
- * qemu_cond_broadcast() on it
+ * qemu_cond_broadcast() on them
  */
 qemu_cond_init(&qemu_work_cond);
+qemu_cond_init(&qemu_safe_work_cond);
 }
 
 QemuMutex *qemu_get_cpu_work_mutex(void)
diff --git a/cpu-exec-common.c b/cpu-exec-common.c
index a233f0124559..6f278d6d3b70 100644
--- a/cpu-exec-common.c
+++ b/cpu-exec-common.c
@@ -25,6 +25,7 @@
 
 bool exit_request;
 CPUState *tcg_current_cpu;
+int tcg_pending_threads;
 
 /* exit the current TB, but without causing any exception to be raised */
 void cpu_loop_exit_noexc(CPUState *cpu)
@@ -79,6 +80,17 @@ void cpu_loop_exit_restore(CPUState *cpu, uintptr_t pc)
 }
 
 QemuCond qemu_work_cond;
+QemuCond qemu_safe_work_cond;
+QemuCond qemu_exclusive_cond;
+
+static int safe_work_pending;
+
+void wait_safe_cpu_work(void)
+{
+while (atomic_mb_read(&safe_work_pending) > 0) {
+qemu_cond_wait(&qemu_safe_work_cond, qemu_get_cpu_work_mutex());
+}
+}
 
 static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi)
 {
@@ -91,9 +103,18 @@ static void queue_work_on_cpu(CPUState *cpu, struct 
qemu_work_item *wi)
 cpu->queued_work_last = wi;
 wi->next = NULL;
 wi->done = false;
+if (wi->safe) {
+atomic_inc(&safe_work_pending);
+}
 qemu_mutex_unlock(&cpu->work_mutex);
 
-qemu_cpu_kick(cpu);
+if (!wi->safe) {
+qemu_cpu_kick(cpu);
+} else {
+CPU_FOREACH(cpu) {
+qemu_cpu_kick(cpu);
+}
+}
 }
 
 void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void *data)
@@ -108,6 +129,7 @@ void run_on_cpu(CPUState *cpu, run_on_cpu_func func, void 
*data)
 wi.func = func;
 wi.data = data;
 wi.free = false;
+wi.safe = false;
 
 queue_work_on_cpu(cpu, &wi);
 while (!atomic_mb_read(&wi.done)) {
@@ -131,6 +153,20 @@ void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, 
void *data)
 wi->func = func;
 wi->data = data;
 wi->free = true;
+wi->safe = false;
+
+queue_work_on_cpu(cpu, w

[Qemu-devel] [PATCH v4 01/12] atomic: introduce atomic_dec_fetch.

2016-07-15 Thread Sergey Fedorov
From: Alex Bennée 

Useful for counting down.
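
The '_fetch' suffix means the new value is returned (the atomic_fetch_*
helpers above return the old one), which is what makes the count-down
idiom work. A hypothetical use ('pending' and notify_waiters() are
invented for the example):

    /* Whoever takes the shared counter to zero performs the wake-up. */
    if (atomic_dec_fetch(&pending) == 0) {
        notify_waiters();
    }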

Signed-off-by: Alex Bennée 
Signed-off-by: Sergey Fedorov 
---
 include/qemu/atomic.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 7e13fca351d1..560b1af703a8 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -160,6 +160,8 @@
 #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST)
 #define atomic_fetch_or(ptr, n)  __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST)
 
+#define atomic_dec_fetch(ptr)  __atomic_sub_fetch(ptr, 1, __ATOMIC_SEQ_CST)
+
 /* And even shorter names that return void.  */
 #define atomic_inc(ptr)((void) __atomic_fetch_add(ptr, 1, 
__ATOMIC_SEQ_CST))
 #define atomic_dec(ptr)((void) __atomic_fetch_sub(ptr, 1, 
__ATOMIC_SEQ_CST))
@@ -355,6 +357,8 @@
 #define atomic_fetch_or__sync_fetch_and_or
 #define atomic_cmpxchg __sync_val_compare_and_swap
 
+#define atomic_dec_fetch(ptr)  __sync_sub_and_fetch(ptr, 1)
+
 /* And even shorter names that return void.  */
 #define atomic_inc(ptr)((void) __sync_fetch_and_add(ptr, 1))
 #define atomic_dec(ptr)((void) __sync_fetch_and_add(ptr, -1))
-- 
2.9.1




Re: [Qemu-devel] QOM: best way for parents to pass information to children? (was Re: [PATCH RFC 07/16] qom/cpu: make nr-cores, nr-threads real properties)

2016-07-15 Thread Igor Mammedov
On Fri, 15 Jul 2016 14:43:53 -0300
Eduardo Habkost  wrote:

> On Fri, Jul 15, 2016 at 06:30:41PM +0200, Andreas Färber wrote:
> > Am 15.07.2016 um 18:10 schrieb Eduardo Habkost:
> > > On Fri, Jul 15, 2016 at 11:11:38AM +0200, Igor Mammedov wrote:
> > >> On Fri, 15 Jul 2016 08:35:30 +0200
> > >> Andrew Jones  wrote:
> > >>> On Thu, Jul 14, 2016 at 05:07:43PM -0300, Eduardo Habkost wrote:
> > 
> >  First of all, sorry for the horrible delay in replying to this
> >  thread.
> > 
> >  On Wed, Jun 15, 2016 at 10:56:20AM +1000, David Gibson wrote:  
> > > On Tue, Jun 14, 2016 at 08:19:49AM +0200, Andrew Jones
> > > wrote:  
> > >> On Tue, Jun 14, 2016 at 12:12:16PM +1000, David Gibson
> > >> wrote:  
> > >>> On Sun, Jun 12, 2016 at 03:48:10PM +0200, Andrew Jones
> > >>> wrote:  
> > > [...]
> > >> +static Property cpu_common_properties[] = {
> > >> +DEFINE_PROP_INT32("nr-cores", CPUState, nr_cores,
> > >> 1),
> > >> +DEFINE_PROP_INT32("nr-threads", CPUState,
> > >> nr_threads, 1),
> > >> +DEFINE_PROP_END_OF_LIST()
> > >> +};  
> > >
> > > Are you aware of the current CPU hotplug discussion that
> > > is going on?  
> > 
> >  I'm aware of it going on, but haven't been following it.
> >    
> > > I'm not very involved there, but I think some of these
> > > reworks also move "nr_threads" into the CPU state
> > > already, e.g. see:  
> > 
> >  nr_threads (and nr_cores) are already state in CPUState.
> >  This patch just exposes that state via properties.
> >    
> > >
> > > https://github.com/dgibson/qemu/commit/9d07719784ecbeebea71
> > >
> > > ... so you might want to check these patches first to see
> > > whether you can base your rework on them?  
> > 
> >  Every cpu, and thus every machine, uses CPUState for its
> >  cpus. I'm not sure every machine will want to use that new
> >  abstract core class though. If they did, then we could
> >  indeed use nr_threads from there instead (and remove it
> >  from CPUState), but we'd still need nr_cores from the
> >  abstract cpu package class (CPUState).  
> > >>>
> > >>> Hmm.  Since the CPUState object represents just a single
> > >>> thread, it seems weird to me that it would have nr_threads
> > >>> and nr_cores information.  
> > 
> >  Agreed it is weird, and I think we should try to move it away
> >  from CPUState, not make it part of the TYPE_CPU interface.
> >  nr_threads belongs to the actual container of the Thread
> >  objects, and nr_cores in the actual container of the Core
> >  objects.
> > 
> >  The problem is how to implement that in a non-intrusive way
> >  that would require changing the object hierarchy of all
> >  architectures.
> > 
> >    
> > >>>
> > >>> Exposing those as properties makes that much worse, because
> > >>> it's now ABI, rather than internal detail we can clean up
> > >>> at some future time.  
> > >>
> > >> CPUState is supposed to be "State of one CPU core or
> > >> thread", which justifies having nr_threads state, as it may
> > >> be describing a core.  
> > >
> > > Um.. does it ever actually represent a (multithread) core in
> > > practice? It would need to have duplicated register state for
> > > every thread were that the case.  
> > 
> >  AFAIK, CPUState is still always thread state. Or has this
> >  changed in some architectures, already?
> >    
> > >   
> > >> I guess there's no justification for having nr_cores in
> > >> there though. I agree adding the Core class is a good idea,
> > >> assuming it will get used by all machines, and CPUState then
> > >> gets changed to a Thread class. The question then, though,
> > >> is do we also create a Socket class that contains nr_cores?  
> > >
> > > That was roughly our intention with the way the cross
> > > platform hotplug stuff is evolving.  But the intention was
> > > that the Socket objects would only need to be constructed for
> > > machine types where it makes sense.  So for example on the
> > > paravirt pseries platform, we'll only have Core objects,
> > > because the socket distinction isn't really meaningful.
> > >   
> > >> And how will a Thread method get that information when it
> > >> needs to emulate, e.g. CPUID, that requires it? It's a bit
> > >> messy, so I'm open to all suggestions on it.  
> > >
> > > So, if the Thread needs this information, I'm not opposed to
> > > it having it internally (presumably populated earlier from
> > > the Core object). But I am opposed to it being a locked in
> > > part of the interface by having it as an exposed property.

[Qemu-devel] [PATCH v3 4/6] nbd: Rely on block layer to break up large requests

2016-07-15 Thread Eric Blake
Now that the block layer will honor max_transfer, we can simplify
our code to rely on that guarantee.

The readv code can call directly into nbd-client, just as the
writev code has done since commit 52a4650.

Interestingly enough, while qemu-io 'w 0 40m' splits into a 32M and
an 8M transaction, 'w -z 0 40m' splits into two 16M requests and an 8M one,
because the block layer caps the bounce buffer for writing zeroes
at 16M.  When we later introduce support for NBD_CMD_WRITE_ZEROES,
we can get a full 32M zero write (or larger, if the client and
server negotiate that write zeroes can use a larger size than
ordinary writes).

Signed-off-by: Eric Blake 
Reviewed-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/nbd-client.c | 51 ---
 block/nbd.c| 12 +++-
 2 files changed, 11 insertions(+), 52 deletions(-)

diff --git a/block/nbd-client.c b/block/nbd-client.c
index 4cc408d..f1fb58b 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -217,15 +217,15 @@ static void nbd_coroutine_end(NbdClientSession *s,
 }
 }

-static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
-  int nb_sectors, QEMUIOVector *qiov,
-  int offset)
+int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
+int nb_sectors, QEMUIOVector *qiov)
 {
 NbdClientSession *client = nbd_get_client_session(bs);
 struct nbd_request request = { .type = NBD_CMD_READ };
 struct nbd_reply reply;
 ssize_t ret;

+assert(nb_sectors <= NBD_MAX_SECTORS);
 request.from = sector_num * 512;
 request.len = nb_sectors * 512;

@@ -234,16 +234,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t 
sector_num,
 if (ret < 0) {
 reply.error = -ret;
 } else {
-nbd_co_receive_reply(client, &request, &reply, qiov, offset);
+nbd_co_receive_reply(client, &request, &reply, qiov, 0);
 }
 nbd_coroutine_end(client, &request);
 return -reply.error;

 }

-static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
-   int nb_sectors, QEMUIOVector *qiov,
-   int offset, int flags)
+int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov, int flags)
 {
 NbdClientSession *client = nbd_get_client_session(bs);
 struct nbd_request request = { .type = NBD_CMD_WRITE };
@@ -255,11 +254,12 @@ static int nbd_co_writev_1(BlockDriverState *bs, int64_t 
sector_num,
 request.type |= NBD_CMD_FLAG_FUA;
 }

+assert(nb_sectors <= NBD_MAX_SECTORS);
 request.from = sector_num * 512;
 request.len = nb_sectors * 512;

 nbd_coroutine_start(client, &request);
-ret = nbd_co_send_request(bs, &request, qiov, offset);
+ret = nbd_co_send_request(bs, &request, qiov, 0);
 if (ret < 0) {
 reply.error = -ret;
 } else {
@@ -269,41 +269,6 @@ static int nbd_co_writev_1(BlockDriverState *bs, int64_t 
sector_num,
 return -reply.error;
 }

-int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
-int nb_sectors, QEMUIOVector *qiov)
-{
-int offset = 0;
-int ret;
-while (nb_sectors > NBD_MAX_SECTORS) {
-ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
-if (ret < 0) {
-return ret;
-}
-offset += NBD_MAX_SECTORS * 512;
-sector_num += NBD_MAX_SECTORS;
-nb_sectors -= NBD_MAX_SECTORS;
-}
-return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
-}
-
-int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
- int nb_sectors, QEMUIOVector *qiov, int flags)
-{
-int offset = 0;
-int ret;
-while (nb_sectors > NBD_MAX_SECTORS) {
-ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset,
-  flags);
-if (ret < 0) {
-return ret;
-}
-offset += NBD_MAX_SECTORS * 512;
-sector_num += NBD_MAX_SECTORS;
-nb_sectors -= NBD_MAX_SECTORS;
-}
-return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags);
-}
-
 int nbd_client_co_flush(BlockDriverState *bs)
 {
 NbdClientSession *client = nbd_get_client_session(bs);
diff --git a/block/nbd.c b/block/nbd.c
index 08e5b67..8a13078 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -349,12 +349,6 @@ static int nbd_open(BlockDriverState *bs, QDict *options, 
int flags,
 return ret;
 }

-static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
-int nb_sectors, QEMUIOVector *qiov)
-{
-return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov);
-}
-
 static int nbd_co_flush(BlockDriverState *bs)
 {
 return nbd_client_co_flush(bs);
@@ -450,7 +444,7 @@ static BlockDriver bdrv_nbd = {
 .instance_size  = sizeof(BDRVNBDState),
 .bd

[Qemu-devel] [PATCH v3 2/6] raw_bsd: Don't advertise flags not supported by protocol layer

2016-07-15 Thread Eric Blake
The raw format layer supports all flags via passthrough - but
it only makes sense to pass through flags that the lower layer
actually supports.

The next patch gives stronger reasoning for why this is correct.
At the moment, the raw format layer ignores the max_transfer
limit of its protocol layer, and an attempt to do the qemu-io
'w -f 0 40m' to an NBD server that lacks FUA will pass the entire
40m request to the NBD driver, which then fragments the request
itself into a 32m write, 8m write, and flush.  But once the block
layer starts honoring limits and fragmenting packets, the raw
driver will hand the NBD driver two separate requests; if both
requests have BDRV_REQ_FUA set, then this would result in a 32m
write, flush, 8m write, and second flush.  By having the raw
layer no longer advertise FUA support when the protocol layer
lacks it, we are back to a single flush at the block layer for
the overall 40m request.

Note that 'w -f -z 0 40m' does not currently exhibit the same
problem, because there the fragmentation does not occur until the
NBD layer (the raw layer has .bdrv_co_pwrite_zeroes, and
the NBD layer doesn't advertise max_pwrite_zeroes to constrain
things at the raw layer) - but the problem is latent and we
would again have too many flushes without this patch once the
NBD layer implements support for the new NBD_CMD_WRITE_ZEROES
command, if it sets max_pwrite_zeroes to the same 32m limit as
recommended by the NBD protocol.

Signed-off-by: Eric Blake 
Reviewed-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 

---
v3: no change
v2: no code change, but hoist earlier in series and reword commit
---
 block/raw_bsd.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 5f9dd29..d767413 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -192,8 +192,10 @@ static int raw_open(BlockDriverState *bs, QDict *options, 
int flags,
 Error **errp)
 {
 bs->sg = bs->file->bs->sg;
-bs->supported_write_flags = BDRV_REQ_FUA;
-bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP;
+bs->supported_write_flags = BDRV_REQ_FUA &
+bs->file->bs->supported_write_flags;
+bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+bs->file->bs->supported_zero_flags;

 if (bs->probed && !bdrv_is_read_only(bs)) {
 fprintf(stderr,
-- 
2.5.5




[Qemu-devel] [PATCH v3 6/6] iscsi: Rely on block layer to break up large requests

2016-07-15 Thread Eric Blake
Now that the block layer honors max_transfer, we don't need to
bother with an EINVAL on overlarge requests, but can instead
assert that requests are well-behaved.

Signed-off-by: Eric Blake 
Reviewed-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/iscsi.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index cf1e9e7..bdc7ade 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -472,11 +472,8 @@ iscsi_co_writev_flags(BlockDriverState *bs, int64_t 
sector_num, int nb_sectors,
 return -EINVAL;
 }

-if (bs->bl.max_transfer &&
-nb_sectors << BDRV_SECTOR_BITS > bs->bl.max_transfer) {
-error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len "
- "of %" PRIu32 " bytes", nb_sectors, bs->bl.max_transfer);
-return -EINVAL;
+if (bs->bl.max_transfer) {
+assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer);
 }

 lba = sector_qemu2lun(sector_num, iscsilun);
@@ -650,11 +647,8 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
 return -EINVAL;
 }

-if (bs->bl.max_transfer &&
-nb_sectors << BDRV_SECTOR_BITS > bs->bl.max_transfer) {
-error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len "
- "of %" PRIu32 " bytes", nb_sectors, bs->bl.max_transfer);
-return -EINVAL;
+if (bs->bl.max_transfer) {
+assert(nb_sectors << BDRV_SECTOR_BITS <= bs->bl.max_transfer);
 }

 if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES &&
-- 
2.5.5




[Qemu-devel] [PATCH v3 3/6] block: Fragment writes to max transfer length

2016-07-15 Thread Eric Blake
Drivers should be able to rely on the block layer honoring the
max transfer length, rather than needing to return -EINVAL
(iscsi) or manually fragment things (nbd).  We already fragment
write zeroes at the block layer; this patch adds the fragmentation
for normal writes, after requests have been aligned (fragmenting
before alignment would lead to multiple unaligned requests, rather
than just the head and tail).

When fragmenting a large request where FUA was requested, but
where we know that FUA is implemented by flushing all requests
rather than the given request, we can still get by with only one
flush.  Note, however, that we need a followup patch to the raw
format driver to avoid a regression in the number of flushes
actually issued.

The return value was previously nebulous on success (sometimes
zero, sometimes the length written); since we never have a short
write, and since fragmenting may store yet another positive
value in 'ret', change the function to always return 0 on success,
matching what we do in bdrv_aligned_preadv().
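
As a rough sketch of the FUA handling described above (an illustrative
helper only; the hunk below is what actually goes in), every fragment
except the last drops BDRV_REQ_FUA when FUA is emulated by a flush:

#include <stdbool.h>
#include <stdint.h>

#define BDRV_REQ_FUA 0x10   /* illustrative value, not from the headers */

/* Flags for one fragment of a larger write: only the final fragment
 * keeps FUA when the driver emulates FUA with a flush, so the whole
 * request still costs a single flush. */
static int fragment_flags(int flags, uint64_t bytes_remaining, int num,
                          bool driver_has_native_fua)
{
    if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
        !driver_has_native_fua) {
        flags &= ~BDRV_REQ_FUA;   /* intermediate fragment: no flush */
    }
    return flags;
}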

Signed-off-by: Eric Blake 

---
v3: another return semantic tweak
v2: Tweak success return semantics to match read
---
 block/io.c | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index ceff694..86db77e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1269,7 +1269,8 @@ fail:
 }

 /*
- * Forwards an already correctly aligned write request to the BlockDriver.
+ * Forwards an already correctly aligned write request to the BlockDriver,
+ * after possibly fragmenting it.
  */
 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
@@ -1281,6 +1282,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

 int64_t start_sector = offset >> BDRV_SECTOR_BITS;
 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
+uint64_t bytes_remaining = bytes;
+int max_transfer;

 assert(is_power_of_2(align));
 assert((offset & (align - 1)) == 0);
@@ -1288,6 +1291,8 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 assert(!qiov || bytes == qiov->size);
 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
 assert(!(flags & ~BDRV_REQ_MASK));
+max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+   align);

 waited = wait_serialising_requests(req);
 assert(!waited || !req->serialising);
@@ -1310,9 +1315,34 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
 } else if (flags & BDRV_REQ_ZERO_WRITE) {
 bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
 ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
-} else {
+} else if (bytes <= max_transfer) {
 bdrv_debug_event(bs, BLKDBG_PWRITEV);
 ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
+} else {
+bdrv_debug_event(bs, BLKDBG_PWRITEV);
+while (bytes_remaining) {
+int num = MIN(bytes_remaining, max_transfer);
+QEMUIOVector local_qiov;
+int local_flags = flags;
+
+assert(num);
+if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
+!(bs->supported_write_flags & BDRV_REQ_FUA)) {
+/* If FUA is going to be emulated by flush, we only
+ * need to flush on the last iteration */
+local_flags &= ~BDRV_REQ_FUA;
+}
+qemu_iovec_init(&local_qiov, qiov->niov);
+qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
+  num, &local_qiov, local_flags);
+qemu_iovec_destroy(&local_qiov);
+if (ret < 0) {
+break;
+}
+bytes_remaining -= num;
+}
 }
 bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);

@@ -1324,6 +1354,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,

 if (ret >= 0) {
 bs->total_sectors = MAX(bs->total_sectors, end_sector);
+ret = 0;
 }

 return ret;
-- 
2.5.5




[Qemu-devel] [PATCH v3 1/6] block: Fragment reads to max transfer length

2016-07-15 Thread Eric Blake
Drivers should be able to rely on the block layer honoring the
max transfer length, rather than needing to return -EINVAL
(iscsi) or manually fragment things (nbd).  This patch adds
the fragmentation in the block layer, after requests have been
aligned (fragmenting before alignment would lead to multiple
unaligned requests, rather than just the head and tail).

The return value was previously nebulous on success as to whether
it was zero or the length read; and fragmenting may introduce
yet other non-zero values if we use the last length read.  But
as at least some callers are sloppy and expect only zero on
success, it is easiest to just guarantee 0.
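
For illustration, the per-fragment size choice mirrors the loop added
below (a simplified model, not the patch itself): each fragment is
bounded by both the driver's max_transfer and the bytes remaining
before EOF, and anything past EOF is zero-filled rather than read.

#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Size of the next read fragment, or 0 when everything left lies past
 * EOF and the caller should memset zeroes instead. */
static uint64_t next_read_fragment(uint64_t bytes_remaining,
                                   uint64_t max_bytes, /* readable before EOF */
                                   uint64_t max_transfer)
{
    if (max_bytes == 0) {
        return 0;   /* nothing readable: zero-fill the rest */
    }
    return MIN(bytes_remaining, MIN(max_bytes, max_transfer));
}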

Signed-off-by: Eric Blake 

---
v3: Slam 'ret' to 0, not 'bytes', as some callers choke otherwise
v2: Fix uninitialized use of 'ret' for an all-zero read beyond eof
---
 block/io.c | 55 ++-
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/block/io.c b/block/io.c
index 2887394..ceff694 100644
--- a/block/io.c
+++ b/block/io.c
@@ -971,8 +971,8 @@ err:

 /*
  * Forwards an already correctly aligned request to the BlockDriver. This
- * handles copy on read and zeroing after EOF; any other features must be
- * implemented by the caller.
+ * handles copy on read, zeroing after EOF, and fragmentation of large
+ * reads; any other features must be implemented by the caller.
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
@@ -980,12 +980,16 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 {
 int64_t total_bytes, max_bytes;
 int ret;
+uint64_t bytes_remaining = bytes;
+int max_transfer;

 assert(is_power_of_2(align));
 assert((offset & (align - 1)) == 0);
 assert((bytes & (align - 1)) == 0);
 assert(!qiov || bytes == qiov->size);
 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
+max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
+   align);

 /* TODO: We would need a per-BDS .supported_read_flags and
  * potential fallback support, if we ever implement any read flags
@@ -1024,7 +1028,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 }
 }

-/* Forward the request to the BlockDriver */
+/* Forward the request to the BlockDriver, possibly fragmenting it */
 total_bytes = bdrv_getlength(bs);
 if (total_bytes < 0) {
 ret = total_bytes;
@@ -1032,30 +1036,39 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 }

 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
-if (bytes <= max_bytes) {
+if (bytes <= max_bytes && bytes <= max_transfer) {
 ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
-} else if (max_bytes > 0) {
-QEMUIOVector local_qiov;
-
-qemu_iovec_init(&local_qiov, qiov->niov);
-qemu_iovec_concat(&local_qiov, qiov, 0, max_bytes);
-
-ret = bdrv_driver_preadv(bs, offset, max_bytes, &local_qiov, 0);
-
-qemu_iovec_destroy(&local_qiov);
-} else {
-ret = 0;
+goto out;
 }

-/* Reading beyond end of file is supposed to produce zeroes */
-if (ret == 0 && total_bytes < offset + bytes) {
-uint64_t zero_offset = MAX(0, total_bytes - offset);
-uint64_t zero_bytes = offset + bytes - zero_offset;
-qemu_iovec_memset(qiov, zero_offset, 0, zero_bytes);
+while (bytes_remaining) {
+int num;
+
+if (max_bytes) {
+QEMUIOVector local_qiov;
+
+num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
+assert(num);
+qemu_iovec_init(&local_qiov, qiov->niov);
+qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
+
+ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
+ num, &local_qiov, 0);
+max_bytes -= num;
+qemu_iovec_destroy(&local_qiov);
+} else {
+num = bytes_remaining;
+ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
+bytes_remaining);
+}
+if (ret < 0) {
+goto out;
+}
+bytes_remaining -= num;
 }

 out:
-return ret;
+return ret < 0 ? ret : 0;
 }

 /*
-- 
2.5.5




[Qemu-devel] [PATCH v3 5/6] nbd: Drop unused offset parameter

2016-07-15 Thread Eric Blake
Now that NBD relies on the block layer to fragment things, we no
longer need to track an offset argument for which fragment of
a request we are actually servicing.

While at it, use true and false instead of 0 and 1 for a bool
parameter.

Signed-off-by: Eric Blake 
Reviewed-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 

---
v3: no change
v2: minor formatting tweak [Kevin]
---
 include/block/nbd.h |  1 -
 nbd/nbd-internal.h  |  4 ++--
 block/nbd-client.c  | 31 ---
 nbd/common.c|  5 +
 4 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index eeda3eb..503f514 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -89,7 +89,6 @@ enum {
 ssize_t nbd_wr_syncv(QIOChannel *ioc,
  struct iovec *iov,
  size_t niov,
- size_t offset,
  size_t length,
  bool do_read);
 int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint32_t *flags,
diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
index 26a9f4d..93a6ca8 100644
--- a/nbd/nbd-internal.h
+++ b/nbd/nbd-internal.h
@@ -101,14 +101,14 @@ static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size)
  * our request/reply.  Synchronization is done with recv_coroutine, so
  * that this is coroutine-safe.
  */
-return nbd_wr_syncv(ioc, &iov, 1, 0, size, true);
+return nbd_wr_syncv(ioc, &iov, 1, size, true);
 }

 static inline ssize_t write_sync(QIOChannel *ioc, void *buffer, size_t size)
 {
 struct iovec iov = { .iov_base = buffer, .iov_len = size };

-return nbd_wr_syncv(ioc, &iov, 1, 0, size, false);
+return nbd_wr_syncv(ioc, &iov, 1, size, false);
 }

 struct NBDTLSHandshakeData {
diff --git a/block/nbd-client.c b/block/nbd-client.c
index f1fb58b..f184844 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -116,7 +116,7 @@ static void nbd_restart_write(void *opaque)

 static int nbd_co_send_request(BlockDriverState *bs,
struct nbd_request *request,
-   QEMUIOVector *qiov, int offset)
+   QEMUIOVector *qiov)
 {
 NbdClientSession *s = nbd_get_client_session(bs);
 AioContext *aio_context;
@@ -149,8 +149,8 @@ static int nbd_co_send_request(BlockDriverState *bs,
 qio_channel_set_cork(s->ioc, true);
 rc = nbd_send_request(s->ioc, request);
 if (rc >= 0) {
-ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
-   offset, request->len, 0);
+ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len,
+   false);
 if (ret != request->len) {
 rc = -EIO;
 }
@@ -167,8 +167,9 @@ static int nbd_co_send_request(BlockDriverState *bs,
 }

 static void nbd_co_receive_reply(NbdClientSession *s,
-struct nbd_request *request, struct nbd_reply *reply,
-QEMUIOVector *qiov, int offset)
+ struct nbd_request *request,
+ struct nbd_reply *reply,
+ QEMUIOVector *qiov)
 {
 int ret;

@@ -181,8 +182,8 @@ static void nbd_co_receive_reply(NbdClientSession *s,
 reply->error = EIO;
 } else {
 if (qiov && reply->error == 0) {
-ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov,
-   offset, request->len, 1);
+ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, request->len,
+   true);
 if (ret != request->len) {
 reply->error = EIO;
 }
@@ -230,11 +231,11 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num,
 request.len = nb_sectors * 512;

 nbd_coroutine_start(client, &request);
-ret = nbd_co_send_request(bs, &request, NULL, 0);
+ret = nbd_co_send_request(bs, &request, NULL);
 if (ret < 0) {
 reply.error = -ret;
 } else {
-nbd_co_receive_reply(client, &request, &reply, qiov, 0);
+nbd_co_receive_reply(client, &request, &reply, qiov);
 }
 nbd_coroutine_end(client, &request);
 return -reply.error;
@@ -259,11 +260,11 @@ int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num,
 request.len = nb_sectors * 512;

 nbd_coroutine_start(client, &request);
-ret = nbd_co_send_request(bs, &request, qiov, 0);
+ret = nbd_co_send_request(bs, &request, qiov);
 if (ret < 0) {
 reply.error = -ret;
 } else {
-nbd_co_receive_reply(client, &request, &reply, NULL, 0);
+nbd_co_receive_reply(client, &request, &reply, NULL);
 }
 nbd_coroutine_end(client, &request);
 return -reply.error;
@@ -284,11 +285,11 @@ int nbd_client_co_flush(BlockDriverState *bs)
 request.len = 0;

 nbd_coroutine_start(client, &request);
-ret = nbd_co_send_request(b

[Qemu-devel] [PATCH v3 0/6] Auto-fragment large transactions at the block layer

2016-07-15 Thread Eric Blake
We have max_transfer documented in BlockLimits, but while we
honor it during pwrite_zeroes, we were blindly ignoring it
during pwritev and preadv, leading to multiple drivers having
to implement fragmentation themselves.  This series moves
fragmentation to the block layer, then fixes the NBD and iscsi
driver to use it.

qcow2 still does self-fragmenting, but that's because of cluster
boundaries where it really has to do additional work beyond what
the block layer can automatically provide.

Prerequisite: Kevin's latest block branch PULL request

Also available as a tag at:
git fetch git://repo.or.cz/qemu/ericb.git nbd-fragment-v3

Changes since v2:
- patch 1, 3: change return semantics to be 0, not bytes, on success,
since at least one caller asserts during 'make check' otherwise

001/6:[0008] [FC] 'block: Fragment reads to max transfer length'
002/6:[] [--] 'raw_bsd: Don't advertise flags not supported by protocol layer'
003/6:[0002] [FC] 'block: Fragment writes to max transfer length'
004/6:[] [--] 'nbd: Rely on block layer to break up large requests'
005/6:[] [--] 'nbd: Drop unused offset parameter'
006/6:[] [--] 'iscsi: Rely on block layer to break up large requests'

Eric Blake (6):
  block: Fragment reads to max transfer length
  raw_bsd: Don't advertise flags not supported by protocol layer
  block: Fragment writes to max transfer length
  nbd: Rely on block layer to break up large requests
  nbd: Drop unused offset parameter
  iscsi: Rely on block layer to break up large requests

 include/block/nbd.h |  1 -
 nbd/nbd-internal.h  |  4 +--
 block/io.c  | 90 +++--
 block/iscsi.c   | 14 +++--
 block/nbd-client.c  | 78 +-
 block/nbd.c | 12 ++-
 block/raw_bsd.c |  6 ++--
 nbd/common.c|  5 +--
 8 files changed, 103 insertions(+), 107 deletions(-)

-- 
2.5.5




Re: [Qemu-devel] [PATCH] hw/misc: Add simple measurement hardware

2016-07-15 Thread Stefan Berger
"Dr. David Alan Gilbert"  wrote on 07/15/2016 
07:29:24 AM:

> 
> * Matthew Garrett (mj...@coreos.com) wrote:
> 
> Hi Matthew,
>   (Ccing in Stefan who has been trying to get vTPM in for years and
>   Paolo for any x86ism and especially the ACPIisms, and Daniel for
>   crypto stuff)
> 
> I'll repeat some of my comments from yesterday's irc chat so you can
> reply on list.
> 
> So overall the plus point is it's simple (much smaller than even the
> interface to the vTPM), the minus is it's very non-standard.
> 
> > Trusted Boot is based around having a trusted store of measurement data
> > and a secure communications channel between that store and an
> > attestation target. In actual hardware, that's a TPM. Since the TPM can
> > only be accessed via the host system, this in turn requires that the
> > TPM be able to perform reasonably complicated cryptographic functions
> > in order to demonstrate its trusted state.
> > 
> > In cloud environments, qemu is inherently trusted and the hypervisor
> > infrastructure provides a trusted mechanism for extracting information
> > from qemu and providing it to another system. This means we can skip
> > the crypto and stick with the basic functionality - ie, providing a
> > trusted store of measurement data.
> 
> I think the big question for me is what uses this system and in
> particular how the users can guarantee who they're speaking to; I'd like
> to understand the cases it works for and those it doesn't; for example:
> 
>a) (one that works) 'are all the VMs on my hosts running trusted OSs'
>   That works with this just as well as with a vTPM; you ask your
>   hypervisor to give you the measurements for your guests; you trust
>   your hypervisor. Although I think you've somehow got to extract the
>   measurement log from the guest and get it to the hypervisor if it's
>   going to make sense of the measurements.
> 
>b) (one that doesn't?) I'm connecting to a VM remotely over a network, I
>   want to check the VM really is who it says it is and is running a
>   trusted OS. As a remote entity I don't know which hypervisor is
>   running the VM, the VM itself can't sign anything to give me back
>   because it might just sign a reply for a different VM.  On a real TPM
>   the attestation results would be signed using one of the keys in the
>   TPM (I can't remember which).

Attestation Identity Key (AIK)

> 
>c) (similar to b) 'I paid you to give me a ... VM - can I check it
>   really is that'  how do I externally to the cloud show that the
>   measurement came from the same VM I'm asking about.
> 
> and then I'm not clear which of the existing TPM users could be munged
> into working with it; can you make an existing trusted-grub or trousers
> write measurements and log into it?
> 
> > This driver provides a very small subset of TPM 1.2 functionality in
> > the form of a bank of registers that can store SHA1 measurements of
> > boot components. Performing a write to one of these registers will
> > append the new 20 byte hash to the 20 bytes currently stored within
> > the register, take a SHA1 of this 40 byte value and then replace the
> > existing register contents with the new value. This ensures that a
> > given value can only be obtained by performing the same sequence of
> > writes. It also adds a monitor command to allow an external agent to
> > extract this information from the running system and provide it over
> > a secure communications channel. Finally, it measures each of the
> > loaded ROMs into one of the registers at reset time.
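
The register update described there is the classic TPM extend
operation; a minimal sketch, using OpenSSL's SHA1() purely for
illustration (this is not code from the proposed patch):

#include <stdint.h>
#include <string.h>
#include <openssl/sha.h>

#define PCR_SIZE SHA_DIGEST_LENGTH              /* 20 bytes for SHA1 */

/* new_pcr = SHA1(old_pcr || measurement): a given value can only be
 * reached by replaying the same sequence of measurements. */
static void pcr_extend(uint8_t pcr[PCR_SIZE],
                       const uint8_t measurement[PCR_SIZE])
{
    uint8_t buf[2 * PCR_SIZE];

    memcpy(buf, pcr, PCR_SIZE);
    memcpy(buf + PCR_SIZE, measurement, PCR_SIZE);
    SHA1(buf, sizeof(buf), pcr);
}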


Are you also providing a measurement log that goes along with these PCR 
extensions? Like a measurement log we have in the TCPA ACPI table? Just 
measurements without knowing what was measured wouldn't be all that 
helpful. Typically recipients of the measurement list would inspect the 
individual measurements and replay the extensions to come up with the same 
state of the PCRs that was quoted (signed) by the TPM. Also, are you going 
to instrument Linux IMA to use this device? And the list goes on into 
higher level tools that may work with a measurement list from 
/sys/kernel/security/{tpm0,ima}/*_measurement_list and assume there's a 
/dev/tpm0 there that can issue a quote with the AIK. Well, one problem is 
there's little traction for the vTPM but this device here will require new 
support in existing tools.

Typically the TPM is there for a reason: it is a hardware root of trust
that signs the current state of the PCRs that were accumulated by
measurements starting early on during BIOS init. Now with this device,
apart from exposing this via HMP, how would one be sure that, if the
current list of the PCRs is presented to an attesting client, the
kernel or attestation server did not just completely fake the state of
the PCRs? My assumption here is that the state of this device's PCRs will be
ex

[Qemu-devel] [PULL 2/4] linux-aio: share one LinuxAioState within an AioContext

2016-07-15 Thread Stefan Hajnoczi
From: Paolo Bonzini 

This has better performance because it executes fewer system calls
and does not use a bottom half per disk.

Originally proposed by Ming Lei.

Acked-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
Message-id: 146765-51385-1-git-send-email-pbonz...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 async.c |  23 ++
 block/linux-aio.c   |  10 ++--
 block/raw-aio.h |  68 ---
 block/raw-posix.c   | 119 ++--
 block/raw-win32.c   |   2 +-
 include/block/aio.h |  13 ++
 include/block/raw-aio.h |  68 +++
 7 files changed, 125 insertions(+), 178 deletions(-)
 delete mode 100644 block/raw-aio.h
 create mode 100644 include/block/raw-aio.h

diff --git a/async.c b/async.c
index b4bf205..6caa98c 100644
--- a/async.c
+++ b/async.c
@@ -29,6 +29,7 @@
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
+#include "block/raw-aio.h"
 
 /***/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -242,6 +243,14 @@ aio_ctx_finalize(GSource *source)
 qemu_bh_delete(ctx->notify_dummy_bh);
 thread_pool_free(ctx->thread_pool);
 
+#ifdef CONFIG_LINUX_AIO
+if (ctx->linux_aio) {
+laio_detach_aio_context(ctx->linux_aio, ctx);
+laio_cleanup(ctx->linux_aio);
+ctx->linux_aio = NULL;
+}
+#endif
+
 qemu_mutex_lock(&ctx->bh_lock);
 while (ctx->first_bh) {
 QEMUBH *next = ctx->first_bh->next;
@@ -282,6 +291,17 @@ ThreadPool *aio_get_thread_pool(AioContext *ctx)
 return ctx->thread_pool;
 }
 
+#ifdef CONFIG_LINUX_AIO
+LinuxAioState *aio_get_linux_aio(AioContext *ctx)
+{
+if (!ctx->linux_aio) {
+ctx->linux_aio = laio_init();
+laio_attach_aio_context(ctx->linux_aio, ctx);
+}
+return ctx->linux_aio;
+}
+#endif
+
 void aio_notify(AioContext *ctx)
 {
 /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
@@ -345,6 +365,9 @@ AioContext *aio_context_new(Error **errp)
false,
(EventNotifierHandler *)
event_notifier_dummy_cb);
+#ifdef CONFIG_LINUX_AIO
+ctx->linux_aio = NULL;
+#endif
 ctx->thread_pool = NULL;
 qemu_mutex_init(&ctx->bh_lock);
 rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 5c104bd..1d702a5 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -50,6 +50,8 @@ typedef struct {
 } LaioQueue;
 
 struct LinuxAioState {
+AioContext *aio_context;
+
 io_context_t ctx;
 EventNotifier e;
 
@@ -227,15 +229,14 @@ static void ioq_submit(LinuxAioState *s)
 
 void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
 {
-assert(!s->io_q.plugged);
-s->io_q.plugged = 1;
+s->io_q.plugged++;
 }
 
 void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s)
 {
 assert(s->io_q.plugged);
-s->io_q.plugged = 0;
-if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
+if (--s->io_q.plugged == 0 &&
+!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
 ioq_submit(s);
 }
 }
@@ -325,6 +326,7 @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 {
+s->aio_context = new_context;
 s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
 aio_set_event_notifier(new_context, &s->e, false,
qemu_laio_completion_cb);
diff --git a/block/raw-aio.h b/block/raw-aio.h
deleted file mode 100644
index a4cdbbf..000
--- a/block/raw-aio.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Declarations for AIO in the raw protocol
- *
- * Copyright IBM, Corp. 2008
- *
- * Authors:
- *  Anthony Liguori   
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- * Contributions after 2012-01-13 are licensed under the terms of the
- * GNU GPL, version 2 or (at your option) any later version.
- */
-#ifndef QEMU_RAW_AIO_H
-#define QEMU_RAW_AIO_H
-
-#include "qemu/coroutine.h"
-#include "qemu/iov.h"
-
-/* AIO request types */
-#define QEMU_AIO_READ 0x0001
-#define QEMU_AIO_WRITE0x0002
-#define QEMU_AIO_IOCTL0x0004
-#define QEMU_AIO_FLUSH0x0008
-#define QEMU_AIO_DISCARD  0x0010
-#define QEMU_AIO_WRITE_ZEROES 0x0020
-#define QEMU_AIO_TYPE_MASK \
-(QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \
- QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES)
-
-/* AIO flags */
-#define QEMU_AIO_MISALIGNED   0x1000
-#define QEMU_AIO_BLKDEV   0x2000
-
-
-/* linux-aio.c - Linux native implementation */
-#ifdef CONFIG_LINUX_AIO
-typedef struct LinuxAioState LinuxAioState;
-LinuxAioState *laio_init(void);
-void laio_cleanup(LinuxAioStat

[Qemu-devel] [PULL 3/4] aio_ctx_check: follow CODING_STYLE

2016-07-15 Thread Stefan Hajnoczi
From: Cao jin 

replace tab with spaces

Signed-off-by: Cao jin 
Message-id: 1468501843-14927-1-git-send-email-caoj.f...@cn.fujitsu.com
Signed-off-by: Stefan Hajnoczi 
---
 async.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/async.c b/async.c
index 6caa98c..0e0efc3 100644
--- a/async.c
+++ b/async.c
@@ -218,7 +218,7 @@ aio_ctx_check(GSource *source)
 for (bh = ctx->first_bh; bh; bh = bh->next) {
 if (!bh->deleted && bh->scheduled) {
 return true;
-   }
+}
 }
 return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
 }
-- 
2.7.4




Re: [Qemu-devel] [PATCH v5 04/10] block: Support meta dirty bitmap

2016-07-15 Thread John Snow


On 07/15/2016 08:04 AM, Max Reitz wrote:
> On 14.07.2016 22:00, John Snow wrote:
>> On 06/22/2016 11:53 AM, Max Reitz wrote:
>>> On 03.06.2016 06:32, Fam Zheng wrote:
 The added group of operations enables tracking of the changed bits in
 the dirty bitmap.

 Signed-off-by: Fam Zheng 
 ---
  block/dirty-bitmap.c | 52 
 
  include/block/dirty-bitmap.h |  9 
  2 files changed, 61 insertions(+)

 diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
 index 628b77c..9c53c56 100644
 --- a/block/dirty-bitmap.c
 +++ b/block/dirty-bitmap.c
 @@ -38,6 +38,7 @@
   */
  struct BdrvDirtyBitmap {
  HBitmap *bitmap;/* Dirty sector bitmap implementation */
 +HBitmap *meta;  /* Meta dirty bitmap */
 BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
  char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap (Number of sectors) */
 @@ -103,6 +104,56 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
  return bitmap;
  }
  
 +/* bdrv_create_meta_dirty_bitmap
 + *
 + * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. I.e.
 + * when a dirty status bit in @bitmap is changed (either from reset to set or
 + * the other way around), its respective meta dirty bitmap bit will be marked
 + * dirty as well.
>>>
>>> Not wrong, but I'd like a note here that this is not a
>>> when-and-only-when relationship, i.e. that bits in the meta bitmap may
>>> be set even without the tracked bits in the dirty bitmap having changed.
>>>
>>
>> How?
>>
>> You mean, if the caller manually starts setting things in the meta
>> bitmap object?
>>
>> That's their fault then, isn't it?
> 
> No, I mean something that I mentioned in a reply to some previous
> version (the patch adding the test):
> 
> http://lists.nongnu.org/archive/html/qemu-block/2016-03/msg00332.html
> 
> Fam's reply is here:
> 
> http://lists.nongnu.org/archive/html/qemu-block/2016-06/msg00097.html
> 
> (Interesting how that reply took nearly three months and yours took
> nearly one, there must be something about this series that makes
> replying to replies very cumbersome :-))
> 

https://media.giphy.com/media/waG6HzLWKIkhi/giphy.gif

> What I meant by “then it would update meta” is that it would update *all
> of the range* even though only a single bit has actually been changed.
> 

Aha, I understand exactly now, thanks.

> So the answer to your “how” is: See patch 2, the changes to
> hbitmap_set() (and hbitmap_reset()). If any of the bits in the given
> range is changed, all of the range is marked as having changed in the
> meta bitmap.
> 
> So all we guarantee is that every time a bit is changed, the
> corresponding bit in the meta bitmap will be set. But we do not
> guarantee that a bit in the meta bitmap stays cleared as long as the
> corresponding range of the underlying bitmap stays the same.
> 
> Max
> 

I'll work on a followup patch to improve it.
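
To make the semantics concrete, a toy model of what's described above
(a hypothetical helper, not the HBitmap implementation): any change
inside a chunk dirties the covering meta bit, so a set meta bit only
means "something in this chunk may have changed".

#include <stdint.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Mark the meta bit covering @bit; chunk_bits tracked bits share one
 * meta bit, so unchanged neighbors of a changed bit look changed too. */
static void meta_mark(unsigned long *meta, uint64_t bit, uint64_t chunk_bits)
{
    uint64_t meta_bit = bit / chunk_bits;

    meta[meta_bit / BITS_PER_LONG] |= 1UL << (meta_bit % BITS_PER_LONG);
}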

>>
>>> Maybe this should be mentioned somewhere in patch 2, too. Or maybe only
>>> in patch 2.
>>>
 + *
 + * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap.
 + * @chunk_size: how many bytes of bitmap data does each bit in the meta bitmap
 + * track.
 + */
 +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
 +   int chunk_size)
 +{
 +assert(!bitmap->meta);
 +bitmap->meta = hbitmap_create_meta(bitmap->bitmap,
 +   chunk_size * BITS_PER_BYTE);
 +}
 +
 +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
 +{
 +assert(bitmap->meta);
 +hbitmap_free_meta(bitmap->bitmap);
 +bitmap->meta = NULL;
 +}
 +
 +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
 +   BdrvDirtyBitmap *bitmap, int64_t sector,
 +   int nb_sectors)
 +{
 +uint64_t i;
 +int gran = bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
 +
 +/* To optimize: we can make hbitmap to internally check the range in a
 + * coarse level, or at least do it word by word. */
>>>
>>> We could also multiply gran by the granularity of the meta bitmap. Each
>>> bit of the meta bitmap tracks at least eight bits of the dirty bitmap,
>>> so we're calling hbitmap_get() at least eight times as often as
>>> necessary here.
>>>
>>> Or we just use int gran = hbitmap_granularity(bitmap->meta);.
>>>
>>> Not exactly wrong, though, so:
>>>
>>> Reviewed-by: Max Reitz 
>>>
 +for (i = sector; i < sector + nb_sectors; i += gran) {
 +if (hbitmap_get(bitmap->meta, i)) {

[Qemu-devel] [PULL 1/4] spec/parallels: fix a mistake

2016-07-15 Thread Stefan Hajnoczi
From: Vladimir Sementsov-Ogievskiy 

We have only one flag for now - the Empty Image flag. The patch fixes the
unused bits specification and marks bit 1 as unused.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Denis V. Lunev 
CC: Stefan Hajnoczi 
Signed-off-by: Stefan Hajnoczi 
---
 docs/specs/parallels.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/specs/parallels.txt b/docs/specs/parallels.txt
index b4fe229..e9271eb 100644
--- a/docs/specs/parallels.txt
+++ b/docs/specs/parallels.txt
@@ -94,7 +94,7 @@ Bytes:
   Bit 0: Empty Image bit. If set, the image should be
  considered clear.
 
-  Bits 2-31: Unused.
+  Bits 1-31: Unused.
 
   56 - 63:ext_off
   Format Extension offset, an offset, in sectors, from the start of
-- 
2.7.4




Re: [Qemu-devel] [PATCH v6 3/6] tests: in IDE and AHCI tests perform DMA write before flushing

2016-07-15 Thread Evgeny Yakovlev



On 15.07.2016 20:23, Eric Blake wrote:
> On 07/15/2016 02:08 AM, Evgeny Yakovlev wrote:
>
>>>> + * Write sector 0 with random data to make AHCI storage dirty
>>>
>>> If we ever have a case where we open a disk without specifying -raw, the
>>> random data _might_ resemble some other format and cause probe to
>>> misbehave; as such, we also have code in the block layer that
>>> specifically prevents writes to sector 0 for some data. Should you pick
>>> a different sector than 0, so as to avoid any (remote) possibility that
>>> the random data could change probe results or be rejected?
>>
>> Not sure if I understand the problem you're referring to here. Those are
>> blkdebug tests, those disks are created, emulated with blkdebug backend,
>> flushed and then thrown away. So is there really any possibility of
>> reopening the image and accidentally parsing a partition table in sector 0?
>>
>> Also, not sure what you mean by "code in the block layer that
>> specifically prevents writes to sector 0 for some data". Can you explain
>> that bit, because it sounds pretty scary. How can we deny the guest VM
>> writing anything to sector 0 on its emulated disk?

> Read block/raw_bsd.c:raw_co_writev_flags() for the gory details.  If the
> guest ever gets a raw format driver because the user forgot to say
> '--format $foo', then we prevent the guest from writing anything into
> sector 0 that would be probed as non-raw.  It means there are only a
> handful of patterns that the guest cannot write into the first sector,
> but it IS a non-zero number of patterns.  How the guest behaves if such
> a write is attempted depends on the error policy you have on that
> device; it might show up as an EIO error to the guest, or it might stop
> the guest from executing and raise a qemu event to the management
> application, but the point is that we actively prohibit some writes to
> sector 0 on a probed raw disk.  Using any sector other than 0 doesn't
> have this limitation, or you can ensure that your test ALWAYS passes the
> appropriate --format $foo so that the disk is never probed as another
> way to avoid limitations on sector 0.
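
In other words, a rough model of that guard (names here are made up;
raw_co_writev_flags() in block/raw_bsd.c is the real thing):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for running the format probes over the would-be contents of
 * sector 0; hypothetical helper, not QEMU's actual API. */
static bool would_probe_non_raw(const uint8_t *buf, size_t len)
{
    (void)buf;
    (void)len;
    return false;   /* pretend nothing probes as non-raw */
}

/* A probed-raw disk may be written anywhere except a sector-0 payload
 * that a future format probe would classify as non-raw. */
static bool raw_write_allowed(bool bs_probed, int64_t sector_num,
                              const uint8_t *buf, size_t len)
{
    if (!bs_probed || sector_num != 0) {
        return true;
    }
    return !would_probe_non_raw(buf, len);
}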




I think I get it now. Thanks!




[Qemu-devel] [PULL 4/4] linux-aio: prevent submitting more than MAX_EVENTS

2016-07-15 Thread Stefan Hajnoczi
From: Roman Pen 

Invoking io_setup(MAX_EVENTS) asks the kernel to create a ring buffer
with the specified number of events.  But the kernel's ring buffer
allocation logic is a bit tricky (the ring buffer is page size aligned
and some percpu allocations are required), so eventually more than the
requested number of events is allocated.

From the userspace side we have to follow the convention and should not
try to io_submit() more than the requested number of events; otherwise
the logic that consumes completed events must be changed accordingly.
The pitfall is in the following sequence:

MAX_EVENTS = 128
io_setup(MAX_EVENTS)

io_submit(MAX_EVENTS)
io_submit(MAX_EVENTS)

/* now 256 events are in-flight */

io_getevents(MAX_EVENTS) = 128

/* we can handle only 128 events at once; to be sure
 * that nothing is left pending, the io_getevents(MAX_EVENTS)
 * call must be invoked once more, or a hang will happen. */

To prevent the hang, or a repetition of the io_getevents() call, this
patch restricts the number of in-flight requests to MAX_EVENTS.
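
In other words (an illustrative helper, not the patch's actual code),
submission is capped so that in-flight plus newly submitted requests
never exceed the io_setup() capacity:

#define MAX_EVENTS 128

/* How many queued requests may be submitted right now without pushing
 * the in-flight count past what io_setup() guaranteed. */
static unsigned int submittable_now(unsigned int in_flight,
                                    unsigned int in_queue)
{
    unsigned int room;

    if (in_flight >= MAX_EVENTS) {
        return 0;               /* must reap completions first */
    }
    room = MAX_EVENTS - in_flight;
    return in_queue < room ? in_queue : room;
}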

Signed-off-by: Roman Pen 
Reviewed-by: Fam Zheng 
Reviewed-by: Paolo Bonzini 
Reviewed-by: Stefan Hajnoczi 
Message-id: 1468415004-31755-1-git-send-email-roman.peny...@profitbricks.com
Cc: Stefan Hajnoczi 
Cc: qemu-devel@nongnu.org
Signed-off-by: Stefan Hajnoczi 
---
 block/linux-aio.c | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/block/linux-aio.c b/block/linux-aio.c
index 1d702a5..de3548f 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -28,8 +28,6 @@
  */
 #define MAX_EVENTS 128
 
-#define MAX_QUEUED_IO  128
-
 struct qemu_laiocb {
 BlockAIOCB common;
 Coroutine *co;
@@ -44,7 +42,8 @@ struct qemu_laiocb {
 
 typedef struct {
 int plugged;
-unsigned int n;
+unsigned int in_queue;
+unsigned int in_flight;
 bool blocked;
 QSIMPLEQ_HEAD(, qemu_laiocb) pending;
 } LaioQueue;
@@ -131,6 +130,7 @@ static void qemu_laio_completion_bh(void *opaque)
 s->event_max = 0;
 return; /* no more events */
 }
+s->io_q.in_flight -= s->event_max;
 }
 
 /* Reschedule so nested event loops see currently pending completions */
@@ -192,7 +192,8 @@ static void ioq_init(LaioQueue *io_q)
 {
 QSIMPLEQ_INIT(&io_q->pending);
 io_q->plugged = 0;
-io_q->n = 0;
+io_q->in_queue = 0;
+io_q->in_flight = 0;
 io_q->blocked = false;
 }
 
@@ -200,14 +201,17 @@ static void ioq_submit(LinuxAioState *s)
 {
 int ret, len;
 struct qemu_laiocb *aiocb;
-struct iocb *iocbs[MAX_QUEUED_IO];
+struct iocb *iocbs[MAX_EVENTS];
 QSIMPLEQ_HEAD(, qemu_laiocb) completed;
 
 do {
+if (s->io_q.in_flight >= MAX_EVENTS) {
+break;
+}
 len = 0;
 QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
 iocbs[len++] = &aiocb->iocb;
-if (len == MAX_QUEUED_IO) {
+if (s->io_q.in_flight + len >= MAX_EVENTS) {
 break;
 }
 }
@@ -220,11 +224,12 @@ static void ioq_submit(LinuxAioState *s)
 abort();
 }
 
-s->io_q.n -= ret;
+s->io_q.in_flight += ret;
+s->io_q.in_queue  -= ret;
 aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
 QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
 } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
-s->io_q.blocked = (s->io_q.n > 0);
+s->io_q.blocked = (s->io_q.in_queue > 0);
 }
 
 void laio_io_plug(BlockDriverState *bs, LinuxAioState *s)
@@ -264,9 +269,10 @@ static int laio_do_submit(int fd, struct qemu_laiocb *laiocb, off_t offset,
 io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e));
 
 QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next);
-s->io_q.n++;
+s->io_q.in_queue++;
 if (!s->io_q.blocked &&
-(!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) {
+(!s->io_q.plugged ||
+ s->io_q.in_flight + s->io_q.in_queue >= MAX_EVENTS)) {
 ioq_submit(s);
 }
 
-- 
2.7.4




[Qemu-devel] [PATCH v4 10/12] tcg: Avoid bouncing tb_lock between tb_gen_code() and tb_add_jump()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 cpu-exec.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index bbaed5bb1978..073d783398f3 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -281,7 +281,8 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
 static TranslationBlock *tb_find_slow(CPUState *cpu,
   target_ulong pc,
   target_ulong cs_base,
-  uint32_t flags)
+  uint32_t flags,
+  bool *have_tb_lock)
 {
 TranslationBlock *tb;
 
@@ -294,6 +295,7 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
  */
 mmap_lock();
 tb_lock();
+*have_tb_lock = true;
 
 /* There's a chance that our desired tb has been translated while
  * taking the locks so we check again inside the lock.
@@ -304,7 +306,6 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
 tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
 }
 
-tb_unlock();
 mmap_unlock();
 }
 
@@ -321,6 +322,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 TranslationBlock *tb;
 target_ulong cs_base, pc;
 uint32_t flags;
+bool have_tb_lock = false;
 
 /* we record a subset of the CPU state. It will
always be the same before a given translated block
@@ -329,8 +331,8 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
 if (unlikely(!tb || atomic_read(&tb->pc) != pc ||
  atomic_read(&tb->cs_base) != cs_base ||
- atomic_read(&b->flags) != flags)) {
-tb = tb_find_slow(cpu, pc, cs_base, flags);
+ atomic_read(&tb->flags) != flags)) {
+tb = tb_find_slow(cpu, pc, cs_base, flags, &have_tb_lock);
 }
 #ifndef CONFIG_USER_ONLY
 /* We don't take care of direct jumps when address mapping changes in
@@ -343,13 +345,18 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 #endif
 /* See if we can patch the calling TB. */
 if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-tb_lock();
+if (!have_tb_lock) {
+tb_lock();
+have_tb_lock = true;
+}
 /* Check if translation buffer has been flushed */
 if (cpu->tb_flushed) {
 cpu->tb_flushed = false;
 } else if (!tb_is_invalid(tb)) {
 tb_add_jump(last_tb, tb_exit, tb);
 }
+}
+if (have_tb_lock) {
 tb_unlock();
 }
 return tb;
-- 
2.9.1




Re: [Qemu-devel] [PATCH] linux-user: fix "really futimens" condition in sys_utimensat()

2016-07-15 Thread Peter Maydell
On 15 July 2016 at 18:43, Peter Maydell  wrote:
> In some configurations we implement sys_utimensat() via a wrapper
> that calls either futimens() or utimensat(), depending on the
> arguments (to handle a case where the Linux syscall API diverges
> from the glibc API). Fix a corner case in this handling:
> if the syscall is passed a NULL pathname and dirfd == AT_FDCWD,
> then it must fail with EFAULT. We can't handle this by passing
> it to glibc utimensat() because at the libc level a NULL
> pathname is failed with EINVAL, and we can't handle it by
> passing to futimens() because that would fail with EBADF.
> So special case it and return EFAULT directly from the wrapper.
>
> This means that if the guest calls utimes() with a NULL pathname
> and guest glibc converts that into a syscall utimensat(AT_FDCWD,
> NULL, ...) then we correctly fail it with EFAULT.
>
> Signed-off-by: Peter Maydell 
> ---
>  linux-user/syscall.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
>
> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
> index 0e87157..61ea58b 100644
> --- a/linux-user/syscall.c
> +++ b/linux-user/syscall.c
> @@ -367,10 +367,15 @@ static int sys_getcwd1(char *buf, size_t size)
>  static int sys_utimensat(int dirfd, const char *pathname,
>  const struct timespec times[2], int flags)
>  {
> -if (pathname == NULL)
> +if (pathname == NULL) {
> +if (dirfd == AT_FDCWD) {
> +errno = EFAULT;
> +return -1;
> +}
>  return futimens(dirfd, times);
> -else
> +} else {
>  return utimensat(dirfd, pathname, times, flags);
> +}
>  }
>  #elif defined(__NR_utimensat)
>  #define __NR_sys_utimensat __NR_utimensat

There turns out to be another annoying corner case here, which is
when pathname == NULL, dirfd != AT_FDCWD and the flags include
AT_SYMLINK_NOFOLLOW -- this is supposed to fail EINVAL. I'll
have a look at that next week and see whether it's best fixed
in the same patch as this case or with a followup patch...
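
An untested sketch of how the wrapper might look with both corner cases
folded in (not the actual follow-up):

#include <errno.h>
#include <fcntl.h>      /* AT_FDCWD, AT_SYMLINK_NOFOLLOW */
#include <sys/stat.h>   /* utimensat(), futimens(), struct timespec */

static int sys_utimensat(int dirfd, const char *pathname,
                         const struct timespec times[2], int flags)
{
    if (pathname == NULL) {
        if (dirfd == AT_FDCWD) {
            errno = EFAULT;  /* guest utimes(NULL, ...) must fail EFAULT */
            return -1;
        }
        if (flags & AT_SYMLINK_NOFOLLOW) {
            errno = EINVAL;  /* NULL pathname plus flags is invalid */
            return -1;
        }
        return futimens(dirfd, times);
    }
    return utimensat(dirfd, pathname, times, flags);
}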

thanks
-- PMM



[Qemu-devel] [PATCH v4 08/12] tcg: set up tb->page_addr before insertion

2016-07-15 Thread Sergey Fedorov
From: Alex Bennée 

This ensures that if we find the TB on the slow path, tb->page_addr
is correctly set before being tested.

Signed-off-by: Alex Bennée 
Reviewed-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
---
 translate-all.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/translate-all.c b/translate-all.c
index 9db72e8982b1..6156bdcbef42 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1118,10 +1118,6 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
 {
 uint32_t h;
 
-/* add in the hash table */
-h = tb_hash_func(phys_pc, tb->pc, tb->flags);
-qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
-
 /* add in the page list */
 tb_alloc_page(tb, 0, phys_pc & TARGET_PAGE_MASK);
 if (phys_page2 != -1) {
@@ -1130,6 +1126,10 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
 tb->page_addr[1] = -1;
 }
 
+/* add in the hash table */
+h = tb_hash_func(phys_pc, tb->pc, tb->flags);
+qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
+
 #ifdef DEBUG_TB_CHECK
 tb_page_check();
 #endif
-- 
2.9.1




[Qemu-devel] [PATCH v4 11/12] tcg: Merge tb_find_slow() and tb_find_fast()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

These functions are not too big and can be merged together. This makes
the locking scheme clearer and easier to follow.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 cpu-exec.c | 72 ++
 1 file changed, 30 insertions(+), 42 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 073d783398f3..ff138809046c 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -278,45 +278,9 @@ static TranslationBlock *tb_find_physical(CPUState *cpu,
 return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
 }
 
-static TranslationBlock *tb_find_slow(CPUState *cpu,
-  target_ulong pc,
-  target_ulong cs_base,
-  uint32_t flags,
-  bool *have_tb_lock)
-{
-TranslationBlock *tb;
-
-tb = tb_find_physical(cpu, pc, cs_base, flags);
-if (!tb) {
-
-/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
- * taken outside tb_lock. As system emulation is currently
- * single threaded the locks are NOPs.
- */
-mmap_lock();
-tb_lock();
-*have_tb_lock = true;
-
-/* There's a chance that our desired tb has been translated while
- * taking the locks so we check again inside the lock.
- */
-tb = tb_find_physical(cpu, pc, cs_base, flags);
-if (!tb) {
-/* if no translated code available, then translate it now */
-tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
-}
-
-mmap_unlock();
-}
-
-/* We add the TB in the virtual pc hash table for the fast lookup */
-atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
-return tb;
-}
-
-static inline TranslationBlock *tb_find_fast(CPUState *cpu,
- TranslationBlock *last_tb,
- int tb_exit)
+static inline TranslationBlock *tb_find(CPUState *cpu,
+TranslationBlock *last_tb,
+int tb_exit)
 {
 CPUArchState *env = (CPUArchState *)cpu->env_ptr;
 TranslationBlock *tb;
@@ -332,7 +296,31 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 if (unlikely(!tb || atomic_read(&tb->pc) != pc ||
  atomic_read(&tb->cs_base) != cs_base ||
  atomic_read(&tb->flags) != flags)) {
-tb = tb_find_slow(cpu, pc, cs_base, flags, &have_tb_lock);
+tb = tb_find_physical(cpu, pc, cs_base, flags);
+if (!tb) {
+
+/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
+ * taken outside tb_lock. As system emulation is currently
+ * single threaded the locks are NOPs.
+ */
+mmap_lock();
+tb_lock();
+have_tb_lock = true;
+
+/* There's a chance that our desired tb has been translated while
+ * taking the locks so we check again inside the lock.
+ */
+tb = tb_find_physical(cpu, pc, cs_base, flags);
+if (!tb) {
+/* if no translated code available, then translate it now */
+tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+}
+
+mmap_unlock();
+}
+
+/* We add the TB in the virtual pc hash table for the fast lookup */
+atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
 }
 #ifndef CONFIG_USER_ONLY
 /* We don't take care of direct jumps when address mapping changes in
@@ -437,7 +425,7 @@ static inline bool cpu_handle_exception(CPUState *cpu, int *ret)
 } else if (replay_has_exception()
&& cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
 /* try to cause an exception pending in the log */
-cpu_exec_nocache(cpu, 1, tb_find_fast(cpu, NULL, 0), true);
+cpu_exec_nocache(cpu, 1, tb_find(cpu, NULL, 0), true);
 *ret = -1;
 return true;
 #endif
@@ -621,7 +609,7 @@ int cpu_exec(CPUState *cpu)
 atomic_mb_set(&cpu->tb_flushed, false); /* reset before first TB lookup */
 for(;;) {
 cpu_handle_interrupt(cpu, &last_tb);
-tb = tb_find_fast(cpu, last_tb, tb_exit);
+tb = tb_find(cpu, last_tb, tb_exit);
 cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc);
 /* Try to align the host and virtual clocks
if the guest is in advance */
-- 
2.9.1




[Qemu-devel] [PULL 0/4] Block patches

2016-07-15 Thread Stefan Hajnoczi
The following changes since commit 14c7d99333e4a474c65bdae6f99aa8837e8078e6:

  Merge remote-tracking branch 'remotes/pmaydell/tags/pull-target-arm-20160714' into staging (2016-07-14 17:32:53 +0100)

are available in the git repository at:

  git://github.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to 325e6139a30a4c747cf35489ea5bee8da272:

  linux-aio: prevent submitting more than MAX_EVENTS (2016-07-15 12:45:47 +0100)





Cao jin (1):
  aio_ctx_check: follow CODING_STYLE

Paolo Bonzini (1):
  linux-aio: share one LinuxAioState within an AioContext

Roman Pen (1):
  linux-aio: prevent submitting more than MAX_EVENTS

Vladimir Sementsov-Ogievskiy (1):
  spec/parallels: fix a mistake

 async.c  |  25 +-
 block/linux-aio.c|  36 --
 block/raw-aio.h  |  68 ---
 block/raw-posix.c| 119 ++-
 block/raw-win32.c|   2 +-
 docs/specs/parallels.txt |   2 +-
 include/block/aio.h  |  13 ++
 include/block/raw-aio.h  |  68 +++
 8 files changed, 143 insertions(+), 190 deletions(-)
 delete mode 100644 block/raw-aio.h
 create mode 100644 include/block/raw-aio.h

-- 
2.7.4




[Qemu-devel] [PATCH v4 04/12] tcg: Prepare safe access to tb_flushed out of tb_lock

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Ensure atomicity and ordering of CPU's 'tb_flushed' access for future
translation block lookup out of 'tb_lock'.

This field can only be touched from another thread by tb_flush() in user
mode emulation. So the only accesses that need to be sequentially atomic
are:
 * the single write in tb_flush();
 * reads/writes out of 'tb_lock'.

In the future, before enabling MTTCG in system mode, tb_flush() must be
made safe and this field becomes unnecessary.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 

---
Changes in v4:
 - Commit message tweaked
---
 cpu-exec.c  | 16 +++-
 translate-all.c |  4 ++--
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 2fd1875a7317..c973e3b85922 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -338,13 +338,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
  tb->flags != flags)) {
 tb = tb_find_slow(cpu, pc, cs_base, flags);
 }
-if (cpu->tb_flushed) {
-/* Ensure that no TB jump will be modified as the
- * translation buffer has been flushed.
- */
-last_tb = NULL;
-cpu->tb_flushed = false;
-}
 #ifndef CONFIG_USER_ONLY
 /* We don't take care of direct jumps when address mapping changes in
  * system emulation. So it's not safe to make a direct jump to a TB
@@ -356,7 +349,12 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 #endif
 /* See if we can patch the calling TB. */
 if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-tb_add_jump(last_tb, tb_exit, tb);
+/* Check if translation buffer has been flushed */
+if (cpu->tb_flushed) {
+cpu->tb_flushed = false;
+} else {
+tb_add_jump(last_tb, tb_exit, tb);
+}
 }
 tb_unlock();
 return tb;
@@ -618,7 +616,7 @@ int cpu_exec(CPUState *cpu)
 }
 
 last_tb = NULL; /* forget the last executed TB after exception */
-cpu->tb_flushed = false; /* reset before first TB lookup */
+atomic_mb_set(&cpu->tb_flushed, false); /* reset before first TB lookup */
 for(;;) {
 cpu_handle_interrupt(cpu, &last_tb);
 tb = tb_find_fast(cpu, last_tb, tb_exit);
diff --git a/translate-all.c b/translate-all.c
index fdf520a86d68..788fed1e0765 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -845,7 +845,6 @@ void tb_flush(CPUState *cpu)
 > tcg_ctx.code_gen_buffer_size) {
 cpu_abort(cpu, "Internal error: code buffer overflow\n");
 }
-tcg_ctx.tb_ctx.nb_tbs = 0;
 
 CPU_FOREACH(cpu) {
 int i;
@@ -853,9 +852,10 @@ void tb_flush(CPUState *cpu)
 for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) {
 atomic_set(&cpu->tb_jmp_cache[i], NULL);
 }
-cpu->tb_flushed = true;
+atomic_mb_set(&cpu->tb_flushed, true);
 }
 
+tcg_ctx.tb_ctx.nb_tbs = 0;
 qht_reset_size(&tcg_ctx.tb_ctx.htable, CODE_GEN_HTABLE_SIZE);
 page_flush_tb();
 
-- 
2.9.1




[Qemu-devel] [PATCH v4 06/12] tcg: Introduce tb_mark_invalid() and tb_is_invalid()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

These functions will be used to make translation block invalidation safe
with concurrent lockless lookup in the global hash table.

Most targets don't use 'cs_base', so marking a TB as invalid is as simple
as assigning -1 to 'cs_base'. The SPARC target stores the next program
counter into 'cs_base', and -1 is a fine invalid value since the PC must
be a multiple of 4 in SPARC. The only odd target is i386, for which a
special flag is introduced in place of the removed 'HF_SOFTMMU_MASK'.

Suggested-by: Paolo Bonzini 
Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 include/exec/exec-all.h  | 10 ++
 target-alpha/cpu.h   | 14 ++
 target-arm/cpu.h | 14 ++
 target-cris/cpu.h| 14 ++
 target-i386/cpu.h| 17 +
 target-lm32/cpu.h| 14 ++
 target-m68k/cpu.h| 14 ++
 target-microblaze/cpu.h  | 14 ++
 target-mips/cpu.h| 14 ++
 target-moxie/cpu.h   | 14 ++
 target-openrisc/cpu.h| 14 ++
 target-ppc/cpu.h | 14 ++
 target-s390x/cpu.h   | 14 ++
 target-sh4/cpu.h | 14 ++
 target-sparc/cpu.h   | 14 ++
 target-sparc/translate.c |  1 +
 target-tilegx/cpu.h  | 14 ++
 target-tricore/cpu.h | 14 ++
 target-unicore32/cpu.h   | 14 ++
 target-xtensa/cpu.h  | 14 ++
 20 files changed, 266 insertions(+)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index acda7b613d53..a499c7c56eef 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -256,6 +256,16 @@ void tb_free(TranslationBlock *tb);
 void tb_flush(CPUState *cpu);
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
 
+static inline void tb_mark_invalid(TranslationBlock *tb)
+{
+cpu_get_invalid_tb_cpu_state(&tb->pc, &tb->cs_base, &tb->flags);
+}
+
+static inline bool tb_is_invalid(TranslationBlock *tb)
+{
+return cpu_tb_cpu_state_is_invalidated(tb->pc, tb->cs_base, tb->flags);
+}
+
 #if defined(USE_DIRECT_JUMP)
 
 #if defined(CONFIG_TCG_INTERPRETER)
diff --git a/target-alpha/cpu.h b/target-alpha/cpu.h
index ac5e801fb43b..f4ecabeb5b68 100644
--- a/target-alpha/cpu.h
+++ b/target-alpha/cpu.h
@@ -524,4 +524,18 @@ static inline void cpu_get_tb_cpu_state(CPUAlphaState *env, target_ulong *pc,
 *pflags = flags;
 }
 
+static inline void cpu_get_invalid_tb_cpu_state(target_ulong *pc,
+target_ulong *cs_base,
+uint32_t *flags)
+{
+*cs_base = -1;
+}
+
+static inline bool cpu_tb_cpu_state_is_invalidated(target_ulong pc,
+   target_ulong cs_base,
+   uint32_t flags)
+{
+return cs_base == -1;
+}
+
 #endif /* ALPHA_CPU_H */
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 76d824d315f7..068f58d6a278 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -2371,6 +2371,20 @@ static inline void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
 *cs_base = 0;
 }
 
+static inline void cpu_get_invalid_tb_cpu_state(target_ulong *pc,
+target_ulong *cs_base,
+uint32_t *flags)
+{
+*cs_base = -1;
+}
+
+static inline bool cpu_tb_cpu_state_is_invalidated(target_ulong pc,
+   target_ulong cs_base,
+   uint32_t flags)
+{
+return cs_base == -1;
+}
+
 enum {
 QEMU_PSCI_CONDUIT_DISABLED = 0,
 QEMU_PSCI_CONDUIT_SMC = 1,
diff --git a/target-cris/cpu.h b/target-cris/cpu.h
index 7d7fe6eb1cf4..a20154e06b31 100644
--- a/target-cris/cpu.h
+++ b/target-cris/cpu.h
@@ -296,6 +296,20 @@ static inline void cpu_get_tb_cpu_state(CPUCRISState *env, target_ulong *pc,
 | X_FLAG | PFIX_FLAG));
 }
 
+static inline void cpu_get_invalid_tb_cpu_state(target_ulong *pc,
+target_ulong *cs_base,
+uint32_t *flags)
+{
+*cs_base = -1;
+}
+
+static inline bool cpu_tb_cpu_state_is_invalidated(target_ulong pc,
+   target_ulong cs_base,
+   uint32_t flags)
+{
+return cs_base == -1;
+}
+
 #define cpu_list cris_cpu_list
 void cris_cpu_list(FILE *f, fprintf_function cpu_fprintf);
 
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 5b14a72baa6f..1e430ae07915 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -130,6 +130,8 @@
positions to ease oring with eflags. */
 /* current cpl */
 #define HF_CPL_SHIFT 0
+/* used to mark invalidated translatio

[Qemu-devel] [PATCH v4 12/12] tcg: rename tb_find_physical()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

In fact, this function does not exactly perform a lookup by physical
address as described in the comment on get_page_addr_code(). Thus
it may be a bit confusing to have "physical" in its name. So rename it
to tb_htable_lookup() to better reflect its actual functionality.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
---
 cpu-exec.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index ff138809046c..735541e753fb 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -259,7 +259,7 @@ static bool tb_cmp(const void *p, const void *d)
 return false;
 }
 
-static TranslationBlock *tb_find_physical(CPUState *cpu,
+static TranslationBlock *tb_htable_lookup(CPUState *cpu,
   target_ulong pc,
   target_ulong cs_base,
   uint32_t flags)
@@ -296,7 +296,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 if (unlikely(!tb || atomic_read(&tb->pc) != pc ||
  atomic_read(&tb->cs_base) != cs_base ||
  atomic_read(&tb->flags) != flags)) {
-tb = tb_find_physical(cpu, pc, cs_base, flags);
+tb = tb_htable_lookup(cpu, pc, cs_base, flags);
 if (!tb) {
 
 /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
@@ -310,7 +310,7 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 /* There's a chance that our desired tb has been translated while
  * taking the locks so we check again inside the lock.
  */
-tb = tb_find_physical(cpu, pc, cs_base, flags);
+tb = tb_htable_lookup(cpu, pc, cs_base, flags);
 if (!tb) {
 /* if no translated code available, then translate it now */
 tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
-- 
2.9.1




[Qemu-devel] [PATCH v4 05/12] target-i386: Remove redundant HF_SOFTMMU_MASK

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

'HF_SOFTMMU_MASK' is only set when 'CONFIG_SOFTMMU' is defined. So
there's no need in this flag: test 'CONFIG_SOFTMMU' instead.

Suggested-by: Paolo Bonzini 
Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 
---
 target-i386/cpu.c   |  3 ---
 target-i386/cpu.h   |  3 ---
 target-i386/translate.c | 12 
 3 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index fc209ee1cb8a..6e49e4ca8282 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -2725,9 +2725,6 @@ static void x86_cpu_reset(CPUState *s)
 
 /* init to reset state */
 
-#ifdef CONFIG_SOFTMMU
-env->hflags |= HF_SOFTMMU_MASK;
-#endif
 env->hflags2 |= HF2_GIF_MASK;
 
 cpu_x86_update_cr0(env, 0x6010);
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 776efe630ea3..5b14a72baa6f 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -130,8 +130,6 @@
positions to ease oring with eflags. */
 /* current cpl */
 #define HF_CPL_SHIFT 0
-/* true if soft mmu is being used */
-#define HF_SOFTMMU_SHIFT 2
 /* true if hardware interrupts must be disabled for next instruction */
 #define HF_INHIBIT_IRQ_SHIFT 3
 /* 16 or 32 segments */
@@ -161,7 +159,6 @@
 #define HF_MPX_IU_SHIFT 26 /* BND registers in-use */
 
 #define HF_CPL_MASK  (3 << HF_CPL_SHIFT)
-#define HF_SOFTMMU_MASK  (1 << HF_SOFTMMU_SHIFT)
 #define HF_INHIBIT_IRQ_MASK  (1 << HF_INHIBIT_IRQ_SHIFT)
 #define HF_CS32_MASK (1 << HF_CS32_SHIFT)
 #define HF_SS32_MASK (1 << HF_SS32_SHIFT)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 7dea18bd6345..e81fce7bc2b5 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -8224,9 +8224,9 @@ void gen_intermediate_code(CPUX86State *env, 
TranslationBlock *tb)
 dc->popl_esp_hack = 0;
 /* select memory access functions */
 dc->mem_index = 0;
-if (flags & HF_SOFTMMU_MASK) {
-   dc->mem_index = cpu_mmu_index(env, false);
-}
+#ifdef CONFIG_SOFTMMU
+dc->mem_index = cpu_mmu_index(env, false);
+#endif
 dc->cpuid_features = env->features[FEAT_1_EDX];
 dc->cpuid_ext_features = env->features[FEAT_1_ECX];
 dc->cpuid_ext2_features = env->features[FEAT_8000_0001_EDX];
@@ -8239,11 +8239,7 @@ void gen_intermediate_code(CPUX86State *env, 
TranslationBlock *tb)
 #endif
 dc->flags = flags;
 dc->jmp_opt = !(dc->tf || cs->singlestep_enabled ||
-(flags & HF_INHIBIT_IRQ_MASK)
-#ifndef CONFIG_SOFTMMU
-|| (flags & HF_SOFTMMU_MASK)
-#endif
-);
+(flags & HF_INHIBIT_IRQ_MASK));
 /* Do not optimize repz jumps at all in icount mode, because
rep movsS instructions are execured with different paths
in !repz_opt and repz_opt modes. The first one was used
-- 
2.9.1




[Qemu-devel] [PATCH v4 09/12] tcg: cpu-exec: remove tb_lock from the hot-path

2016-07-15 Thread Sergey Fedorov
From: Alex Bennée 

Lock contention in the hot path of moving between existing patched
TranslationBlocks is the main drag in multithreaded performance. This
patch pushes the tb_lock() usage down to the two places that really need
it:

  - code generation (tb_gen_code)
  - jump patching (tb_add_jump)

The rest of the code doesn't really need to hold a lock, as it either
uses per-CPU structures, is updated atomically, or is designed to be
used in concurrent read situations (qht_lookup).

To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
locks become NOPs anyway until the MTTCG work is completed.
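
Condensed, the slow path after this patch looks as follows (a
simplified sketch of the code in the diff below, not a verbatim copy):

static TranslationBlock *tb_find_slow(CPUState *cpu, target_ulong pc,
                                      target_ulong cs_base, uint32_t flags)
{
    /* First try: lock-free lookup in the QHT hash table */
    TranslationBlock *tb = tb_find_physical(cpu, pc, cs_base, flags);

    if (!tb) {
        /* Take the locks only when we may have to translate */
        mmap_lock();
        tb_lock();

        /* Double-check: another thread may have translated this block
         * while we were acquiring the locks */
        tb = tb_find_physical(cpu, pc, cs_base, flags);
        if (!tb) {
            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
        }

        tb_unlock();
        mmap_unlock();
    }

    /* Publish into the per-CPU fast-path cache */
    atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
    return tb;
}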

Signed-off-by: Alex Bennée 
Reviewed-by: Richard Henderson 
Reviewed-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 

---
v2 (hot path)
 - Add r-b tags
v1 (hot path, split from base-patches series)
 - revert name tweaking
 - drop test jmp_list_next outside lock
 - mention lock NOPs in comments
v3 (base-patches)
 - fix merge conflicts with Sergey's patch
---
 cpu-exec.c | 48 +---
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index e16df762f50a..bbaed5bb1978 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -286,35 +286,29 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
 TranslationBlock *tb;
 
 tb = tb_find_physical(cpu, pc, cs_base, flags);
-if (tb) {
-goto found;
-}
+if (!tb) {
 
-#ifdef CONFIG_USER_ONLY
-/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
- * taken outside tb_lock.  Since we're momentarily dropping
- * tb_lock, there's a chance that our desired tb has been
- * translated.
- */
-tb_unlock();
-mmap_lock();
-tb_lock();
-tb = tb_find_physical(cpu, pc, cs_base, flags);
-if (tb) {
-mmap_unlock();
-goto found;
-}
-#endif
+/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
+ * taken outside tb_lock. As system emulation is currently
+ * single threaded the locks are NOPs.
+ */
+mmap_lock();
+tb_lock();
 
-/* if no translated code available, then translate it now */
-tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+/* There's a chance that our desired tb has been translated while
+ * taking the locks so we check again inside the lock.
+ */
+tb = tb_find_physical(cpu, pc, cs_base, flags);
+if (!tb) {
+/* if no translated code available, then translate it now */
+tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+}
 
-#ifdef CONFIG_USER_ONLY
-mmap_unlock();
-#endif
+tb_unlock();
+mmap_unlock();
+}
 
-found:
-/* we add the TB in the virtual pc hash table */
+/* We add the TB in the virtual pc hash table for the fast lookup */
 atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
 return tb;
 }
@@ -332,7 +326,6 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
always be the same before a given translated block
is executed. */
 cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
-tb_lock();
 tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
 if (unlikely(!tb || atomic_read(&tb->pc) != pc ||
  atomic_read(&tb->cs_base) != cs_base ||
@@ -350,14 +343,15 @@ static inline TranslationBlock *tb_find_fast(CPUState 
*cpu,
 #endif
 /* See if we can patch the calling TB. */
 if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+tb_lock();
 /* Check if translation buffer has been flushed */
 if (cpu->tb_flushed) {
 cpu->tb_flushed = false;
 } else if (!tb_is_invalid(tb)) {
 tb_add_jump(last_tb, tb_exit, tb);
 }
+tb_unlock();
 }
-tb_unlock();
 return tb;
 }
 
-- 
2.9.1




[Qemu-devel] [PATCH v4 01/12] util/qht: Document memory ordering assumptions

2016-07-15 Thread Sergey Fedorov
From: Paolo Bonzini 

It is naturally expected that some memory ordering should be provided
around qht_insert() and qht_lookup(). Document these assumptions in the
header file and put some comments in the source to denote how those
memory ordering requirements are fulfilled.
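
As an illustration of the contract these comments document, here is a
minimal sketch; the entry type and the publish/consume helpers are
invented for the example:

struct entry {
    uint32_t hash;
    int payload;
};

static bool entry_cmp(const void *obj, const void *userp)
{
    return obj == userp;    /* identity match is enough for the sketch */
}

/* Writer: initialize first, then insert.  The smp_wmb() implied by
 * qht_insert() guarantees that a reader who finds @e also sees
 * e->payload == 42. */
static void publish(struct qht *ht, struct entry *e)
{
    e->payload = 42;
    qht_insert(ht, e, e->hash);
}

/* Reader (under an RCU read-critical section): the implied
 * smp_read_barrier_depends() makes it safe to dereference the
 * returned pointer without further barriers. */
static struct entry *consume(struct qht *ht, struct entry *key)
{
    return qht_lookup(ht, entry_cmp, key, key->hash);
}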

Signed-off-by: Paolo Bonzini 
[Sergey Fedorov: commit title and message provided;
comment on qht_remove() elided]
Signed-off-by: Sergey Fedorov 

---
Changes in v4:
 - Modified version of Paolo's patch is used
---
 include/qemu/qht.h | 5 +
 util/qht.c | 7 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/qemu/qht.h b/include/qemu/qht.h
index 70bfc68b8d67..311139b85a9a 100644
--- a/include/qemu/qht.h
+++ b/include/qemu/qht.h
@@ -69,6 +69,9 @@ void qht_destroy(struct qht *ht);
  * Attempting to insert a NULL @p is a bug.
  * Inserting the same pointer @p with different @hash values is a bug.
  *
+ * In case of successful operation, smp_wmb() is implied before the pointer is
+ * inserted into the hash table.
+ *
  * Returns true on sucess.
  * Returns false if the @p-@hash pair already exists in the hash table.
  */
@@ -83,6 +86,8 @@ bool qht_insert(struct qht *ht, void *p, uint32_t hash);
  *
  * Needs to be called under an RCU read-critical section.
  *
+ * smp_read_barrier_depends() is implied before the call to @func.
+ *
  * The user-provided @func compares pointers in QHT against @userp.
  * If the function returns true, a match has been found.
  *
diff --git a/util/qht.c b/util/qht.c
index 40d6e218f759..28ce289245a7 100644
--- a/util/qht.c
+++ b/util/qht.c
@@ -445,7 +445,11 @@ void *qht_do_lookup(struct qht_bucket *head, 
qht_lookup_func_t func,
 do {
 for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
 if (b->hashes[i] == hash) {
-void *p = atomic_read(&b->pointers[i]);
+/* The pointer is dereferenced before seqlock_read_retry,
+ * so (unlike qht_insert__locked) we need to use
+ * atomic_rcu_read here.
+ */
+void *p = atomic_rcu_read(&b->pointers[i]);
 
 if (likely(p) && likely(func(p, userp))) {
 return p;
@@ -535,6 +539,7 @@ static bool qht_insert__locked(struct qht *ht, struct 
qht_map *map,
 atomic_rcu_set(&prev->next, b);
 }
 b->hashes[i] = hash;
+/* smp_wmb() implicit in seqlock_write_begin.  */
 atomic_set(&b->pointers[i], p);
 seqlock_write_end(&head->sequence);
 return true;
-- 
2.9.1




[Qemu-devel] [PATCH v4 07/12] tcg: Prepare TB invalidation for lockless TB lookup

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

When invalidating a translation block, set an invalid CPU state into the
TranslationBlock structure first.

As soon as the TB is marked with an invalid CPU state, there is no need
to remove it from CPU's 'tb_jmp_cache'. However it will be necessary to
recheck whether the target TB is still valid after acquiring 'tb_lock'
but before calling tb_add_jump() since TB lookup is to be performed out
of 'tb_lock' in the future. Note that we don't have to check 'last_tb':
it is safe to patch an already invalidated TB, since it will not be
executed anyway.

Suggested-by: Paolo Bonzini 
Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 

---
Changes in v4:
 - smp_wmb() removed after tb_mark_invalid()
 - atomic access to TB CPU state
---
 cpu-exec.c  |  7 ---
 include/exec/exec-all.h |  8 +++-
 translate-all.c | 11 ++-
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index c973e3b85922..e16df762f50a 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -334,8 +334,9 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
 tb_lock();
 tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
-if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
- tb->flags != flags)) {
+if (unlikely(!tb || atomic_read(&tb->pc) != pc ||
+ atomic_read(&tb->cs_base) != cs_base ||
+ atomic_read(&tb->flags) != flags)) {
 tb = tb_find_slow(cpu, pc, cs_base, flags);
 }
 #ifndef CONFIG_USER_ONLY
@@ -352,7 +353,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 /* Check if translation buffer has been flushed */
 if (cpu->tb_flushed) {
 cpu->tb_flushed = false;
-} else {
+} else if (!tb_is_invalid(tb)) {
 tb_add_jump(last_tb, tb_exit, tb);
 }
 }
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index a499c7c56eef..8f0afcdbd62a 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -258,7 +258,13 @@ void tb_phys_invalidate(TranslationBlock *tb, 
tb_page_addr_t page_addr);
 
 static inline void tb_mark_invalid(TranslationBlock *tb)
 {
-cpu_get_invalid_tb_cpu_state(&tb->pc, &tb->cs_base, &tb->flags);
+target_ulong pc = 0, cs_base = 0;
+uint32_t flags = 0;
+
+cpu_get_invalid_tb_cpu_state(&pc, &cs_base, &flags);
+atomic_set(&tb->pc, pc);
+atomic_set(&tb->cs_base, cs_base);
+atomic_set(&tb->flags, flags);
 }
 
 static inline bool tb_is_invalid(TranslationBlock *tb)
diff --git a/translate-all.c b/translate-all.c
index 788fed1e0765..9db72e8982b1 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -986,11 +986,12 @@ static inline void tb_jmp_unlink(TranslationBlock *tb)
 /* invalidate one TB */
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
 {
-CPUState *cpu;
 PageDesc *p;
 uint32_t h;
 tb_page_addr_t phys_pc;
 
+tb_mark_invalid(tb);
+
 /* remove the TB from the hash list */
 phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
 h = tb_hash_func(phys_pc, tb->pc, tb->flags);
@@ -1008,14 +1009,6 @@ void tb_phys_invalidate(TranslationBlock *tb, 
tb_page_addr_t page_addr)
 invalidate_page_bitmap(p);
 }
 
-/* remove the TB from the hash list */
-h = tb_jmp_cache_hash_func(tb->pc);
-CPU_FOREACH(cpu) {
-if (atomic_read(&cpu->tb_jmp_cache[h]) == tb) {
-atomic_set(&cpu->tb_jmp_cache[h], NULL);
-}
-}
-
 /* suppress this TB from the two jump lists */
 tb_remove_from_jmp_list(tb, 0);
 tb_remove_from_jmp_list(tb, 1);
-- 
2.9.1




[Qemu-devel] [PATCH v4 02/12] tcg: Pass last_tb by value to tb_find_fast()

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

This is a small cleanup. tb_find_fast() is the final consumer of this
variable, so there is no need to pass it by reference. 'last_tb' is
always updated by the subsequent cpu_loop_exec_tb() in cpu_exec().

This change also simplifies calling cpu_exec_nocache() in
cpu_handle_exception().

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 

---
Changes in v4:
 - Compile error fixed (missed conversion)
---
 cpu-exec.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index b840e1d2dd41..974de6aa27ee 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -320,7 +320,7 @@ found:
 }
 
 static inline TranslationBlock *tb_find_fast(CPUState *cpu,
- TranslationBlock **last_tb,
+ TranslationBlock *last_tb,
  int tb_exit)
 {
 CPUArchState *env = (CPUArchState *)cpu->env_ptr;
@@ -342,7 +342,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
 /* Ensure that no TB jump will be modified as the
  * translation buffer has been flushed.
  */
-*last_tb = NULL;
+last_tb = NULL;
 cpu->tb_flushed = false;
 }
 #ifndef CONFIG_USER_ONLY
@@ -351,12 +351,12 @@ static inline TranslationBlock *tb_find_fast(CPUState 
*cpu,
  * spanning two pages because the mapping for the second page can change.
  */
 if (tb->page_addr[1] != -1) {
-*last_tb = NULL;
+last_tb = NULL;
 }
 #endif
 /* See if we can patch the calling TB. */
-if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-tb_add_jump(*last_tb, tb_exit, tb);
+if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+tb_add_jump(last_tb, tb_exit, tb);
 }
 tb_unlock();
 return tb;
@@ -437,8 +437,7 @@ static inline bool cpu_handle_exception(CPUState *cpu, int 
*ret)
 } else if (replay_has_exception()
&& cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
 /* try to cause an exception pending in the log */
-TranslationBlock *last_tb = NULL; /* Avoid chaining TBs */
-cpu_exec_nocache(cpu, 1, tb_find_fast(cpu, &last_tb, 0), true);
+cpu_exec_nocache(cpu, 1, tb_find_fast(cpu, NULL, 0), true);
 *ret = -1;
 return true;
 #endif
@@ -622,7 +621,7 @@ int cpu_exec(CPUState *cpu)
 cpu->tb_flushed = false; /* reset before first TB lookup */
 for(;;) {
 cpu_handle_interrupt(cpu, &last_tb);
-tb = tb_find_fast(cpu, &last_tb, tb_exit);
+tb = tb_find_fast(cpu, last_tb, tb_exit);
 cpu_loop_exec_tb(cpu, tb, &last_tb, &tb_exit, &sc);
 /* Try to align the host and virtual clocks
if the guest is in advance */
-- 
2.9.1




[Qemu-devel] [PATCH v4 00/12] Reduce lock contention on TCG hot-path

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Hi,

This is a respin of this series [1].

Here I used a modified version of Paolo's patch to document memory
ordering assumptions for certain QHT operations.

The last patch is a suggestion for renaming tb_find_physical().

This series can be fetched from the public git repository:

https://github.com/sergefdrv/qemu.git lockless-tb-lookup-v4

[1] http://thread.gmane.org/gmane.comp.emulators.qemu/426341

Kind regards,
Sergey

Summary of changes in v4:
 - Modified version of Paolo's patch is used to document memory ordering
   assumptions for certain QHT operations
 - Intermediate compilation errors fixed
 - Atomic access to TB CPU state
 - tb_find_physical() renamed
Summary of changes in v3:
 - QHT memory ordering assumptions documented
 - 'tb_jmp_cache' reset in tb_flush() made atomic
 - explicit memory barriers removed around 'tb_jmp_cache' access
 - safe access to 'tb_flushed' out of 'tb_lock' prepared
 - TBs marked with invalid CPU state early on invalidation
 - Alex's tb_find_{fast,slow}() roll-up related patches dropped
 - bouncing of tb_lock between tb_gen_code() and tb_add_jump() avoided
   with local variable 'have_tb_lock'
 - tb_find_{fast,slow}() merged

Alex Bennée (2):
  tcg: set up tb->page_addr before insertion
  tcg: cpu-exec: remove tb_lock from the hot-path

Paolo Bonzini (1):
  util/qht: Document memory ordering assumptions

Sergey Fedorov (9):
  tcg: Pass last_tb by value to tb_find_fast()
  tcg: Prepare safe tb_jmp_cache lookup out of tb_lock
  tcg: Prepare safe access to tb_flushed out of tb_lock
  target-i386: Remove redundant HF_SOFTMMU_MASK
  tcg: Introduce tb_mark_invalid() and tb_is_invalid()
  tcg: Prepare TB invalidation for lockless TB lookup
  tcg: Avoid bouncing tb_lock between tb_gen_code() and tb_add_jump()
  tcg: Merge tb_find_slow() and tb_find_fast()
  tcg: rename tb_find_physical()

 cpu-exec.c   | 117 +--
 include/exec/exec-all.h  |  16 +++
 include/qemu/qht.h   |   5 ++
 target-alpha/cpu.h   |  14 ++
 target-arm/cpu.h |  14 ++
 target-cris/cpu.h|  14 ++
 target-i386/cpu.c|   3 --
 target-i386/cpu.h|  20 ++--
 target-i386/translate.c  |  12 ++---
 target-lm32/cpu.h|  14 ++
 target-m68k/cpu.h|  14 ++
 target-microblaze/cpu.h  |  14 ++
 target-mips/cpu.h|  14 ++
 target-moxie/cpu.h   |  14 ++
 target-openrisc/cpu.h|  14 ++
 target-ppc/cpu.h |  14 ++
 target-s390x/cpu.h   |  14 ++
 target-sh4/cpu.h |  14 ++
 target-sparc/cpu.h   |  14 ++
 target-sparc/translate.c |   1 +
 target-tilegx/cpu.h  |  14 ++
 target-tricore/cpu.h |  14 ++
 target-unicore32/cpu.h   |  14 ++
 target-xtensa/cpu.h  |  14 ++
 translate-all.c  |  29 ++--
 util/qht.c   |   7 ++-
 26 files changed, 352 insertions(+), 96 deletions(-)

-- 
2.9.1




[Qemu-devel] [PATCH v4 03/12] tcg: Prepare safe tb_jmp_cache lookup out of tb_lock

2016-07-15 Thread Sergey Fedorov
From: Sergey Fedorov 

Ensure atomicity of CPU's 'tb_jmp_cache' access for future translation
block lookup out of 'tb_lock'.

Note that this patch does *not* make CPU's TLB invalidation safe if it
is done from some other thread while the CPU is in its execution loop.

Signed-off-by: Sergey Fedorov 
Signed-off-by: Sergey Fedorov 
[AJB: fixed missing atomic set, tweak title]
Signed-off-by: Alex Bennée 
[Sergey Fedorov: removed explicit memory barriers;
removed unnecessary atomic_read();
tweaked commit title and message]
Signed-off-by: Sergey Fedorov 
Reviewed-by: Alex Bennée 

---
Changes in v3:
 - explicit memory barriers removed
 - memset() on 'tb_jmp_cache' replaced with a loop on atomic_set()
Changes in v2:
 - fix spelling s/con't/can't/
 - add atomic_read while clearing tb_jmp_cache
 - add r-b tags
Changes in v1 (AJB):
 - tweak title
 - fixed missing set of tb_jmp_cache
---
 cpu-exec.c  |  4 ++--
 translate-all.c | 10 +++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 974de6aa27ee..2fd1875a7317 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -315,7 +315,7 @@ static TranslationBlock *tb_find_slow(CPUState *cpu,
 
 found:
 /* we add the TB in the virtual pc hash table */
-cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb;
+atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
 return tb;
 }
 
@@ -333,7 +333,7 @@ static inline TranslationBlock *tb_find_fast(CPUState *cpu,
is executed. */
 cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
 tb_lock();
-tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)];
+tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
 if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
  tb->flags != flags)) {
 tb = tb_find_slow(cpu, pc, cs_base, flags);
diff --git a/translate-all.c b/translate-all.c
index 0d47c1c0cf82..fdf520a86d68 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -848,7 +848,11 @@ void tb_flush(CPUState *cpu)
 tcg_ctx.tb_ctx.nb_tbs = 0;
 
 CPU_FOREACH(cpu) {
-memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache));
+int i;
+
+for (i = 0; i < TB_JMP_CACHE_SIZE; ++i) {
+atomic_set(&cpu->tb_jmp_cache[i], NULL);
+}
 cpu->tb_flushed = true;
 }
 
@@ -1007,8 +1011,8 @@ void tb_phys_invalidate(TranslationBlock *tb, 
tb_page_addr_t page_addr)
 /* remove the TB from the hash list */
 h = tb_jmp_cache_hash_func(tb->pc);
 CPU_FOREACH(cpu) {
-if (cpu->tb_jmp_cache[h] == tb) {
-cpu->tb_jmp_cache[h] = NULL;
+if (atomic_read(&cpu->tb_jmp_cache[h]) == tb) {
+atomic_set(&cpu->tb_jmp_cache[h], NULL);
 }
 }
 
-- 
2.9.1




Re: [Qemu-devel] [PATCH] checkpatch: consider git extended headers valid patches

2016-07-15 Thread Eric Blake
On 07/15/2016 03:46 AM, Stefan Hajnoczi wrote:
> Renames look like this with git-diff(1) when diff.renames = true is set:
> 
>   diff --git a/a b/b
>   similarity index 100%
>   rename from a
>   rename to b
> 
> This raises the "Does not appear to be a unified-diff format patch"
> error because checkpatch.pl only considers a diff valid if it contains
> at least one "@@" hunk.
> 
> This patch accepts renames and copies too so that checkpatch.pl exits
> successfully when a diff only renames/copies files.  The git diff
> extended header format is described on the git-diff(1) man page.
> 
> Reported-by: Colin Lord 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  scripts/checkpatch.pl | 5 +
>  1 file changed, 5 insertions(+)

Reviewed-by: Eric Blake 

and especially nice since we recommend diff.renames = true in our patch
submission guidelines.


-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-devel] [PATCH v3 0/2] trace: [*-user] Add commandline arguments to control tracing

2016-07-15 Thread Peter Maydell
On 15 July 2016 at 18:48, Lluís Vilanova  wrote:
> Peter Maydell writes:
>
>> On 15 July 2016 at 18:08, Lluís Vilanova  wrote:
>>> Adds three commandline arguments to the main *-user programs, following 
>>> what's
>>> already available in softmmu:
>>>
>>> * -trace-enable
>>> * -trace-events
>>> * -trace-file
>
>> So when would you want to use these rather than the existing
>> -d trace:pattern and -D logfile options?
>
> These are different logs/traces. The ones you point out are textual traces for
> TCG guest/target instructions (and where to write them). The ones I've added 
> are
> for QEMU's tracing infrastructure.

-d trace:pattern works for any tracepoint.

> Note that both (-trace vs -d/-D) are also available in system mode (vl.c).

True, so consistency probably makes it worth providing both interfaces.

thanks
-- PMM



Re: [Qemu-devel] [PATCH v3 0/2] trace: [*-user] Add commandline arguments to control tracing

2016-07-15 Thread Lluís Vilanova
Peter Maydell writes:

> On 15 July 2016 at 18:08, Lluís Vilanova  wrote:
>> Adds three commandline arguments to the main *-user programs, following 
>> what's
>> already available in softmmu:
>> 
>> * -trace-enable
>> * -trace-events
>> * -trace-file

> So when would you want to use these rather than the existing
> -d trace:pattern and -D logfile options?

These are different logs/traces. The ones you point out are textual traces for
TCG guest/target instructions (and where to write them). The ones I've added are
for QEMU's tracing infrastructure.

Note that both (-trace vs -d/-D) are also available in system mode (vl.c).

BTW, I forgot to update the series cover according to v3's changes.


Cheers,
  Lluis



[Qemu-devel] [PATCH] linux-user: Implement FS_IOC_GETFLAGS and FS_IOC_SETFLAGS ioctls

2016-07-15 Thread Peter Maydell
Implement the FS_IOC_GETFLAGS and FS_IOC_SETFLAGS ioctls, as used
by chattr.

Note that the type information encoded in these ioctl numbers
is at odds with the actual type the kernel accesses, as discussed
in http://thread.gmane.org/gmane.linux.file-systems/80164.
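
For context, a minimal user-space sketch of how chattr-style code
drives these ioctls; the file name is arbitrary and error handling is
kept to a minimum:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(void)
{
    int attr;   /* the kernel accesses an int, despite _IOR('f', 1, long) */
    int fd = open("testfile", O_RDONLY);

    if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &attr) < 0) {
        perror("FS_IOC_GETFLAGS");
        return 1;
    }
    attr |= FS_APPEND_FL;                       /* chattr +a */
    if (ioctl(fd, FS_IOC_SETFLAGS, &attr) < 0) {
        perror("FS_IOC_SETFLAGS");
        return 1;
    }
    return 0;
}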

Signed-off-by: Peter Maydell 
---
We need these ioctls for chattr, which in turn is needed to run
the LTP utimensat test cases.
---
 linux-user/ioctls.h   | 3 +++
 linux-user/syscall_defs.h | 6 ++
 2 files changed, 9 insertions(+)

diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h
index 7e2c133..1bad701 100644
--- a/linux-user/ioctls.h
+++ b/linux-user/ioctls.h
@@ -120,6 +120,9 @@
MK_PTR(MK_STRUCT(STRUCT_fiemap)))
 #endif
 
+ IOCTL(FS_IOC_GETFLAGS, IOC_R, MK_PTR(TYPE_INT))
+ IOCTL(FS_IOC_SETFLAGS, IOC_W, MK_PTR(TYPE_INT))
+
   IOCTL(SIOCATMARK, IOC_R, MK_PTR(TYPE_INT))
   IOCTL(SIOCGIFNAME, IOC_RW, MK_PTR(TYPE_INT))
   IOCTL(SIOCGIFFLAGS, IOC_W | IOC_R, MK_PTR(MK_STRUCT(STRUCT_short_ifreq)))
diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h
index 7835654..d9dea0e 100644
--- a/linux-user/syscall_defs.h
+++ b/linux-user/syscall_defs.h
@@ -998,6 +998,12 @@ struct target_pollfd {
 
 #define TARGET_FIBMAP TARGET_IO(0x00,1)  /* bmap access */
 #define TARGET_FIGETBSZ   TARGET_IO(0x00,2)  /* get the block size used for 
bmap */
+/* Note that the ioctl numbers claim type "long" but the actual type
+ * used by the kernel is "int".
+ */
+#define TARGET_FS_IOC_GETFLAGS TARGET_IOR('f', 1, long)
+#define TARGET_FS_IOC_SETFLAGS TARGET_IOW('f', 2, long)
+
 #define TARGET_FS_IOC_FIEMAP TARGET_IOWR('f',11,struct fiemap)
 
 /* cdrom commands */
-- 
1.9.1




Re: [Qemu-devel] QOM: best way for parents to pass information to children? (was Re: [PATCH RFC 07/16] qom/cpu: make nr-cores, nr-threads real properties)

2016-07-15 Thread Eduardo Habkost
On Fri, Jul 15, 2016 at 06:30:41PM +0200, Andreas Färber wrote:
> Am 15.07.2016 um 18:10 schrieb Eduardo Habkost:
> > On Fri, Jul 15, 2016 at 11:11:38AM +0200, Igor Mammedov wrote:
> >> On Fri, 15 Jul 2016 08:35:30 +0200
> >> Andrew Jones  wrote:
> >>> On Thu, Jul 14, 2016 at 05:07:43PM -0300, Eduardo Habkost wrote:
> 
>  First of all, sorry for the horrible delay in replying to this
>  thread.
> 
>  On Wed, Jun 15, 2016 at 10:56:20AM +1000, David Gibson wrote:  
> > On Tue, Jun 14, 2016 at 08:19:49AM +0200, Andrew Jones wrote:  
> >> On Tue, Jun 14, 2016 at 12:12:16PM +1000, David Gibson wrote:  
> >>> On Sun, Jun 12, 2016 at 03:48:10PM +0200, Andrew Jones wrote:  
> > [...]
> >> +static Property cpu_common_properties[] = {
> >> +DEFINE_PROP_INT32("nr-cores", CPUState, nr_cores, 1),
> >> +DEFINE_PROP_INT32("nr-threads", CPUState, nr_threads, 1),
> >> +DEFINE_PROP_END_OF_LIST()
> >> +};  
> >
> > Are you aware of the current CPU hotplug discussion that is going 
> > on?  
> 
>  I'm aware of it going on, but haven't been following it.
>    
> > I'm not very involved there, but I think some of these reworks also 
> > move
> > "nr_threads" into the CPU state already, e.g. see:  
> 
>  nr_threads (and nr_cores) are already state in CPUState. This patch 
>  just
>  exposes that state via properties.
>    
> >
> > https://github.com/dgibson/qemu/commit/9d07719784ecbeebea71
> >
> > ... so you might want to check these patches first to see whether 
> > you
> > can base your rework on them?  
> 
>  Every cpu, and thus every machine, uses CPUState for its cpus. I'm
>  not sure every machine will want to use that new abstract core class
>  though. If they did, then we could indeed use nr_threads from there
>  instead (and remove it from CPUState), but we'd still need nr_cores
>  from the abstract cpu package class (CPUState).  
> >>>
> >>> Hmm.  Since the CPUState object represents just a single thread, it
> >>> seems weird to me that it would have nr_threads and nr_cores
> >>> information.  
> 
>  Agreed it is weird, and I think we should try to move it away
>  from CPUState, not make it part of the TYPE_CPU interface.
>  nr_threads belongs to the actual container of the Thread objects,
>  and nr_cores in the actual container of the Core objects.
> 
>  The problem is how to implement that in a non-intrusive way that
>  would require changing the object hierarchy of all architectures.
> 
>    
> >>>
> >>> Exposing those as properties makes that much worse, because it's now
> >>> ABI, rather than internal detail we can clean up at some future time. 
> >>>  
> >>
> >> CPUState is supposed to be "State of one CPU core or thread", which
> >> justifies having nr_threads state, as it may be describing a core.  
> >
> > Um.. does it ever actually represent a (multithread) core in practice?
> > It would need to have duplicated register state for every thread were
> > that the case.  
> 
>  AFAIK, CPUState is still always thread state. Or has this changed
>  in some architectures, already?
>    
> >   
> >> I guess there's no justification for having nr_cores in there though.
> >> I agree adding the Core class is a good idea, assuming it will get used
> >> by all machines, and CPUState then gets changed to a Thread class. The
> >> question then, though, is do we also create a Socket class that 
> >> contains
> >> nr_cores?  
> >
> > That was roughly our intention with the way the cross platform hotplug
> > stuff is evolving.  But the intention was that the Socket objects
> > would only need to be constructed for machine types where it makes
> > sense.  So for example on the paravirt pseries platform, we'll only
> > have Core objects, because the socket distinction isn't really
> > meaningful.
> >   
> >> And how will a Thread method get that information when it
> >> needs to emulate, e.g. CPUID, that requires it? It's a bit messy, so
> >> I'm open to all suggestions on it.  
> >
> > So, if the Thread needs this information, I'm not opposed to it having
> > it internally (presumably populated earlier from the Core object).
> > But I am opposed to it being a locked in part of the interface by
> > having it as an exposed property.  
> 
>  I agree we don't want to make this part of the external
>  interface. In this case, if we don't add the properties, how
>  exactly is the Machine or Core code supposed to pass that
>  information to the Thread object?
> 
>  Maybe the intermediate steps could be:
> 
>  

[Qemu-devel] [PATCH] linux-user: fix "really futimens" condition in sys_utimensat()

2016-07-15 Thread Peter Maydell
In some configurations we implement sys_utimensat() via a wrapper
that calls either futimens() or utimensat(), depending on the
arguments (to handle a case where the Linux syscall API diverges
from the glibc API). Fix a corner case in this handling:
if the syscall is passed a NULL pathname and dirfd == AT_FDCWD,
then it must fail with EFAULT. We can't handle this by passing
it to glibc utimensat() because at the libc level a NULL
pathname is failed with EINVAL, and we can't handle it by
passing to futimens() because that would fail with EBADF.
So special case it and return EFAULT directly from the wrapper.

This means that if the guest calls utimes() with a NULL pathname
and guest glibc converts that into a syscall utimensat(AT_FDCWD,
NULL, ...) then we correctly fail it with EFAULT.
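
A small host-side demonstration of the divergence described above
(illustrative only; assumes a Linux host with glibc):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    /* Raw syscall: NULL pathname with AT_FDCWD must fail with EFAULT */
    if (syscall(SYS_utimensat, AT_FDCWD, NULL, NULL, 0) < 0) {
        printf("raw syscall: errno=%d (EFAULT=%d)\n", errno, EFAULT);
    }
    /* glibc wrapper: a NULL pathname is rejected with EINVAL instead */
    if (utimensat(AT_FDCWD, NULL, NULL, 0) < 0) {
        printf("glibc wrapper: errno=%d (EINVAL=%d)\n", errno, EINVAL);
    }
    return 0;
}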

Signed-off-by: Peter Maydell 
---
 linux-user/syscall.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 0e87157..61ea58b 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -367,10 +367,15 @@ static int sys_getcwd1(char *buf, size_t size)
 static int sys_utimensat(int dirfd, const char *pathname,
 const struct timespec times[2], int flags)
 {
-if (pathname == NULL)
+if (pathname == NULL) {
+if (dirfd == AT_FDCWD) {
+errno = EFAULT;
+return -1;
+}
 return futimens(dirfd, times);
-else
+} else {
 return utimensat(dirfd, pathname, times, flags);
+}
 }
 #elif defined(__NR_utimensat)
 #define __NR_sys_utimensat __NR_utimensat
-- 
1.9.1




[Qemu-devel] [PATCH] vfio/pci: Hide ARI capability

2016-07-15 Thread Alex Williamson
QEMU supports ARI on downstream ports, and assigned devices may support
ARI in their extended capabilities.  The endpoint ARI capability
specifies the next function, so that the OS doesn't need to walk
each possible function; however, this next function is relative to the
host, not the guest.  This leads to device discovery issues when we
combine separate functions into virtual multi-function packages in a
guest.  For example, SR-IOV VFs are not enumerated by simply probing
the function address space, therefore the ARI next-function field is
zero.  When we combine multiple VFs together as a multi-function
device in the guest, the guest OS identifies that ARI is enabled, relies
on this next-function field, and stops looking for additional functions
after the first is found.

Long term we should expose the ARI capability to the guest to enable
configurations with more than 8 functions per slot, but this requires
additional QEMU PCI infrastructure to manage the next-function field
for multiple, otherwise independent devices.  In the short term,
hiding this capability allows equivalent functionality to what we
currently have on non-express chipsets.
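
For illustration, a sketch of how enumeration code reads the field in
question; the register layout follows Linux's pci_regs.h, while the
helper itself is hypothetical:

#include <stdint.h>

#define PCI_ARI_CAP        0x04                 /* ARI Capability Register */
#define PCI_ARI_CAP_NFN(x) (((x) >> 8) & 0xff)  /* Next Function Number */

/* @ari_offset is where the ARI extended capability was found in config
 * space.  For an assigned VF this NFN is host-relative (often 0), so a
 * guest following it stops enumerating after function 0. */
static uint8_t ari_next_function(const uint8_t *config, uint16_t ari_offset)
{
    uint16_t cap = config[ari_offset + PCI_ARI_CAP] |
                   (config[ari_offset + PCI_ARI_CAP + 1] << 8);
    return PCI_ARI_CAP_NFN(cap);
}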

Signed-off-by: Alex Williamson 
---
 hw/vfio/pci.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 44783c5..c8436a1 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1828,6 +1828,7 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev)
 
 switch (cap_id) {
 case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
+case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
 trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
 break;
 default:




Re: [Qemu-devel] [PATCH v6 3/6] tests: in IDE and AHCI tests perform DMA write before flushing

2016-07-15 Thread Eric Blake
On 07/15/2016 02:08 AM, Evgeny Yakovlev wrote:

>>> + * Write sector 0 with random data to make AHCI storage dirty
>> If we ever have a case where we open a disk without specifying -raw, the
>> random data _might_ resemble some other format and cause probe to
>> misbehave; as such, we also have code in the block layer that
>> specifically prevents writes to sector 0 for some data. Should you pick
>> a different sector than 0, so as to avoid any (remote) possibility that
>> the random data could change probe results or be rejected?
>>
> 
> Not sure if I understand the problem you're referring to here. Those are
> blkdebug tests, those disks are created, emulated with blkdebug backend,
> flushed and then thrown away. So is there really any possibility for
> reopening the image and accidentally parsing a partition table in sector 0?
> 
> Also, not sure what you mean by "code in the block layer that
> specifically prevents writes to sector 0 for some data". Can you explain
> that bit, because it sounds pretty scary. How can we deny guest VM to
> write anything to sector 0 on its emulated disk?

Read block/raw_bsd.c:raw_co_writev_flags() for the gory details.  If the
guest ever gets a raw format driver because the user forgot to say
'--format $foo', then we prevent the guest from writing anything into
sector 0 that would be probed as non-raw.  It means there are only a
handful of patterns that the guest cannot write into the first sector,
but it IS a non-zero number of patterns.  How the guest behaves if such
a write is attempted depends on the error policy you have on that
device; it might show up as an EIO error to the guest, or it might stop
the guest from executing and raise a qemu event to the management
application, but the point is that we actively prohibit some writes to
sector 0 on a probed raw disk.  Using any sector other than 0 doesn't
have this limitation, or you can ensure that your test ALWAYS passes the
appropriate --format $foo so that the disk is never probed as another
way to avoid limitations on sector 0.


-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-devel] [PATCH v3 0/2] trace: [*-user] Add commandline arguments to control tracing

2016-07-15 Thread Peter Maydell
On 15 July 2016 at 18:08, Lluís Vilanova  wrote:
> Adds three commandline arguments to the main *-user programs, following what's
> already available in softmmu:
>
> * -trace-enable
> * -trace-events
> * -trace-file

So when would you want to use these rather than the existing
-d trace:pattern and -D logfile options?

thanks
-- PMM



Re: [Qemu-devel] [PATCH 1/2] hostmem: fix QEMU crash by 'info memdev'

2016-07-15 Thread Eric Blake
On 07/15/2016 12:56 AM, Xiao Guangrong wrote:

>>> Note that you don't have to call visit_next_list() in a virtual visit.
>>> For an example, see prop_get_fdt().  Good enough already?
>>
>> Yes, definitely!  I'm queueing Guangrong's patch because it fixes a
>> crash and the leak existed before, but without next_list we can indeed
>> visit a "virtual" list and fix the leak.  It can be done during the -rc
>> period.
> 
> So you want to build a uint16List and save it as a "virtual" list in
> host_memory_backend_get_host_nodes(), then its caller can directly fetch
> this 'virtual' list from the visit?

With a virtual list visit, you don't even need a uint16List object.
Merely call visit_start_list(NULL) to start the list with no matching
uint16List, then visit_type_int16() for each list element (note no
visit_next_list() calls), then visit_end_list().
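
A minimal sketch of such a virtual visit for the host-nodes property,
assuming the 2016-era visitor signatures and eliding error propagation:

static void host_memory_backend_get_host_nodes(Object *obj, Visitor *v,
                                               const char *name,
                                               void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    unsigned long node;
    uint16_t value;

    /* Virtual visit: no uint16List is ever allocated, so nothing leaks */
    visit_start_list(v, name, NULL, 0, errp);

    for (node = find_first_bit(backend->host_nodes, MAX_NODES);
         node < MAX_NODES;
         node = find_next_bit(backend->host_nodes, MAX_NODES, node + 1)) {
        value = node;
        visit_type_uint16(v, NULL, &value, errp);  /* no visit_next_list() */
    }

    visit_end_list(v);
}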


-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org





[Qemu-devel] [PATCH v3 2/2] trace: [bsd-user] Commandline arguments to control tracing

2016-07-15 Thread Lluís Vilanova
Signed-off-by: Lluís Vilanova 
---
 bsd-user/main.c |   16 
 1 file changed, 16 insertions(+)

diff --git a/bsd-user/main.c b/bsd-user/main.c
index 4819b9e..3bef796 100644
--- a/bsd-user/main.c
+++ b/bsd-user/main.c
@@ -21,6 +21,7 @@
 
 #include "qapi/error.h"
 #include "qemu.h"
+#include "qemu/config-file.h"
 #include "qemu/path.h"
 #include "qemu/help_option.h"
 /* For tb_lock */
@@ -30,6 +31,8 @@
 #include "qemu/timer.h"
 #include "qemu/envlist.h"
 #include "exec/log.h"
+#include "trace/control.h"
+#include "glib-compat.h"
 
 int singlestep;
 unsigned long mmap_min_addr;
@@ -687,6 +690,8 @@ static void usage(void)
"-p pagesize   set the host page size to 'pagesize'\n"
"-singlestep   always run in singlestep mode\n"
"-strace   log system calls\n"
+   "-trace
[[enable=]][,events=][,file=]\n"
+   "  specify tracing options\n"
"\n"
"Environment variables:\n"
"QEMU_STRACE   Print system calls and arguments similar to 
the\n"
@@ -735,6 +740,7 @@ int main(int argc, char **argv)
 int gdbstub_port = 0;
 char **target_environ, **wrk;
 envlist_t *envlist = NULL;
+const char *trace_file = NULL;
 bsd_type = target_openbsd;
 
 if (argc <= 1)
@@ -754,6 +760,8 @@ int main(int argc, char **argv)
 
 cpu_model = NULL;
 
+qemu_add_opts(&qemu_trace_opts);
+
 optind = 1;
 for(;;) {
 if (optind >= argc)
@@ -840,6 +848,9 @@ int main(int argc, char **argv)
 singlestep = 1;
 } else if (!strcmp(r, "strace")) {
 do_strace = 1;
+} else if (!strcmp(r, "trace")) {
+g_free(trace_file);
+trace_file = trace_opt_parse(optarg);
 } else
 {
 usage();
@@ -865,6 +876,11 @@ int main(int argc, char **argv)
 }
 filename = argv[optind];
 
+if (!trace_init_backends()) {
+exit(1);
+}
+trace_init_file(trace_file);
+
 /* Zero out regs */
 memset(regs, 0, sizeof(struct target_pt_regs));
 




[Qemu-devel] [PATCH v3 1/2] trace: [linux-user] Commandline arguments to control tracing

2016-07-15 Thread Lluís Vilanova
Signed-off-by: Lluís Vilanova 
---
 linux-user/main.c |   19 +++
 1 file changed, 19 insertions(+)

diff --git a/linux-user/main.c b/linux-user/main.c
index 617a179..53be5dd 100644
--- a/linux-user/main.c
+++ b/linux-user/main.c
@@ -24,6 +24,7 @@
 #include "qapi/error.h"
 #include "qemu.h"
 #include "qemu/path.h"
+#include "qemu/config-file.h"
 #include "qemu/cutils.h"
 #include "qemu/help_option.h"
 #include "cpu.h"
@@ -33,6 +34,8 @@
 #include "qemu/envlist.h"
 #include "elf.h"
 #include "exec/log.h"
+#include "trace/control.h"
+#include "glib-compat.h"
 
 char *exec_path;
 
@@ -4001,6 +4004,13 @@ static void handle_arg_version(const char *arg)
 exit(EXIT_SUCCESS);
 }
 
+static const char * trace_file = NULL;
+static void handle_arg_trace(const char *arg)
+{
+g_free((char*)trace_file);
+trace_file = trace_opt_parse(arg);
+}
+
 struct qemu_argument {
 const char *argv;
 const char *env;
@@ -4048,6 +4058,8 @@ static const struct qemu_argument arg_table[] = {
  "",   "log system calls"},
 {"seed",   "QEMU_RAND_SEED",   true,  handle_arg_randseed,
  "",   "Seed for pseudo-random number generator"},
+{"trace",  "QEMU_TRACE",   true,  handle_arg_trace,
+ "",   "[[enable=]][,events=][,file=]"},
 {"version","QEMU_VERSION", false, handle_arg_version,
  "",   "display version information and exit"},
 {NULL, NULL, false, NULL, NULL, NULL}
@@ -4237,8 +4249,15 @@ int main(int argc, char **argv, char **envp)
 
 srand(time(NULL));
 
+qemu_add_opts(&qemu_trace_opts);
+
 optind = parse_args(argc, argv);
 
+if (!trace_init_backends()) {
+exit(1);
+}
+trace_init_file(trace_file);
+
 /* Zero out regs */
 memset(regs, 0, sizeof(struct target_pt_regs));
 




[Qemu-devel] [PATCH v3 0/2] trace: [*-user] Add commandline arguments to control tracing

2016-07-15 Thread Lluís Vilanova
Adds three commandline arguments to the main *-user programs, following what's
already available in softmmu:

* -trace-enable
* -trace-events
* -trace-file


Changes in v2
=

* Tell user to use 'help' instead of '?' [Eric Blake].
* Remove newlines on argument docs for bsd-user [Eric Blake].


Changes in v3
=

* Use new trace_opt_parse() [Stefan Hajnoczi].


Signed-off-by: Lluís Vilanova 
---

Lluís Vilanova (2):
  trace: [linux-user] Commandline arguments to control tracing
  trace: [bsd-user] Commandline arguments to control tracing


 bsd-user/main.c   |   16 
 linux-user/main.c |   19 +++
 2 files changed, 35 insertions(+)


To: qemu-devel@nongnu.org
Cc: Stefan Hajnoczi 
Cc: Eric Blake 



Re: [Qemu-devel] [PATCH v2 0/2] trace: [*-user] Add commandline arguments to control tracing

2016-07-15 Thread Lluís Vilanova
Stefan Hajnoczi writes:

> On Wed, Jun 22, 2016 at 12:04:30PM +0200, Lluís Vilanova wrote:
>> Adds three commandline arguments to the main *-user programs, following 
>> what's
>> already available in softmmu:
>> 
>> * -trace-enable
>> * -trace-events
>> * -trace-file
>> 
>> 
>> Changes in v2
>> =
>> 
>> * Tell user to use 'help' instead of '?' [Eric Blake].
>> * Remove newlines on argument docs for bsd-user [Eric Blake].
>> 
>> 
>> Signed-off-by: Lluís Vilanova 
>> ---
>> 
>> Lluís Vilanova (2):
>> trace: [linux-user] Commandline arguments to control tracing
>> trace: [bsd-user] Commandline arguments to control tracing
>> 
>> 
>> bsd-user/main.c   |   19 +++
>> linux-user/main.c |   28 
>> 2 files changed, 47 insertions(+)

> Hi Lluís,
> Commit e9e0bb2af2248eabafb54402e3127f9f8a8690f5 ("trace: move
> qemu_trace_opts to trace/control.c") made trace_events_init() static.
> This conflicts with your patch series.

> I suggest changing this series to use -trace ... just like
> qemu/qemu-img/qemu-nbd.

I'll send a new version adapted to that.

Thanks,
  Lluis



[Qemu-devel] [PATCH] migration: set state to post-migrate on failure

2016-07-15 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

If a migration fails/is cancelled during the postcopy stage we currently
end up with the runstate as finish-migrate, where it should be post-migrate.
There's a small window in precopy where I think the same thing can
happen, but I've never seen it.

It rarely matters; the only postcopy case is if you restart a migration,
which again is a case that rarely matters in postcopy because it's only
safe to restart the migration if you know the destination hasn't
been running (which you might if you started the destination with -S
and hadn't got around to 'c'-ing it before the postcopy failed).
Even then it's a small window, but you could potentially hit it if
there's a problem loading the devices on the destination.

This corresponds to:
https://bugzilla.redhat.com/show_bug.cgi?id=1355683

Signed-off-by: Dr. David Alan Gilbert 
---
 migration/migration.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index c4e0193..955d5ee 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1837,6 +1837,10 @@ static void *migration_thread(void *opaque)
 } else {
 if (old_vm_running && !entered_postcopy) {
 vm_start();
+} else {
+if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
+runstate_set(RUN_STATE_POSTMIGRATE);
+}
 }
 }
 qemu_bh_schedule(s->cleanup_bh);
-- 
2.7.4




[Qemu-devel] [PATCH] megasas: remove useless check for cmd->frame

2016-07-15 Thread Paolo Bonzini
megasas_enqueue_frame always returns with non-NULL cmd->frame.
Remove the "else" part as it is dead code.

Signed-off-by: Paolo Bonzini 
---
 hw/scsi/megasas.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index 52a4123..e968302 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -1981,11 +1981,7 @@ static void megasas_handle_frame(MegasasState *s, 
uint64_t frame_addr,
 break;
 }
 if (frame_status != MFI_STAT_INVALID_STATUS) {
-if (cmd->frame) {
-cmd->frame->header.cmd_status = frame_status;
-} else {
-megasas_frame_set_cmd_status(s, frame_addr, frame_status);
-}
+cmd->frame->header.cmd_status = frame_status;
 megasas_unmap_frame(s, cmd);
 megasas_complete_frame(s, cmd->context);
 }
-- 
2.7.4



