Re: [Qemu-block] [Qemu-devel] [PATCH] vmdk: Switch to heap arrays for vmdk_write_cid

2016-03-07 Thread Fam Zheng
On Tue, 03/08 15:10, Peter Xu wrote:
> On Tue, Mar 08, 2016 at 02:18:35PM +0800, Fam Zheng wrote:
> > It is only called once for each opened image, so we can do it the easy
> > way.
> > 
> > Signed-off-by: Fam Zheng 
> > ---
> >  block/vmdk.c | 25 ++---
> >  1 file changed, 14 insertions(+), 11 deletions(-)
> > 
> > diff --git a/block/vmdk.c b/block/vmdk.c
> > index a8db5d9..1ec2452 100644
> > --- a/block/vmdk.c
> > +++ b/block/vmdk.c
> > @@ -274,36 +274,39 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, 
> > int parent)
> >  
> >  static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
> 
> If to use heap in write, do we need to do the same for e.g.,
> vmdk_parent_open() and vmdk_read_cid(), to make all things at least
> aligned?

Yes, let's change them altogether.

Fam



[Qemu-block] [PATCH] vmdk: Switch to heap arrays for vmdk_write_cid

2016-03-07 Thread Fam Zheng
It is only called once for each opened image, so we can do it the easy
way.

Signed-off-by: Fam Zheng 
---
 block/vmdk.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index a8db5d9..1ec2452 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -274,36 +274,39 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int 
parent)
 
 static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
 {
-char desc[DESC_SIZE], tmp_desc[DESC_SIZE];
+char *desc, *tmp_desc;
 char *p_name, *tmp_str;
 BDRVVmdkState *s = bs->opaque;
-int ret;
+int ret = 0;
 
+desc = g_malloc0(DESC_SIZE);
+tmp_desc = g_malloc0(DESC_SIZE);
 ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
 if (ret < 0) {
-return ret;
+goto out;
 }
 
 desc[DESC_SIZE - 1] = '\0';
 tmp_str = strstr(desc, "parentCID");
 if (tmp_str == NULL) {
-return -EINVAL;
+ret = -EINVAL;
+goto out;
 }
 
-pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+pstrcpy(tmp_desc, DESC_SIZE, tmp_str);
 p_name = strstr(desc, "CID");
 if (p_name != NULL) {
 p_name += sizeof("CID");
-snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid);
-pstrcat(desc, sizeof(desc), tmp_desc);
+snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid);
+pstrcat(desc, DESC_SIZE, tmp_desc);
 }
 
 ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE);
-if (ret < 0) {
-return ret;
-}
 
-return 0;
+out:
+g_free(desc);
+g_free(tmp_desc);
+return ret;
 }
 
 static int vmdk_is_cid_valid(BlockDriverState *bs)
-- 
2.4.3




Re: [Qemu-block] [PATCH v4 08/26] crypto: add support for the twofish cipher algorithm

2016-03-07 Thread Fam Zheng
On Mon, 02/29 12:00, Daniel P. Berrange wrote:
> New cipher algorithms 'twofish-128', 'twofish-192' and
> 'twofish-256' are defined for the Twofish algorithm.
> The gcrypt backend does not support 'twofish-192'.
> 
> The nettle and gcrypt cipher backends are updated to
> support the new cipher and a test vector added to the
> cipher test suite. The new algorithm is enabled in the
> LUKS block encryption driver.
> 
> Signed-off-by: Daniel P. Berrange 
> ---
>  crypto/cipher-gcrypt.c | 12 
>  crypto/cipher-nettle.c | 30 ++
>  crypto/cipher.c|  6 ++
>  qapi/crypto.json   |  6 +-
>  tests/test-crypto-cipher.c | 29 +
>  5 files changed, 82 insertions(+), 1 deletion(-)
> 
> diff --git a/crypto/cipher-gcrypt.c b/crypto/cipher-gcrypt.c
> index a667d59..ce26456 100644
> --- a/crypto/cipher-gcrypt.c
> +++ b/crypto/cipher-gcrypt.c
> @@ -33,6 +33,8 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
>  case QCRYPTO_CIPHER_ALG_SERPENT_128:
>  case QCRYPTO_CIPHER_ALG_SERPENT_192:
>  case QCRYPTO_CIPHER_ALG_SERPENT_256:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_128:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_256:
>  return true;
>  default:
>  return false;
> @@ -104,6 +106,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm 
> alg,
>  gcryalg = GCRY_CIPHER_SERPENT256;
>  break;
>  
> +case QCRYPTO_CIPHER_ALG_TWOFISH_128:
> +gcryalg = GCRY_CIPHER_TWOFISH128;
> +break;
> +
> +case QCRYPTO_CIPHER_ALG_TWOFISH_256:
> +gcryalg = GCRY_CIPHER_TWOFISH;
> +break;
> +
>  default:
>  error_setg(errp, "Unsupported cipher algorithm %d", alg);
>  return NULL;
> @@ -140,6 +150,8 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm 
> alg,
>  case QCRYPTO_CIPHER_ALG_SERPENT_128:
>  case QCRYPTO_CIPHER_ALG_SERPENT_192:
>  case QCRYPTO_CIPHER_ALG_SERPENT_256:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_128:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_256:
>  ctx->blocksize = 16;
>  break;
>  case QCRYPTO_CIPHER_ALG_CAST5_128:
> diff --git a/crypto/cipher-nettle.c b/crypto/cipher-nettle.c
> index 74b55ab..9b057fd 100644
> --- a/crypto/cipher-nettle.c
> +++ b/crypto/cipher-nettle.c
> @@ -25,6 +25,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #if CONFIG_NETTLE_VERSION_MAJOR < 3
>  typedef nettle_crypt_func nettle_cipher_func;
> @@ -89,6 +90,18 @@ static void serpent_decrypt_wrapper(cipher_ctx_t ctx, 
> cipher_length_t length,
>  serpent_decrypt(ctx, length, dst, src);
>  }
>  
> +static void twofish_encrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
> +uint8_t *dst, const uint8_t *src)
> +{
> +twofish_encrypt(ctx, length, dst, src);
> +}
> +
> +static void twofish_decrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length,
> +uint8_t *dst, const uint8_t *src)
> +{
> +twofish_decrypt(ctx, length, dst, src);
> +}
> +
>  typedef struct QCryptoCipherNettle QCryptoCipherNettle;
>  struct QCryptoCipherNettle {
>  void *ctx_encrypt;
> @@ -110,6 +123,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg)
>  case QCRYPTO_CIPHER_ALG_SERPENT_128:
>  case QCRYPTO_CIPHER_ALG_SERPENT_192:
>  case QCRYPTO_CIPHER_ALG_SERPENT_256:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_128:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_192:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_256:
>  return true;
>  default:
>  return false;
> @@ -200,6 +216,20 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm 
> alg,
>  ctx->blocksize = SERPENT_BLOCK_SIZE;
>  break;
>  
> +case QCRYPTO_CIPHER_ALG_TWOFISH_128:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_192:
> +case QCRYPTO_CIPHER_ALG_TWOFISH_256:
> +ctx->ctx_encrypt = g_new0(struct twofish_ctx, 1);
> +ctx->ctx_decrypt = NULL; /* 1 ctx can do both */
> +
> +twofish_set_key(ctx->ctx_encrypt, nkey, key);
> +
> +ctx->alg_encrypt = twofish_encrypt_wrapper;
> +ctx->alg_decrypt = twofish_decrypt_wrapper;
> +
> +ctx->blocksize = TWOFISH_BLOCK_SIZE;
> +break;
> +
>  default:
>  error_setg(errp, "Unsupported cipher algorithm %d", alg);
>  goto error;
> diff --git a/crypto/cipher.c b/crypto/cipher.c
> index 0f6fe98..89fa5a2 100644
> --- a/crypto/cipher.c
> +++ b/crypto/cipher.c
> @@ -31,6 +31,9 @@ static size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = {
>  [QCRYPTO_CIPHER_ALG_SERPENT_128] = 16,
>  [QCRYPTO_CIPHER_ALG_SERPENT_192] = 24,
>  [QCRYPTO_CIPHER_ALG_SERPENT_256] = 32,
> +[QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16,
> +[QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24,
> +[QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32,
>  };
>  
>  static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = {
> @@ -42,6 

[Qemu-block] [PATCH v4 15/15] block: More operations for meta dirty bitmap

2016-03-07 Thread Fam Zheng
Callers can create an iterator of meta bitmap with
bdrv_dirty_meta_iter_new(), then use the bdrv_dirty_iter_* operations on
it. Meta iterators are also counted by bitmap->active_iterators.

Also add a couple of functions to retrieve granularity and count.

Signed-off-by: Fam Zheng 
---
 block/dirty-bitmap.c | 19 +++
 include/block/dirty-bitmap.h |  3 +++
 2 files changed, 22 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 3aef91a..3992eb5 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -393,6 +393,11 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap 
*bitmap)
 return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
 }
 
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap)
+{
+return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->meta);
+}
+
 BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
  uint64_t first_sector)
 {
@@ -403,6 +408,15 @@ BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap 
*bitmap,
 return iter;
 }
 
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap)
+{
+BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1);
+hbitmap_iter_init(>hbi, bitmap->meta, 0);
+iter->bitmap = bitmap;
+bitmap->active_iterators++;
+return iter;
+}
+
 void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
 {
 if (!iter) {
@@ -514,3 +528,8 @@ int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
 {
 return hbitmap_count(bitmap->bitmap);
 }
+
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap)
+{
+return hbitmap_count(bitmap->meta);
+}
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 40a09c0..3cbed02 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -30,6 +30,7 @@ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
 uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap);
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap);
 const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
@@ -47,12 +48,14 @@ int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
 void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
   BdrvDirtyBitmap *bitmap, int64_t sector,
   int nb_sectors);
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap);
 BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
  uint64_t first_sector);
 void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
 int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
 void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
 
 uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
-- 
2.4.3




[Qemu-block] [PATCH v4 12/15] hbitmap: serialization

2016-03-07 Thread Fam Zheng
From: Vladimir Sementsov-Ogievskiy 

Functions to serialize / deserialize(restore) HBitmap. HBitmap should be
saved to linear sequence of bits independently of endianness and bitmap
array element (unsigned long) size. Therefore Little Endian is chosen.

These functions are appropriate for dirty bitmap migration, restoring
the bitmap in several steps is available. To save performance, every
step writes only the last level of the bitmap. All other levels are
restored by hbitmap_deserialize_finish() as a last step of restoring.
So, HBitmap is inconsistent while restoring.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
[Fix left shift operand to 1UL; add "finish" parameter. - Fam]
Signed-off-by: Fam Zheng 
---
 include/qemu/hbitmap.h |  79 
 util/hbitmap.c | 137 +
 2 files changed, 216 insertions(+)

diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index f8ed058..26cac7d 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -146,6 +146,85 @@ void hbitmap_reset_all(HBitmap *hb);
 bool hbitmap_get(const HBitmap *hb, uint64_t item);
 
 /**
+ * hbitmap_serialization_granularity:
+ * @hb: HBitmap to operate on.
+ *
+ * Granularity of serialization chunks, used by other serialization functions.
+ * For every chunk:
+ * 1. Chunk start should be aligned to this granularity.
+ * 2. Chunk size should be aligned too, except for last chunk (for which
+ *  start + count == hb->size)
+ */
+uint64_t hbitmap_serialization_granularity(const HBitmap *hb);
+
+/**
+ * hbitmap_serialization_size:
+ * @hb: HBitmap to operate on.
+ * @start: Starting bit
+ * @count: Number of bits
+ *
+ * Return number of bytes hbitmap_(de)serialize_part needs
+ */
+uint64_t hbitmap_serialization_size(const HBitmap *hb,
+uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_serialize_part
+ * @hb: HBitmap to operate on.
+ * @buf: Buffer to store serialized bitmap.
+ * @start: First bit to store.
+ * @count: Number of bits to store.
+ *
+ * Stores HBitmap data corresponding to given region. The format of saved data
+ * is linear sequence of bits, so it can be used by hbitmap_deserialize_part
+ * independently of endianness and size of HBitmap level array elements
+ */
+void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf,
+uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_deserialize_part
+ * @hb: HBitmap to operate on.
+ * @buf: Buffer to restore bitmap data from.
+ * @start: First bit to restore.
+ * @count: Number of bits to restore.
+ * @finish: Whether to call hbitmap_deserialize_finish automatically.
+ *
+ * Restores HBitmap data corresponding to given region. The format is the same
+ * as for hbitmap_serialize_part.
+ *
+ * If @finish is false, caller must call hbitmap_serialize_finish before using
+ * the bitmap.
+ */
+void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf,
+  uint64_t start, uint64_t count,
+  bool finish);
+
+/**
+ * hbitmap_deserialize_zeroes
+ * @hb: HBitmap to operate on.
+ * @start: First bit to restore.
+ * @count: Number of bits to restore.
+ * @finish: Whether to call hbitmap_deserialize_finish automatically.
+ *
+ * Fills the bitmap with zeroes.
+ *
+ * If @finish is false, caller must call hbitmap_serialize_finish before using
+ * the bitmap.
+ */
+void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count,
+bool finish);
+
+/**
+ * hbitmap_deserialize_finish
+ * @hb: HBitmap to operate on.
+ *
+ * Repair HBitmap after calling hbitmap_deserialize_data. Actually, all HBitmap
+ * layers are restored here.
+ */
+void hbitmap_deserialize_finish(HBitmap *hb);
+
+/**
  * hbitmap_free:
  * @hb: HBitmap to operate on.
  *
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 2d3d04c..5f02c17 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -395,6 +395,143 @@ bool hbitmap_get(const HBitmap *hb, uint64_t item)
 return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
 }
 
+uint64_t hbitmap_serialization_granularity(const HBitmap *hb)
+{
+/* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit
+ * hosts. */
+return 64 << hb->granularity;
+}
+
+/* Start should be aligned to serialization granularity, chunk size should be
+ * aligned to serialization granularity too, except for last chunk.
+ */
+static void serialization_chunk(const HBitmap *hb,
+uint64_t start, uint64_t count,
+unsigned long **first_el, size_t *el_count)
+{
+uint64_t last = start + count - 1;
+uint64_t gran = hbitmap_serialization_granularity(hb);
+
+assert((start & (gran - 1)) == 0);
+assert((last >> hb->granularity) < hb->size);
+if ((last >> hb->granularity) != 

[Qemu-block] [PATCH v4 13/15] block: BdrvDirtyBitmap serialization interface

2016-03-07 Thread Fam Zheng
From: Vladimir Sementsov-Ogievskiy 

Several functions to provide necessary access to BdrvDirtyBitmap for
block-migration.c

Signed-off-by: Vladimir Sementsov-Ogievskiy 
[Add the "finish" parameters. - Fam]
Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 block/dirty-bitmap.c | 37 +
 include/block/dirty-bitmap.h | 14 ++
 2 files changed, 51 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 295788d..3aef91a 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -453,6 +453,43 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, 
HBitmap *in)
 hbitmap_free(tmp);
 }
 
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count)
+{
+return hbitmap_serialization_size(bitmap->bitmap, start, count);
+}
+
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap)
+{
+return hbitmap_serialization_granularity(bitmap->bitmap);
+}
+
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+  uint8_t *buf, uint64_t start,
+  uint64_t count)
+{
+hbitmap_serialize_part(bitmap->bitmap, buf, start, count);
+}
+
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+uint8_t *buf, uint64_t start,
+uint64_t count, bool finish)
+{
+hbitmap_deserialize_part(bitmap->bitmap, buf, start, count, finish);
+}
+
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count,
+  bool finish)
+{
+hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish);
+}
+
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap)
+{
+hbitmap_deserialize_finish(bitmap->bitmap);
+}
+
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
 int nr_sectors)
 {
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index caa4d82..40a09c0 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -55,4 +55,18 @@ void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t 
sector_num);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
 
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count);
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+  uint8_t *buf, uint64_t start,
+  uint64_t count);
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+uint8_t *buf, uint64_t start,
+uint64_t count, bool finish);
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+  uint64_t start, uint64_t count,
+  bool finish);
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap);
+
 #endif
-- 
2.4.3




[Qemu-block] [PATCH v4 06/15] block: Hide HBitmap in block dirty bitmap interface

2016-03-07 Thread Fam Zheng
HBitmap is an implementation detail of block dirty bitmap that should be hidden
from users. Introduce a BdrvDirtyBitmapIter to encapsulate the underlying
HBitmapIter.

A small difference in the interface is, before, an HBitmapIter is initialized
in place, now the new BdrvDirtyBitmapIter must be dynamically allocated because
the structure definition is in block.c.

Two current users are converted too.

Signed-off-by: Fam Zheng 
---
 block/backup.c   | 14 --
 block/dirty-bitmap.c | 39 +--
 block/mirror.c   | 15 +--
 include/block/dirty-bitmap.h |  7 +--
 include/qemu/typedefs.h  |  1 +
 5 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index ab3e345..f5a5e03 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -330,14 +330,14 @@ static int coroutine_fn 
backup_run_incremental(BackupBlockJob *job)
 int64_t last_cluster = -1;
 int64_t sectors_per_cluster = cluster_size_sectors(job);
 BlockDriverState *bs = job->common.bs;
-HBitmapIter hbi;
+BdrvDirtyBitmapIter *dbi;
 
 granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
 clusters_per_iter = MAX((granularity / job->cluster_size), 1);
-bdrv_dirty_iter_init(job->sync_bitmap, );
+dbi = bdrv_dirty_iter_new(job->sync_bitmap, 0);
 
 /* Find the next dirty sector(s) */
-while ((sector = hbitmap_iter_next()) != -1) {
+while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {
 cluster = sector / sectors_per_cluster;
 
 /* Fake progress updates for any clusters we skipped */
@@ -349,7 +349,7 @@ static int coroutine_fn 
backup_run_incremental(BackupBlockJob *job)
 for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
 do {
 if (yield_and_check(job)) {
-return ret;
+goto out;
 }
 ret = backup_do_cow(bs, cluster * sectors_per_cluster,
 sectors_per_cluster, _is_read,
@@ -357,7 +357,7 @@ static int coroutine_fn 
backup_run_incremental(BackupBlockJob *job)
 if ((ret < 0) &&
 backup_error_action(job, error_is_read, -ret) ==
 BLOCK_ERROR_ACTION_REPORT) {
-return ret;
+goto out;
 }
 } while (ret < 0);
 }
@@ -365,7 +365,7 @@ static int coroutine_fn 
backup_run_incremental(BackupBlockJob *job)
 /* If the bitmap granularity is smaller than the backup granularity,
  * we need to advance the iterator pointer to the next cluster. */
 if (granularity < job->cluster_size) {
-bdrv_set_dirty_iter(, cluster * sectors_per_cluster);
+bdrv_set_dirty_iter(dbi, cluster * sectors_per_cluster);
 }
 
 last_cluster = cluster - 1;
@@ -377,6 +377,8 @@ static int coroutine_fn 
backup_run_incremental(BackupBlockJob *job)
 job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
 }
 
+out:
+bdrv_dirty_iter_free(dbi);
 return ret;
 }
 
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 556e1d1..16f73b2 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -42,9 +42,15 @@ struct BdrvDirtyBitmap {
 char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap (Number of sectors) */
 bool disabled;  /* Bitmap is read-only */
+int active_iterators;   /* How many iterators are active */
 QLIST_ENTRY(BdrvDirtyBitmap) list;
 };
 
+struct BdrvDirtyBitmapIter {
+HBitmapIter hbi;
+BdrvDirtyBitmap *bitmap;
+};
+
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
 {
 BdrvDirtyBitmap *bm;
@@ -212,6 +218,7 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
 
 QLIST_FOREACH(bitmap, >dirty_bitmaps, list) {
 assert(!bdrv_dirty_bitmap_frozen(bitmap));
+assert(!bitmap->active_iterators);
 hbitmap_truncate(bitmap->bitmap, size);
 bitmap->size = size;
 }
@@ -224,6 +231,7 @@ static void 
bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
 BdrvDirtyBitmap *bm, *next;
 QLIST_FOREACH_SAFE(bm, >dirty_bitmaps, list, next) {
 if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
+assert(!bitmap->active_iterators);
 assert(!bdrv_dirty_bitmap_frozen(bm));
 QLIST_REMOVE(bm, list);
 hbitmap_free(bm->bitmap);
@@ -320,9 +328,29 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap 
*bitmap)
 return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
 }
 
-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
+ uint64_t 

[Qemu-block] [PATCH v4 10/15] block: Add two dirty bitmap getters

2016-03-07 Thread Fam Zheng
For dirty bitmap users to get the size and the name of a
BdrvDirtyBitmap.

Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 block/dirty-bitmap.c | 10 ++
 include/block/dirty-bitmap.h |  2 ++
 2 files changed, 12 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index a2413c0..50bb9e0 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -154,6 +154,16 @@ void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
 hbitmap_reset(bitmap->meta, sector, nb_sectors);
 }
 
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap)
+{
+return bitmap->size;
+}
+
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap)
+{
+return bitmap->name;
+}
+
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 {
 return bitmap->successor;
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 50e0fca..caa4d82 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -32,6 +32,8 @@ uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState 
*bs);
 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap);
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap);
 DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap);
 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
int64_t sector);
-- 
2.4.3




[Qemu-block] [PATCH v4 07/15] HBitmap: Introduce "meta" bitmap to track bit changes

2016-03-07 Thread Fam Zheng
Upon each bit toggle, the corresponding bit in the meta bitmap will be
set.

Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 block/dirty-bitmap.c   |  2 +-
 include/qemu/hbitmap.h | 17 +
 util/hbitmap.c | 66 ++
 3 files changed, 69 insertions(+), 16 deletions(-)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 16f73b2..0a188f2 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -231,7 +231,7 @@ static void 
bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
 BdrvDirtyBitmap *bm, *next;
 QLIST_FOREACH_SAFE(bm, >dirty_bitmaps, list, next) {
 if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
-assert(!bitmap->active_iterators);
+assert(!bm->active_iterators);
 assert(!bdrv_dirty_bitmap_frozen(bm));
 QLIST_REMOVE(bm, list);
 hbitmap_free(bm->bitmap);
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index e29188c..f8ed058 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -178,6 +178,23 @@ void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap 
*hb, uint64_t first);
  */
 unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi);
 
+/* hbitmap_create_meta:
+ * Create a "meta" hbitmap to track dirtiness of the bits in this HBitmap.
+ * The caller owns the created bitmap and must call hbitmap_free_meta(hb) to
+ * free it.
+ *
+ * @hb: The HBitmap to operate on.
+ * @chunk_size: How many bits in @hb does one bit in the meta track.
+ */
+HBitmap *hbitmap_create_meta(HBitmap *hb, int chunk_size);
+
+/* hbitmap_free_meta:
+ * Free the meta bitmap of @hb.
+ *
+ * @hb: The HBitmap whose meta bitmap should be freed.
+ */
+void hbitmap_free_meta(HBitmap *hb);
+
 /**
  * hbitmap_iter_next:
  * @hbi: HBitmapIter to operate on.
diff --git a/util/hbitmap.c b/util/hbitmap.c
index b22b87d..2d3d04c 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -79,6 +79,9 @@ struct HBitmap {
  */
 int granularity;
 
+/* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */
+HBitmap *meta;
+
 /* A number of progressively less coarse bitmaps (i.e. level 0 is the
  * coarsest).  Each bit in level N represents a word in level N+1 that
  * has a set bit, except the last level where each bit represents the
@@ -210,25 +213,27 @@ static uint64_t hb_count_between(HBitmap *hb, uint64_t 
start, uint64_t last)
 }
 
 /* Setting starts at the last layer and propagates up if an element
- * changes from zero to non-zero.
+ * changes.
  */
 static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t 
last)
 {
 unsigned long mask;
-bool changed;
+unsigned long old;
 
 assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
 assert(start <= last);
 
 mask = 2UL << (last & (BITS_PER_LONG - 1));
 mask -= 1UL << (start & (BITS_PER_LONG - 1));
-changed = (*elem == 0);
+old = *elem;
 *elem |= mask;
-return changed;
+return old != *elem;
 }
 
-/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
-static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t 
last)
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_set_between(HBitmap *hb, int level, uint64_t start,
+   uint64_t last)
 {
 size_t pos = start >> BITS_PER_LEVEL;
 size_t lastpos = last >> BITS_PER_LEVEL;
@@ -257,22 +262,27 @@ static void hb_set_between(HBitmap *hb, int level, 
uint64_t start, uint64_t last
 if (level > 0 && changed) {
 hb_set_between(hb, level - 1, pos, lastpos);
 }
+return changed;
 }
 
 void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
 {
 /* Compute range in the last layer.  */
+uint64_t first, n;
 uint64_t last = start + count - 1;
 
 trace_hbitmap_set(hb, start, count,
   start >> hb->granularity, last >> hb->granularity);
 
-start >>= hb->granularity;
+first = start >> hb->granularity;
 last >>= hb->granularity;
-count = last - start + 1;
+n = last - first + 1;
 
-hb->count += count - hb_count_between(hb, start, last);
-hb_set_between(hb, HBITMAP_LEVELS - 1, start, last);
+hb->count += n - hb_count_between(hb, first, last);
+if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+hb->meta) {
+hbitmap_set(hb->meta, start, count);
+}
 }
 
 /* Resetting works the other way round: propagate up if the new
@@ -293,8 +303,10 @@ static inline bool hb_reset_elem(unsigned long *elem, 
uint64_t start, uint64_t l
 return blanked;
 }
 
-/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
-static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t 
last)
+/* The recursive workhorse (the depth is 

[Qemu-block] [PATCH v4 08/15] tests: Add test code for meta bitmap

2016-03-07 Thread Fam Zheng
Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 tests/test-hbitmap.c | 116 +++
 1 file changed, 116 insertions(+)

diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index abe1427..c00c2b5 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -12,6 +12,7 @@
 #include "qemu/osdep.h"
 #include 
 #include "qemu/hbitmap.h"
+#include "block/block.h"
 
 #define LOG_BITS_PER_LONG  (BITS_PER_LONG == 32 ? 5 : 6)
 
@@ -21,6 +22,7 @@
 
 typedef struct TestHBitmapData {
 HBitmap   *hb;
+HBitmap   *meta;
 unsigned long *bits;
 size_t size;
 size_t old_size;
@@ -92,6 +94,14 @@ static void hbitmap_test_init(TestHBitmapData *data,
 }
 }
 
+static void hbitmap_test_init_meta(TestHBitmapData *data,
+   uint64_t size, int granularity,
+   int meta_chunk)
+{
+hbitmap_test_init(data, size, granularity);
+data->meta = hbitmap_create_meta(data->hb, meta_chunk);
+}
+
 static inline size_t hbitmap_test_array_size(size_t bits)
 {
 size_t n = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG;
@@ -134,6 +144,9 @@ static void hbitmap_test_teardown(TestHBitmapData *data,
   const void *unused)
 {
 if (data->hb) {
+if (data->meta) {
+hbitmap_free_meta(data->hb);
+}
 hbitmap_free(data->hb);
 data->hb = NULL;
 }
@@ -635,6 +648,103 @@ static void 
test_hbitmap_truncate_shrink_large(TestHBitmapData *data,
 hbitmap_test_truncate(data, size, -diff, 0);
 }
 
+static void hbitmap_check_meta(TestHBitmapData *data,
+   int64_t start, int count)
+{
+int64_t i;
+
+for (i = 0; i < data->size; i++) {
+if (i >= start && i < start + count) {
+g_assert(hbitmap_get(data->meta, i));
+} else {
+g_assert(!hbitmap_get(data->meta, i));
+}
+}
+}
+
+static void hbitmap_test_meta(TestHBitmapData *data,
+  int64_t start, int count,
+  int64_t check_start, int check_count)
+{
+hbitmap_reset_all(data->hb);
+hbitmap_reset_all(data->meta);
+
+/* Test "unset" -> "unset" will not update meta. */
+hbitmap_reset(data->hb, start, count);
+hbitmap_check_meta(data, 0, 0);
+
+/* Test "unset" -> "set" will update meta */
+hbitmap_set(data->hb, start, count);
+hbitmap_check_meta(data, check_start, check_count);
+
+/* Test "set" -> "set" will not update meta */
+hbitmap_reset_all(data->meta);
+hbitmap_set(data->hb, start, count);
+hbitmap_check_meta(data, 0, 0);
+
+/* Test "set" -> "unset" will update meta */
+hbitmap_reset_all(data->meta);
+hbitmap_reset(data->hb, start, count);
+hbitmap_check_meta(data, check_start, check_count);
+}
+
+static void hbitmap_test_meta_do(TestHBitmapData *data, int chunk_size)
+{
+uint64_t size = chunk_size * 100;
+hbitmap_test_init_meta(data, size, 0, chunk_size);
+
+hbitmap_test_meta(data, 0, 1, 0, chunk_size);
+hbitmap_test_meta(data, 0, chunk_size, 0, chunk_size);
+hbitmap_test_meta(data, chunk_size - 1, 1, 0, chunk_size);
+hbitmap_test_meta(data, chunk_size - 1, 2, 0, chunk_size * 2);
+hbitmap_test_meta(data, chunk_size - 1, chunk_size + 1, 0, chunk_size * 2);
+hbitmap_test_meta(data, chunk_size - 1, chunk_size + 2, 0, chunk_size * 3);
+hbitmap_test_meta(data, 7 * chunk_size - 1, chunk_size + 2,
+  6 * chunk_size, chunk_size * 3);
+hbitmap_test_meta(data, size - 1, 1, size - chunk_size, chunk_size);
+hbitmap_test_meta(data, 0, size, 0, size);
+}
+
+static void test_hbitmap_meta_byte(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_meta_do(data, BITS_PER_BYTE);
+}
+
+static void test_hbitmap_meta_word(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_meta_do(data, BITS_PER_LONG);
+}
+
+static void test_hbitmap_meta_sector(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_meta_do(data, BDRV_SECTOR_SIZE * BITS_PER_BYTE);
+}
+
+/**
+ * Create an HBitmap and test set/unset.
+ */
+static void test_hbitmap_meta_one(TestHBitmapData *data, const void *unused)
+{
+int i;
+int64_t offsets[] = {
+0, 1, L1 - 1, L1, L1 + 1, L2 - 1, L2, L2 + 1, L3 - 1, L3, L3 + 1
+};
+
+hbitmap_test_init_meta(data, L3 * 2, 0, 1);
+for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+hbitmap_test_meta(data, offsets[i], 1, offsets[i], 1);
+hbitmap_test_meta(data, offsets[i], L1, offsets[i], L1);
+hbitmap_test_meta(data, offsets[i], L2, offsets[i], L2);
+}
+}
+
+static void test_hbitmap_meta_zero(TestHBitmapData *data, const void *unused)
+{
+hbitmap_test_init_meta(data, 0, 0, 1);
+
+hbitmap_check_meta(data, 0, 0);
+}
+
 static void hbitmap_test_add(const char *testpath,

[Qemu-block] [PATCH v4 09/15] block: Support meta dirty bitmap

2016-03-07 Thread Fam Zheng
The added group of operations enables tracking of the changed bits in
the dirty bitmap.

Signed-off-by: Fam Zheng 
---
 block/dirty-bitmap.c | 52 
 include/block/dirty-bitmap.h |  9 
 2 files changed, 61 insertions(+)

diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index 0a188f2..a2413c0 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -38,6 +38,7 @@
  */
 struct BdrvDirtyBitmap {
 HBitmap *bitmap;/* Dirty sector bitmap implementation */
+HBitmap *meta;  /* Meta dirty bitmap */
 BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
 char *name; /* Optional non-empty unique ID */
 int64_t size;   /* Size of the bitmap (Number of sectors) */
@@ -103,6 +104,56 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState 
*bs,
 return bitmap;
 }
 
+/* bdrv_create_meta_dirty_bitmap
+ *
+ * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. I.e.
+ * when a dirty status bit in @bitmap is changed (either from reset to set or
+ * the other way around), its respective meta dirty bitmap bit will be marked
+ * dirty as well.
+ *
+ * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap.
+ * @chunk_size: how many bytes of bitmap data does each bit in the meta bitmap
+ * track.
+ */
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+   int chunk_size)
+{
+assert(!bitmap->meta);
+bitmap->meta = hbitmap_create_meta(bitmap->bitmap,
+   chunk_size * BITS_PER_BYTE);
+}
+
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+assert(bitmap->meta);
+hbitmap_free_meta(bitmap->bitmap);
+bitmap->meta = NULL;
+}
+
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+   BdrvDirtyBitmap *bitmap, int64_t sector,
+   int nb_sectors)
+{
+uint64_t i;
+int gran = bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
+
+/* To optimize: we can make hbitmap to internally check the range in a
+ * coarse level, or at least do it word by word. */
+for (i = sector; i < sector + nb_sectors; i += gran) {
+if (hbitmap_get(bitmap->meta, i)) {
+return true;
+}
+}
+return false;
+}
+
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+  BdrvDirtyBitmap *bitmap, int64_t sector,
+  int nb_sectors)
+{
+hbitmap_reset(bitmap->meta, sector, nb_sectors);
+}
+
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 {
 return bitmap->successor;
@@ -233,6 +284,7 @@ static void 
bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
 if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
 assert(!bm->active_iterators);
 assert(!bdrv_dirty_bitmap_frozen(bm));
+assert(!bm->meta);
 QLIST_REMOVE(bm, list);
 hbitmap_free(bm->bitmap);
 g_free(bm->name);
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 2ea601b..50e0fca 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -8,6 +8,9 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
   uint32_t granularity,
   const char *name,
   Error **errp);
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+   int chunk_size);
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
BdrvDirtyBitmap *bitmap,
Error **errp);
@@ -36,6 +39,12 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
int64_t cur_sector, int nr_sectors);
 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
  int64_t cur_sector, int nr_sectors);
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+   BdrvDirtyBitmap *bitmap, int64_t sector,
+   int nb_sectors);
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+  BdrvDirtyBitmap *bitmap, int64_t sector,
+  int nb_sectors);
 BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
  uint64_t first_sector);
 void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
-- 
2.4.3




[Qemu-block] [PATCH v4 03/15] typedefs: Add BdrvDirtyBitmap

2016-03-07 Thread Fam Zheng
Following patches to refactor and move block dirty bitmap code could use
this.

Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 include/block/block.h   | 1 -
 include/qemu/typedefs.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/block/block.h b/include/block/block.h
index 00827f7..5ee9b8f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -476,7 +476,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t 
size);
 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 
-typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;
 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
   uint32_t granularity,
   const char *name,
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 9a5ead6..fd039e0 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -10,6 +10,7 @@ typedef struct AddressSpace AddressSpace;
 typedef struct AioContext AioContext;
 typedef struct AllwinnerAHCIState AllwinnerAHCIState;
 typedef struct AudioState AudioState;
+typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;
 typedef struct BlockBackend BlockBackend;
 typedef struct BlockBackendRootState BlockBackendRootState;
 typedef struct BlockDriverState BlockDriverState;
-- 
2.4.3




[Qemu-block] [PATCH v4 02/15] block: Include hbitmap.h in block.h

2016-03-07 Thread Fam Zheng
Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 include/block/block.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/block/block.h b/include/block/block.h
index 1c4f4d8..00827f7 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -8,6 +8,7 @@
 #include "block/accounting.h"
 #include "qapi/qmp/qobject.h"
 #include "qapi-types.h"
+#include "qemu/hbitmap.h"
 
 /* block.c */
 typedef struct BlockDriver BlockDriver;
@@ -475,7 +476,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t 
size);
 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
 
-struct HBitmapIter;
 typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;
 BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
   uint32_t granularity,
-- 
2.4.3




[Qemu-block] [PATCH v4 05/15] block: Remove unused typedef of BlockDriverDirtyHandler

2016-03-07 Thread Fam Zheng
Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 include/block/block.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 3fbd70d..ff1933a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -322,8 +322,6 @@ BlockDriverState *check_to_replace_node(BlockDriverState 
*parent_bs,
 const char *node_name, Error **errp);
 
 /* async block I/O */
-typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
- int sector_num);
 BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
QEMUIOVector *iov, int nb_sectors,
BlockCompletionFunc *cb, void *opaque);
-- 
2.4.3




[Qemu-block] [PATCH v4 00/15] Dirty bitmap changes for migration/persistence work

2016-03-07 Thread Fam Zheng
v4: Rebase.
Add rev-by from John in patches 1-5, 7, 8.
Remove BdrvDirtyBitmap typedef from dirty-bitmap.h in patch 4. [Max]
Add assertion on bm->meta in patch 9. [John]

Two major features are added to block dirty bitmap (and underlying HBitmap) in
this series: meta bitmap and serialization, together with all other supportive
patches.

Both operations are common in dirty bitmap migration and persistence: they need
to find whether and which part of the dirty bitmap in question has changed with
meta dirty bitmap, and they need to write it to the target with serialization.


Fam Zheng (13):
  backup: Use Bitmap to replace "s->bitmap"
  block: Include hbitmap.h in block.h
  typedefs: Add BdrvDirtyBitmap
  block: Move block dirty bitmap code to separate files
  block: Remove unused typedef of BlockDriverDirtyHandler
  block: Hide HBitmap in block dirty bitmap interface
  HBitmap: Introduce "meta" bitmap to track bit changes
  tests: Add test code for meta bitmap
  block: Support meta dirty bitmap
  block: Add two dirty bitmap getters
  block: Assert that bdrv_release_dirty_bitmap succeeded
  tests: Add test code for hbitmap serialization
  block: More operations for meta dirty bitmap

Vladimir Sementsov-Ogievskiy (2):
  hbitmap: serialization
  block: BdrvDirtyBitmap serialization interface

 block.c  | 360 -
 block/Makefile.objs  |   2 +-
 block/backup.c   |  25 +-
 block/dirty-bitmap.c | 535 +++
 block/mirror.c   |  15 +-
 include/block/block.h|  40 +---
 include/block/dirty-bitmap.h |  75 ++
 include/qemu/hbitmap.h   |  96 
 include/qemu/typedefs.h  |   2 +
 tests/test-hbitmap.c | 255 +
 util/hbitmap.c   | 203 ++--
 11 files changed, 1177 insertions(+), 431 deletions(-)
 create mode 100644 block/dirty-bitmap.c
 create mode 100644 include/block/dirty-bitmap.h

-- 
2.4.3




[Qemu-block] [PATCH v4 01/15] backup: Use Bitmap to replace "s->bitmap"

2016-03-07 Thread Fam Zheng
"s->bitmap" tracks done sectors, we only check bit states without using any
iterator which HBitmap is good for. Switch to "Bitmap" which is simpler and
more memory efficient.

Meanwhile, rename it to done_bitmap, to reflect the intention.

Signed-off-by: Fam Zheng 
Reviewed-by: John Snow 
---
 block/backup.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 0f1b1bc..ab3e345 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -20,6 +20,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
 #include "sysemu/block-backend.h"
+#include "qemu/bitmap.h"
 
 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 #define SLICE_TIME 1ULL /* ns */
@@ -42,7 +43,7 @@ typedef struct BackupBlockJob {
 BlockdevOnError on_target_error;
 CoRwlock flush_rwlock;
 uint64_t sectors_read;
-HBitmap *bitmap;
+unsigned long *done_bitmap;
 int64_t cluster_size;
 QLIST_HEAD(, CowRequest) inflight_reqs;
 } BackupBlockJob;
@@ -116,7 +117,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
 cow_request_begin(_request, job, start, end);
 
 for (; start < end; start++) {
-if (hbitmap_get(job->bitmap, start)) {
+if (test_bit(start, job->done_bitmap)) {
 trace_backup_do_cow_skip(job, start);
 continue; /* already copied */
 }
@@ -167,7 +168,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
 goto out;
 }
 
-hbitmap_set(job->bitmap, start, 1);
+set_bit(start, job->done_bitmap);
 
 /* Publish progress, guest I/O counts as progress too.  Note that the
  * offset field is an opaque progress value, it is not a disk offset.
@@ -399,7 +400,7 @@ static void coroutine_fn backup_run(void *opaque)
 start = 0;
 end = DIV_ROUND_UP(job->common.len, job->cluster_size);
 
-job->bitmap = hbitmap_alloc(end, 0);
+job->done_bitmap = bitmap_new(end);
 
 bdrv_set_enable_write_cache(target, true);
 if (target->blk) {
@@ -480,7 +481,7 @@ static void coroutine_fn backup_run(void *opaque)
 /* wait until pending backup_do_cow() calls have completed */
 qemu_co_rwlock_wrlock(>flush_rwlock);
 qemu_co_rwlock_unlock(>flush_rwlock);
-hbitmap_free(job->bitmap);
+g_free(job->done_bitmap);
 
 if (target->blk) {
 blk_iostatus_disable(target->blk);
-- 
2.4.3




Re: [Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE

2016-03-07 Thread Niels de Vos
On Mon, Mar 07, 2016 at 01:27:38PM -0500, Jeff Cody wrote:
> On Mon, Mar 07, 2016 at 07:04:15PM +0100, Niels de Vos wrote:
> > GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes
> > it possible to detect sparse areas in files.
> > 
> > Signed-off-by: Niels de Vos 
> > 
> > --
> > Tested by compiling and running "qemu-img map gluster://..." with a
> > build of the current master branch of glusterfs. Using a Fedora
> > cloud image (in raw format) shows many SEEK procudure calls going back
> > and forth over the network. The output of "qemu map" matches the output
> > when run against the image on the local filesystem.
> > ---
> >  block/gluster.c | 159 
> > 
> >  configure   |  25 +
> >  2 files changed, 184 insertions(+)
> > 
> > diff --git a/block/gluster.c b/block/gluster.c
> > index 65077a0..1430010 100644
> > --- a/block/gluster.c
> > +++ b/block/gluster.c
> > @@ -677,6 +677,153 @@ static int 
> > qemu_gluster_has_zero_init(BlockDriverState *bs)
> >  return 0;
> >  }
> >  
> > +#ifdef CONFIG_GLUSTERFS_SEEK_DATA
> 
> Why do we need to make this a compile-time option?  Version checking
> is problematic; for instance, different distributions may have
> backported bug fixes / features, that are not reflected by the
> reported version number, etc..  Ideally, we can determine
> functionality during runtime, and behave accordingly.

This will not get backported to older Gluster versions, it required a
protocol change.

> If SEEK_DATA and SEEK_HOLE are not supported,
> qemu_gluster_co_get_block_status can return that sectors are all
> allocated (which is what happens in block/io.c anyway if the driver
> doesn't support the function).

Ok, good to know.

> As long as glfs_lseek() will return error (e.g. EINVAL) for an invalid
> whence value,  we can handle it runtime.  Does glfs_lseek() behave
> sanely?

Unfortunately older versions of libgfapi do not return EINVAL when
SEEK_DATA/HOLE is used. It is something we'll need to fix in the stable
releases. We can not assume that all users have installed a version of
the library that handles SEEK_DATA/HOLE correctly (return EINVAL) when
there is no support in the network protocol or on the server.

To be sure that we don't get some undefined behaviour, the compile time
check is needed.

Niels

> > +/*
> > + * Find allocation range in @bs around offset @start.
> > + * May change underlying file descriptor's file offset.
> > + * If @start is not in a hole, store @start in @data, and the
> > + * beginning of the next hole in @hole, and return 0.
> > + * If @start is in a non-trailing hole, store @start in @hole and the
> > + * beginning of the next non-hole in @data, and return 0.
> > + * If @start is in a trailing hole or beyond EOF, return -ENXIO.
> > + * If we can't find out, return a negative errno other than -ENXIO.
> > + *
> > + * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
> > + */
> > +static int find_allocation(BlockDriverState *bs, off_t start,
> > +   off_t *data, off_t *hole)
> > +{
> > +BDRVGlusterState *s = bs->opaque;
> > +off_t offs;
> > +
> > +/*
> > + * SEEK_DATA cases:
> > + * D1. offs == start: start is in data
> > + * D2. offs > start: start is in a hole, next data at offs
> > + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
> > + *  or start is beyond EOF
> > + * If the latter happens, the file has been truncated behind
> > + * our back since we opened it.  All bets are off then.
> > + * Treating like a trailing hole is simplest.
> > + * D4. offs < 0, errno != ENXIO: we learned nothing
> > + */
> > +offs = glfs_lseek(s->fd, start, SEEK_DATA);
> > +if (offs < 0) {
> > +return -errno;  /* D3 or D4 */
> > +}
> > +assert(offs >= start);
> > +
> > +if (offs > start) {
> > +/* D2: in hole, next data at offs */
> > +*hole = start;
> > +*data = offs;
> > +return 0;
> > +}
> > +
> > +/* D1: in data, end not yet known */
> > +
> > +/*
> > + * SEEK_HOLE cases:
> > + * H1. offs == start: start is in a hole
> > + * If this happens here, a hole has been dug behind our back
> > + * since the previous lseek().
> > + * H2. offs > start: either start is in data, next hole at offs,
> > + *   or start is in trailing hole, EOF at offs
> > + * Linux treats trailing holes like any other hole: offs ==
> > + * start.  Solaris seeks to EOF instead: offs > start (blech).
> > + * If that happens here, a hole has been dug behind our back
> > + * since the previous lseek().
> > + * H3. offs < 0, errno = ENXIO: start is beyond EOF
> > + * If this happens, the file has been truncated behind our
> > + * back since we opened it.  Treat it like a 

Re: [Qemu-block] [Qemu-devel] [PATCH v15 8/9] Implement new driver for block replication

2016-03-07 Thread Changlong Xie

On 03/05/2016 01:53 AM, Stefan Hajnoczi wrote:

On Fri, Feb 05, 2016 at 12:18:07PM +0800, Changlong Xie wrote:

+static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
+{
+Error *local_err = NULL;
+int ret;
+
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+return;
+}


Do you have test cases for the replication driver?

This error code path seems like something that should be exercised to
make sure it works.

Basic start/checkpoint/stop and checks that the active/secondary/hidden
disks contain the expected data are also important.

Probably the easiest way to do this would be a file called
tests/test-replication.c that calls start/checkpoint/stop and read/write
functions directly instead of running a guest.


Will do that in next version.

Thanks
-Xie








Re: [Qemu-block] [Qemu-devel] [PATCH v15 8/9] Implement new driver for block replication

2016-03-07 Thread Changlong Xie

On 03/05/2016 01:39 AM, Stefan Hajnoczi wrote:

On Fri, Feb 05, 2016 at 12:18:07PM +0800, Changlong Xie wrote:

+static void replication_start(ReplicationState *rs, ReplicationMode mode,
+  Error **errp)
+{
+BlockDriverState *bs = rs->opaque;
+BDRVReplicationState *s = bs->opaque;
+int64_t active_length, hidden_length, disk_length;
+AioContext *aio_context;
+Error *local_err = NULL;
+
+if (s->replication_state != BLOCK_REPLICATION_NONE) {


Dereferencing bs and s is not thread-safe since the AioContext isn't
acquired here.  Unless you have other locking rules, the default is that
everything in BlockDriverState and bs->opaque must be accessed with the
AioContext acquired.

Please apply this comment to the rest of the patch series, too.
Functions that are not part of BlockDriver are probably called outside
the AioContext and must acquire it before they are allowed to access
anything else in bs.



Will do implement in replication_start/stop/checkpoint/get_error callbacks.


+s->top_bs = get_top_bs(bs);
+if (!s->top_bs) {
+error_setg(errp, "Cannot get the top block driver state to do"
+   " internal backup");
+return;
+}


Traversing the BDS graph using the parent pointer is not safe - you
cannot stash the parent pointer because it changes if a parent node is
inserted/removed.

I suggest passing the drive name as an argument so that bdrv_lookup_bs()
can be used when installing the op blocker.  Then the BdrvChild.parent
pointer patch and get_top_bs() can be deleted.



ok


Jeff Cody is currently working on a new op blocker system.  Hopefully
this system will allow you to install blockers on bs instead of on the
drive's top BDS.  But let's not worry about that for now and just use
the drive name as an argument.


Good news.

Thanks




+/*
+ * Must protect backup target if backup job was stopped/cancelled
+ * unexpectedly
+ */
+bdrv_ref(s->hidden_disk->bs);


Where is the matching bdrv_unref() call?







Re: [Qemu-block] [PATCH v15 7/9] Introduce new APIs to do replication operation

2016-03-07 Thread Changlong Xie

On 03/05/2016 12:13 AM, Stefan Hajnoczi wrote:

On Fri, Feb 05, 2016 at 12:18:06PM +0800, Changlong Xie wrote:

diff --git a/replication.h b/replication.h
new file mode 100644
index 000..faea649
--- /dev/null
+++ b/replication.h
@@ -0,0 +1,53 @@
+/*
+ * Replication filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ *   Wen Congyang 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef REPLICATION_H
+#define REPLICATION_H
+
+#include "sysemu/sysemu.h"
+
+typedef struct ReplicationOps ReplicationOps;
+typedef struct ReplicationState ReplicationState;
+typedef void (*Start)(ReplicationState *rs, ReplicationMode mode, Error 
**errp);
+typedef void (*Stop)(ReplicationState *rs, bool failover, Error **errp);
+typedef void (*Checkpoint)(ReplicationState *rs, Error **errp);
+typedef void (*GetError)(ReplicationState *rs, Error **errp);




Hi Stefan

Thanks for your comments.


These typedefs are likely to collide with system headers.  Please prefix
the name.

That said, I don't think Start, Stop, Checkpoint, GetError are
necessary.  Just declare the prototypes in struct ReplicationOps.  No
user passes single function pointers, they always pass ReplicationOps.
Therefore it's not necessary to declare types for these functions at
all.


Will try that.




+
+struct ReplicationState {
+void *opaque;
+ReplicationOps *ops;
+QLIST_ENTRY(ReplicationState) node;
+};
+
+struct ReplicationOps{
+Start start;
+Checkpoint checkpoint;
+GetError get_error;
+Stop stop;
+};


Please add doc comments that explain the semantics of these functions.
For examples, see include/qom/object.h.  This documentation should allow
someone implementing a new replication listener to understand the
constraints and the purpose of these functions.


Surely


Thanks
-Xie








Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-03-07 Thread Changlong Xie

On 03/08/2016 12:02 AM, Max Reitz wrote:

On 07.03.2016 17:02, Eric Blake wrote:

On 03/05/2016 11:13 AM, Max Reitz wrote:


+index = atoi(child->name + 9);


Optional: Assert absence of an error:



Indeed, atoi() is worthless, because it cannot do error detection.


unsigned long index;
char *endptr;

index = strtoul(child->name + 9, , 10);
assert(index >= 0 && !*endptr);


Still incorrect; you aren't handling errno properly for detecting all
errors.  Even better is to use qemu_strtoul(), which already handles
proper error detection.


Yeah, I keep forgetting that it returns ULONG_MAX on range error...


Yes, we should limit the range to INT_MAX. How do you like the following 
codes, i just steal it from xen_host_pci_get_value().


int rc;
const char *endptr;
unsigned long value;

assert(!strncmp(child->name, "children.", 9));
rc = qemu_strtoul(child->name + 9, , 10, );
if (!rc) {
assert(value <= INT_MAX);
index = value;
} else {
error_setg_errno(errp, -rc, "Failed to parse value '%s'",
 child->name + 9);
return;
}

Thanks
-Xie



Max







Re: [Qemu-block] [PATCH v3 04/15] block: Move block dirty bitmap code to separate files

2016-03-07 Thread Fam Zheng
On Mon, 03/07 19:49, Max Reitz wrote:
> On 27.02.2016 10:20, Fam Zheng wrote:
> > The only code change is making bdrv_dirty_bitmap_truncate public. It is
> > used in block.c.
> > 
> > Also two long lines (bdrv_get_dirty) are wrapped.
> > 
> > Signed-off-by: Fam Zheng 
> > Reviewed-by: John Snow 
> > ---
> >  block.c  | 360 
> >  block/Makefile.objs  |   2 +-
> >  block/dirty-bitmap.c | 387 
> > +++
> >  include/block/block.h|  35 +---
> >  include/block/dirty-bitmap.h |  45 +
> >  5 files changed, 434 insertions(+), 395 deletions(-)
> >  create mode 100644 block/dirty-bitmap.c
> >  create mode 100644 include/block/dirty-bitmap.h
> 
> [...]
> 
> > diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
> > new file mode 100644
> > index 000..45c6046
> > --- /dev/null
> > +++ b/include/block/dirty-bitmap.h
> > @@ -0,0 +1,45 @@
> > +#ifndef BLOCK_DIRTY_BITMAP_H
> > +#define BLOCK_DIRTY_BITMAP_H
> > +
> > +#include "qemu-common.h"
> > +#include "qemu/hbitmap.h"
> > +
> > +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;
> 
> Doesn't patch 3 make this superfluous?
> 
> If so, I can remove it for you.
> 

Removing while rebasing.

Fam



Re: [Qemu-block] [PATCH v3 09/15] block: Support meta dirty bitmap

2016-03-07 Thread Fam Zheng
On Mon, 02/29 18:00, John Snow wrote:
> 
> 
> On 02/27/2016 04:20 AM, Fam Zheng wrote:
> > The added group of operations enables tracking of the changed bits in
> > the dirty bitmap.
> > 
> > Signed-off-by: Fam Zheng 
> > ---
> >  block/dirty-bitmap.c | 51 
> > 
> >  include/block/dirty-bitmap.h |  9 
> >  2 files changed, 60 insertions(+)
> > 
> > diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
> > index 16f73b2..5f19320 100644
> > --- a/block/dirty-bitmap.c
> > +++ b/block/dirty-bitmap.c
> > @@ -38,6 +38,7 @@
> >   */
> >  struct BdrvDirtyBitmap {
> >  HBitmap *bitmap;/* Dirty sector bitmap implementation */
> > +HBitmap *meta;  /* Meta dirty bitmap */
> >  BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status 
> > */
> >  char *name; /* Optional non-empty unique ID */
> >  int64_t size;   /* Size of the bitmap (Number of sectors) 
> > */
> > @@ -103,6 +104,56 @@ BdrvDirtyBitmap 
> > *bdrv_create_dirty_bitmap(BlockDriverState *bs,
> >  return bitmap;
> >  }
> >  
> > +/* bdrv_create_meta_dirty_bitmap
> > + *
> > + * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. 
> > I.e.
> > + * when a dirty status bit in @bitmap is changed (either from reset to set 
> > or
> > + * the other way around), its respective meta dirty bitmap bit will be 
> > marked
> > + * dirty as well.
> > + *
> > + * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap.
> > + * @chunk_size: how many bytes of bitmap data does each bit in the meta 
> > bitmap
> > + * track.
> > + */
> > +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
> > +   int chunk_size)
> > +{
> > +assert(!bitmap->meta);
> > +bitmap->meta = hbitmap_create_meta(bitmap->bitmap,
> > +   chunk_size * BITS_PER_BYTE);
> > +}
> > +
> > +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
> > +{
> > +assert(bitmap->meta);
> > +hbitmap_free_meta(bitmap->bitmap);
> > +bitmap->meta = NULL;
> > +}
> > +
> > +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
> > +   BdrvDirtyBitmap *bitmap, int64_t sector,
> > +   int nb_sectors)
> > +{
> > +uint64_t i;
> > +int gran = bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
> > +
> > +/* To optimize: we can make hbitmap to internally check the range in a
> > + * coarse level, or at least do it word by word. */
> > +for (i = sector; i < sector + nb_sectors; i += gran) {
> > +if (hbitmap_get(bitmap->meta, i)) {
> > +return true;
> > +}
> > +}
> > +return false;
> > +}
> > +
> > +void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
> > +  BdrvDirtyBitmap *bitmap, int64_t sector,
> > +  int nb_sectors)
> > +{
> > +hbitmap_reset(bitmap->meta, sector, nb_sectors);
> > +}
> > +
> >  bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
> >  {
> >  return bitmap->successor;
> > diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
> > index e1dbd8e..3b27742 100644
> > --- a/include/block/dirty-bitmap.h
> > +++ b/include/block/dirty-bitmap.h
> > @@ -9,6 +9,9 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState 
> > *bs,
> >uint32_t granularity,
> >const char *name,
> >Error **errp);
> > +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
> > +   int chunk_size);
> > +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap);
> >  int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
> > BdrvDirtyBitmap *bitmap,
> > Error **errp);
> > @@ -37,6 +40,12 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
> > int64_t cur_sector, int nr_sectors);
> >  void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
> >   int64_t cur_sector, int nr_sectors);
> > +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
> > +   BdrvDirtyBitmap *bitmap, int64_t sector,
> > +   int nb_sectors);
> > +void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
> > +  BdrvDirtyBitmap *bitmap, int64_t sector,
> > +  int nb_sectors);
> >  BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
> >   uint64_t first_sector);
> >  void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
> > 
> 
> Do we need to amend the teardown in 

Re: [Qemu-block] [PATCH v3 06/15] block: Hide HBitmap in block dirty bitmap interface

2016-03-07 Thread Fam Zheng
On Mon, 03/07 20:11, Max Reitz wrote:
> On 27.02.2016 10:20, Fam Zheng wrote:
> > HBitmap is an implementation detail of block dirty bitmap that should be 
> > hidden
> > from users. Introduce a BdrvDirtyBitmapIter to encapsulate the underlying
> > HBitmapIter.
> > 
> > A small difference in the interface is, before, an HBitmapIter is 
> > initialized
> > in place, now the new BdrvDirtyBitmapIter must be dynamically allocated 
> > because
> > the structure definition is in block.c.
> > 
> > Two current users are converted too.
> > 
> > Signed-off-by: Fam Zheng 
> > Reviewed-by: John Snow 
> > ---
> >  block/backup.c   | 14 --
> >  block/dirty-bitmap.c | 39 +--
> >  block/mirror.c   | 14 --
> >  include/block/dirty-bitmap.h |  7 +--
> >  include/qemu/typedefs.h  |  1 +
> >  5 files changed, 55 insertions(+), 20 deletions(-)
> 
> I tried my best at fixing up the rebase conflicts, but block/mirror.c
> has just changed too much ("mirror: Rewrite mirror_iteration") to be
> able to pretend these fixes are trivial. Therefore, I'm afraid this
> patch will need a rebase.

Doing it now. Thanks!

Fam



Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-03-07 Thread Changlong Xie

On 03/08/2016 12:02 AM, Eric Blake wrote:

On 03/05/2016 11:13 AM, Max Reitz wrote:


+index = atoi(child->name + 9);


Optional: Assert absence of an error:



Indeed, atoi() is worthless, because it cannot do error detection.


unsigned long index;
char *endptr;

index = strtoul(child->name + 9, , 10);
assert(index >= 0 && !*endptr);


Still incorrect; you aren't handling errno properly for detecting all
errors.  Even better is to use qemu_strtoul(), which already handles
proper error detection.



Will fix this in next version, thanks for pointing it out.

Thanks
-Xie





Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain

2016-03-07 Thread Paolo Bonzini


On 07/03/2016 21:56, Stefan Hajnoczi wrote:
> I think the timer concept itself is troublesome.  A radical approach but
> limited to changing QED itself is to drop the timer and instead keep a
> timestamp for the last allocating write request.  Next time a write
> request (allocating or rewriting) is about to complete, do the flush and
> clear the need check flag as part of the write request (if 5 seconds
> have passed since the timestamp).

bdrv_qed_drain should be easy to fix in my new drain implementation,
which is based on draining the parent before the BdrvChild-ren.  It's
just troublesome in the current one which alternates flushing and draining.

I would just revert the patch that introduced bdrv_qed_drain for now,
and reintroduce it later (note however that something was messed up in
commit df9a681, "qed: Implement .bdrv_drain", 2015-11-12, because it
includes some dirty bitmap stuff).

Or perhaps the idea of making QED read-only has some merit...

Paolo



Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain

2016-03-07 Thread Stefan Hajnoczi
By the way, I'll send a patch on Tuesday showing the timerless need
check mechanism.  It's a little too late tonight to start it.

Stefan


signature.asc
Description: PGP signature


Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain

2016-03-07 Thread Stefan Hajnoczi
On Mon, Mar 07, 2016 at 05:57:41PM +0100, Kevin Wolf wrote:
> Am 23.02.2016 um 14:54 hat Paolo Bonzini geschrieben:
> > 
> > 
> > On 23/02/2016 13:49, Fam Zheng wrote:
> > > On Tue, 02/23 11:43, Paolo Bonzini wrote:
> > >>
> > >>
> > >> On 23/02/2016 06:57, Fam Zheng wrote:
> > >> +qed_cancel_need_check_timer(s);
> > >> +qed_need_check_timer_cb(s);
> > >> +}
> > >
> > > What if an allocating write is queued (the else branch case)? Its 
> > > completion
> > > will be in bdrv_drain and it could arm the need_check_timer which is 
> > > wrong.
> > >
> > > We need to drain the allocating_write_reqs queue before checking the 
> > > timer.
> > 
> >  You're right, but how?  That's what bdrv_drain(bs) does, it's a
> >  chicken-and-egg problem.
> > >>>
> > >>> Maybe use an aio_poll loop before the if?
> > >>
> > >> That would not change the fact that you're reimplementing bdrv_drain
> > >> inside bdrv_qed_drain.
> > > 
> > > But it fulfills the contract of .bdrv_drain. This is the easy way, the 
> > > hard way
> > > would be iterating through the allocating_write_reqs list and process 
> > > reqs one
> > > by one synchronously, which still involves aio_poll indirectly.
> > 
> > The easy way would be better then.
> > 
> > Stefan, any second opinion?
> 
> What's the status here? It would be good to have qed not segfaulting all
> the time.

I think the timer concept itself is troublesome.  A radical approach but
limited to changing QED itself is to drop the timer and instead keep a
timestamp for the last allocating write request.  Next time a write
request (allocating or rewriting) is about to complete, do the flush and
clear the need check flag as part of the write request (if 5 seconds
have passed since the timestamp).

Because the need check flag clearing is part of the write request
completion, it will integrate properly with drain.

Stefan


signature.asc
Description: PGP signature


Re: [Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE

2016-03-07 Thread Eric Blake
[adding qemu-devel; ALL patches must cc qemu-devel even when sent to
another list]

On 03/07/2016 11:04 AM, Niels de Vos wrote:
> GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes
> it possible to detect sparse areas in files.
> 
> Signed-off-by: Niels de Vos 
> 
> --
> Tested by compiling and running "qemu-img map gluster://..." with a
> build of the current master branch of glusterfs. Using a Fedora
> cloud image (in raw format) shows many SEEK procudure calls going back
> and forth over the network. The output of "qemu map" matches the output
> when run against the image on the local filesystem.
> ---
>  block/gluster.c | 159 
> 
>  configure   |  25 +
>  2 files changed, 184 insertions(+)
> 
> diff --git a/block/gluster.c b/block/gluster.c
> index 65077a0..1430010 100644
> --- a/block/gluster.c
> +++ b/block/gluster.c
> @@ -677,6 +677,153 @@ static int qemu_gluster_has_zero_init(BlockDriverState 
> *bs)
>  return 0;
>  }
>  
> +#ifdef CONFIG_GLUSTERFS_SEEK_DATA
> +/*
> + * Find allocation range in @bs around offset @start.
> + * May change underlying file descriptor's file offset.
> + * If @start is not in a hole, store @start in @data, and the
> + * beginning of the next hole in @hole, and return 0.
> + * If @start is in a non-trailing hole, store @start in @hole and the
> + * beginning of the next non-hole in @data, and return 0.
> + * If @start is in a trailing hole or beyond EOF, return -ENXIO.
> + * If we can't find out, return a negative errno other than -ENXIO.
> + *
> + * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
> + */
> +static int find_allocation(BlockDriverState *bs, off_t start,
> +   off_t *data, off_t *hole)
> +{
> +BDRVGlusterState *s = bs->opaque;
> +off_t offs;
> +
> +/*
> + * SEEK_DATA cases:
> + * D1. offs == start: start is in data
> + * D2. offs > start: start is in a hole, next data at offs
> + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
> + *  or start is beyond EOF
> + * If the latter happens, the file has been truncated behind
> + * our back since we opened it.  All bets are off then.
> + * Treating like a trailing hole is simplest.
> + * D4. offs < 0, errno != ENXIO: we learned nothing
> + */
> +offs = glfs_lseek(s->fd, start, SEEK_DATA);
> +if (offs < 0) {
> +return -errno;  /* D3 or D4 */
> +}
> +assert(offs >= start);
> +
> +if (offs > start) {
> +/* D2: in hole, next data at offs */
> +*hole = start;
> +*data = offs;
> +return 0;
> +}
> +
> +/* D1: in data, end not yet known */
> +
> +/*
> + * SEEK_HOLE cases:
> + * H1. offs == start: start is in a hole
> + * If this happens here, a hole has been dug behind our back
> + * since the previous lseek().
> + * H2. offs > start: either start is in data, next hole at offs,
> + *   or start is in trailing hole, EOF at offs
> + * Linux treats trailing holes like any other hole: offs ==
> + * start.  Solaris seeks to EOF instead: offs > start (blech).
> + * If that happens here, a hole has been dug behind our back
> + * since the previous lseek().
> + * H3. offs < 0, errno = ENXIO: start is beyond EOF
> + * If this happens, the file has been truncated behind our
> + * back since we opened it.  Treat it like a trailing hole.
> + * H4. offs < 0, errno != ENXIO: we learned nothing
> + * Pretend we know nothing at all, i.e. "forget" about D1.
> + */
> +offs = glfs_lseek(s->fd, start, SEEK_HOLE);
> +if (offs < 0) {
> +return -errno;  /* D1 and (H3 or H4) */
> +}
> +assert(offs >= start);
> +
> +if (offs > start) {
> +/*
> + * D1 and H2: either in data, next hole at offs, or it was in
> + * data but is now in a trailing hole.  In the latter case,
> + * all bets are off.  Treating it as if it there was data all
> + * the way to EOF is safe, so simply do that.
> + */
> +*data = start;
> +*hole = offs;
> +return 0;
> +}
> +
> +/* D1 and H1 */
> +return -EBUSY;
> +}
> +
> +/*
> + * Returns the allocation status of the specified sectors.
> + *
> + * If 'sector_num' is beyond the end of the disk image the return value is 0
> + * and 'pnum' is set to 0.
> + *
> + * 'pnum' is set to the number of sectors (including and immediately 
> following
> + * the specified sector) that are known to be in the same
> + * allocated/unallocated state.
> + *
> + * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
> + * beyond the end of the disk image it will be clamped.
> + *
> + * (Based on raw_co_get_block_status() from raw-posix.c.)
> + */
> +static 

Re: [Qemu-block] [PATCH v3 06/15] block: Hide HBitmap in block dirty bitmap interface

2016-03-07 Thread Max Reitz
On 27.02.2016 10:20, Fam Zheng wrote:
> HBitmap is an implementation detail of block dirty bitmap that should be 
> hidden
> from users. Introduce a BdrvDirtyBitmapIter to encapsulate the underlying
> HBitmapIter.
> 
> A small difference in the interface is, before, an HBitmapIter is initialized
> in place, now the new BdrvDirtyBitmapIter must be dynamically allocated 
> because
> the structure definition is in block.c.
> 
> Two current users are converted too.
> 
> Signed-off-by: Fam Zheng 
> Reviewed-by: John Snow 
> ---
>  block/backup.c   | 14 --
>  block/dirty-bitmap.c | 39 +--
>  block/mirror.c   | 14 --
>  include/block/dirty-bitmap.h |  7 +--
>  include/qemu/typedefs.h  |  1 +
>  5 files changed, 55 insertions(+), 20 deletions(-)

I tried my best at fixing up the rebase conflicts, but block/mirror.c
has just changed too much ("mirror: Rewrite mirror_iteration") to be
able to pretend these fixes are trivial. Therefore, I'm afraid this
patch will need a rebase.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] strange crash in tracked_request_begin

2016-03-07 Thread Christian Borntraeger
On 03/07/2016 06:01 PM, Stefan Hajnoczi wrote:
> On Mon, Mar 07, 2016 at 01:29:08PM +0100, Christian Borntraeger wrote:
>> Folks,
>>
>> I had a crash of a qemu guest in tracked_request_begin.
>> The testcase was a guest with ramdisk/kernel that reboots in a 
>> loop. (about 10 times per second) with a single null-co disk 
>> attached. No idea how to reproduce this, seems to be a lucky hit.
>>
>> (gdb) bt
>> #0  0x101db5ba in tracked_request_begin 
>> (req=req@entry=0x3ff90f1bdc0, bs=bs@entry=0x42a39190, offset=offset@entry=0, 
>> bytes=bytes@entry=4096, type=type@entry=BDRV_TRACKED_READ)
>> at /home/cborntra/REPOS/qemu/block/io.c:390
>> #1  0x101de91e in bdrv_co_do_preadv (bs=0x42a39190, offset=0, 
>> bytes=4096, qiov=0x3ff7400cbd8, flags=, flags@entry=(unknown: 
>> 0))
>> at /home/cborntra/REPOS/qemu/block/io.c:1001
>> #2  0x101dfc3e in bdrv_co_do_readv (flags=(unknown: 0), 
>> qiov=, nb_sectors=, sector_num=> out>, bs=)
>> at /home/cborntra/REPOS/qemu/block/io.c:1024
>> #3  bdrv_co_do_rw (opaque=0x3ff7400e370) at 
>> /home/cborntra/REPOS/qemu/block/io.c:2173
>> #4  0x1022d8f6 in coroutine_trampoline (i0=, 
>> i1=-1946150928) at /home/cborntra/REPOS/qemu/util/coroutine-ucontext.c:79
>> #5  0x03ff95ed150a in __makecontext_ret () from /lib64/libc.so.6
>>
>> looking at the code we are at
>>
>> QLIST_INSERT_HEAD(>tracked_requests, req, list);
>> which translates to
>>
>> if (((req)->list.le_next = (>tracked_requests)->lh_first) != NULL) 
>> (>tracked_requests)->lh_first->list.le_prev = &(req)->list.le_next;
>> (>tracked_requests)->lh_first = (req);   
>> (req)->list.le_prev = &(>tracked_requests)->lh_first;
>>
>> gdb says, that (>tracked_requests)->lh_first) is zero in the corefile
>> (gdb) print /x bs->tracked_requests
>> $6 = {lh_first = 0x0}
>>
>> Now looking at the code I am asking myself if this can happen in parallel
>> to another code that touches tracked_requests, because gcc seems to read
>> >tracked_requests)->lh_first twice (first to check the value, then
>> to use it as pointer)
> 
> tracked_requests is protected by AioContext.  Perhaps something is doing
> I/O without acquiring AioContext?

Hmm, the guest was rebooting, which resets all devices. Maybe something
in that code is still not right? I will have a look.
> 
> Luckily there is only 1 place where items are added and removed from
> tracked_requests.  This might make debugging somewhat easier.

I have trouble reproducing the issue, which makes it hard :-/
 


>>
>> 388  qemu_co_queue_init(>wait_queue);
>>0x101db594 <+76>: la  %r2,72(%r13)
>>0x101db598 <+80>: brasl   %r14,0x1022cdc0 
>>
>> 389  
>> 390  QLIST_INSERT_HEAD(>tracked_requests, req, list);
>>0x101db59e <+86>: lg  %r1,12744(%r12) # r1 = 
>> (>tracked_requests)->lh_first)
>>0x101db5a4 <+92>: stg %r1,48(%r13)# 
>> (req)->list.le_next = r1
>>0x101db5aa <+98>: cgij%r1,0,8,0x101db5c0 ---+ # if r1==0 goto
>>0x101db5b0 <+104>:lg  %r1,12744(%r12)   | # r1 = 
>> (>tracked_requests)->lh_first) (again!!)
>>0x101db5b6 <+110>:la  %r2,48(%r13)  | 
>> => 0x101db5ba <+114>:stg %r2,56(%r1)   | # r1==0 
>> bang
>>0x101db5c0 <+120>:stg %r13,12744(%r12)<-+
>>0x101db5c6 <+126>:lay %r12,12744(%r12)
>>0x101db5cc <+132>:stg %r12,56(%r13)
>>
>>
>> Christian
>>




Re: [Qemu-block] [PATCH v3 04/15] block: Move block dirty bitmap code to separate files

2016-03-07 Thread Max Reitz
On 27.02.2016 10:20, Fam Zheng wrote:
> The only code change is making bdrv_dirty_bitmap_truncate public. It is
> used in block.c.
> 
> Also two long lines (bdrv_get_dirty) are wrapped.
> 
> Signed-off-by: Fam Zheng 
> Reviewed-by: John Snow 
> ---
>  block.c  | 360 
>  block/Makefile.objs  |   2 +-
>  block/dirty-bitmap.c | 387 
> +++
>  include/block/block.h|  35 +---
>  include/block/dirty-bitmap.h |  45 +
>  5 files changed, 434 insertions(+), 395 deletions(-)
>  create mode 100644 block/dirty-bitmap.c
>  create mode 100644 include/block/dirty-bitmap.h

[...]

> diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
> new file mode 100644
> index 000..45c6046
> --- /dev/null
> +++ b/include/block/dirty-bitmap.h
> @@ -0,0 +1,45 @@
> +#ifndef BLOCK_DIRTY_BITMAP_H
> +#define BLOCK_DIRTY_BITMAP_H
> +
> +#include "qemu-common.h"
> +#include "qemu/hbitmap.h"
> +
> +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;

Doesn't patch 3 make this superfluous?

If so, I can remove it for you.

Max




signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE

2016-03-07 Thread Jeff Cody
On Mon, Mar 07, 2016 at 07:04:15PM +0100, Niels de Vos wrote:
> GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes
> it possible to detect sparse areas in files.
> 
> Signed-off-by: Niels de Vos 
> 
> --
> Tested by compiling and running "qemu-img map gluster://..." with a
> build of the current master branch of glusterfs. Using a Fedora
> cloud image (in raw format) shows many SEEK procudure calls going back
> and forth over the network. The output of "qemu map" matches the output
> when run against the image on the local filesystem.
> ---
>  block/gluster.c | 159 
> 
>  configure   |  25 +
>  2 files changed, 184 insertions(+)
> 
> diff --git a/block/gluster.c b/block/gluster.c
> index 65077a0..1430010 100644
> --- a/block/gluster.c
> +++ b/block/gluster.c
> @@ -677,6 +677,153 @@ static int qemu_gluster_has_zero_init(BlockDriverState 
> *bs)
>  return 0;
>  }
>  
> +#ifdef CONFIG_GLUSTERFS_SEEK_DATA

Why do we need to make this a compile-time option?  Version checking
is problematic; for instance, different distributions may have
backported bug fixes / features, that are not reflected by the
reported version number, etc..  Ideally, we can determine
functionality during runtime, and behave accordingly.

If SEEK_DATA and SEEK_HOLE are not supported,
qemu_gluster_co_get_block_status can return that sectors are all
allocated (which is what happens in block/io.c anyway if the driver
doesn't support the function).

As long as glfs_lseek() will return error (e.g. EINVAL) for an invalid
whence value,  we can handle it runtime.  Does glfs_lseek() behave
sanely?

> +/*
> + * Find allocation range in @bs around offset @start.
> + * May change underlying file descriptor's file offset.
> + * If @start is not in a hole, store @start in @data, and the
> + * beginning of the next hole in @hole, and return 0.
> + * If @start is in a non-trailing hole, store @start in @hole and the
> + * beginning of the next non-hole in @data, and return 0.
> + * If @start is in a trailing hole or beyond EOF, return -ENXIO.
> + * If we can't find out, return a negative errno other than -ENXIO.
> + *
> + * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
> + */
> +static int find_allocation(BlockDriverState *bs, off_t start,
> +   off_t *data, off_t *hole)
> +{
> +BDRVGlusterState *s = bs->opaque;
> +off_t offs;
> +
> +/*
> + * SEEK_DATA cases:
> + * D1. offs == start: start is in data
> + * D2. offs > start: start is in a hole, next data at offs
> + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
> + *  or start is beyond EOF
> + * If the latter happens, the file has been truncated behind
> + * our back since we opened it.  All bets are off then.
> + * Treating like a trailing hole is simplest.
> + * D4. offs < 0, errno != ENXIO: we learned nothing
> + */
> +offs = glfs_lseek(s->fd, start, SEEK_DATA);
> +if (offs < 0) {
> +return -errno;  /* D3 or D4 */
> +}
> +assert(offs >= start);
> +
> +if (offs > start) {
> +/* D2: in hole, next data at offs */
> +*hole = start;
> +*data = offs;
> +return 0;
> +}
> +
> +/* D1: in data, end not yet known */
> +
> +/*
> + * SEEK_HOLE cases:
> + * H1. offs == start: start is in a hole
> + * If this happens here, a hole has been dug behind our back
> + * since the previous lseek().
> + * H2. offs > start: either start is in data, next hole at offs,
> + *   or start is in trailing hole, EOF at offs
> + * Linux treats trailing holes like any other hole: offs ==
> + * start.  Solaris seeks to EOF instead: offs > start (blech).
> + * If that happens here, a hole has been dug behind our back
> + * since the previous lseek().
> + * H3. offs < 0, errno = ENXIO: start is beyond EOF
> + * If this happens, the file has been truncated behind our
> + * back since we opened it.  Treat it like a trailing hole.
> + * H4. offs < 0, errno != ENXIO: we learned nothing
> + * Pretend we know nothing at all, i.e. "forget" about D1.
> + */
> +offs = glfs_lseek(s->fd, start, SEEK_HOLE);
> +if (offs < 0) {
> +return -errno;  /* D1 and (H3 or H4) */
> +}
> +assert(offs >= start);
> +
> +if (offs > start) {
> +/*
> + * D1 and H2: either in data, next hole at offs, or it was in
> + * data but is now in a trailing hole.  In the latter case,
> + * all bets are off.  Treating it as if it there was data all
> + * the way to EOF is safe, so simply do that.
> + */
> +*data = start;
> +*hole = offs;
> +return 0;
> +}
> +
> +/* D1 and H1 */
> +return -EBUSY;
> +}
> +
> 

Re: [Qemu-block] [PATCH] block: Fix snapshot=on cache modes

2016-03-07 Thread Max Reitz
On 07.03.2016 13:26, Kevin Wolf wrote:
> Since commit 91a097e, we end up with a somewhat weird cache mode
> configuration with snapshot=on: The commit broke the cache mode
> inheritance for the snapshot overlay so that it is opened as
> writethrough instead of unsafe now. The following bdrv_append() call to
> put it on top of the tree swaps the WCE flag with the snapshot's backing
> file (i.e. the originally given file), so what we eventually get is
> cache=writeback on the temporary overlay and
> cache=writethrough,cache.no-flush=on on the real image file.
> 
> This patch changes things so that the temporary overlay gets
> cache=unsafe again like it used to, and the real images get whatever the
> user specified. This means that cache.direct is now respected even with
> snapshot=on, and in the case of committing changes, the final flush is
> no longer ignored except explicitly requested by the user.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block.c   | 34 --
>  blockdev.c|  7 ---
>  include/block/block.h |  1 -
>  3 files changed, 24 insertions(+), 18 deletions(-)
> 
> diff --git a/block.c b/block.c
> index ba24b8e..e3fe8ed 100644
> --- a/block.c
> +++ b/block.c
> @@ -687,13 +687,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
>  }
>  
>  /*
> - * Returns the flags that a temporary snapshot should get, based on the
> - * originally requested flags (the originally requested image will have flags
> - * like a backing file)
> + * Returns the options and flags that a temporary snapshot should get, based 
> on
> + * the originally requested flags (the originally requested image will have
> + * flags like a backing file)
>   */
> -static int bdrv_temp_snapshot_flags(int flags)
> +static void bdrv_temp_snapshot_options(int *child_flags, QDict 
> *child_options,
> +   int parent_flags, QDict 
> *parent_options)
>  {
> -return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
> +*child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
> +
> +/* For temporary files, unconditional cache=unsafe is fine */
> +qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on");
> +qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
> +qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
>  }
>  
>  /*
> @@ -1424,13 +1430,13 @@ done:
>  return c;
>  }
>  
> -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
> +static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
> + QDict *snapshot_options, Error **errp)
>  {
>  /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
>  char *tmp_filename = g_malloc0(PATH_MAX + 1);
>  int64_t total_size;
>  QemuOpts *opts = NULL;
> -QDict *snapshot_options;
>  BlockDriverState *bs_snapshot;
>  Error *local_err = NULL;
>  int ret;
> @@ -1465,7 +1471,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int 
> flags, Error **errp)
>  }
>  
>  /* Prepare a new options QDict for the temporary file */

This comment reads a bit weird now.

Rest looks good and this is not exactly critical, so:

Reviewed-by: Max Reitz 

> -snapshot_options = qdict_new();
>  qdict_put(snapshot_options, "file.driver",
>qstring_from_str("file"));
>  qdict_put(snapshot_options, "file.filename",
> @@ -1477,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int 
> flags, Error **errp)
>  
>  ret = bdrv_open(_snapshot, NULL, NULL, snapshot_options,
>  flags, _err);
> +snapshot_options = NULL;
>  if (ret < 0) {
>  error_propagate(errp, local_err);
>  goto out;
> @@ -1485,6 +1491,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int 
> flags, Error **errp)
>  bdrv_append(bs_snapshot, bs);
>  
>  out:
> +QDECREF(snapshot_options);
>  g_free(tmp_filename);
>  return ret;
>  }
> @@ -1516,6 +1523,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
> const char *filename,
>  const char *drvname;
>  const char *backing;
>  Error *local_err = NULL;
> +QDict *snapshot_options = NULL;
>  int snapshot_flags = 0;
>  
>  assert(pbs);
> @@ -1607,7 +1615,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
> const char *filename,
>  flags |= BDRV_O_ALLOW_RDWR;
>  }
>  if (flags & BDRV_O_SNAPSHOT) {
> -snapshot_flags = bdrv_temp_snapshot_flags(flags);
> +snapshot_options = qdict_new();
> +bdrv_temp_snapshot_options(_flags, snapshot_options,
> +   flags, options);
>  bdrv_backing_options(, options, flags, options);
>  }
>  
> @@ -1709,7 +1719,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
> const char *filename,
>  

[Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE

2016-03-07 Thread Niels de Vos
GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes
it possible to detect sparse areas in files.

Signed-off-by: Niels de Vos 

--
Tested by compiling and running "qemu-img map gluster://..." with a
build of the current master branch of glusterfs. Using a Fedora
cloud image (in raw format) shows many SEEK procudure calls going back
and forth over the network. The output of "qemu map" matches the output
when run against the image on the local filesystem.
---
 block/gluster.c | 159 
 configure   |  25 +
 2 files changed, 184 insertions(+)

diff --git a/block/gluster.c b/block/gluster.c
index 65077a0..1430010 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -677,6 +677,153 @@ static int qemu_gluster_has_zero_init(BlockDriverState 
*bs)
 return 0;
 }
 
+#ifdef CONFIG_GLUSTERFS_SEEK_DATA
+/*
+ * Find allocation range in @bs around offset @start.
+ * May change underlying file descriptor's file offset.
+ * If @start is not in a hole, store @start in @data, and the
+ * beginning of the next hole in @hole, and return 0.
+ * If @start is in a non-trailing hole, store @start in @hole and the
+ * beginning of the next non-hole in @data, and return 0.
+ * If @start is in a trailing hole or beyond EOF, return -ENXIO.
+ * If we can't find out, return a negative errno other than -ENXIO.
+ *
+ * (Shamefully copied from raw-posix.c, only miniscule adaptions.)
+ */
+static int find_allocation(BlockDriverState *bs, off_t start,
+   off_t *data, off_t *hole)
+{
+BDRVGlusterState *s = bs->opaque;
+off_t offs;
+
+/*
+ * SEEK_DATA cases:
+ * D1. offs == start: start is in data
+ * D2. offs > start: start is in a hole, next data at offs
+ * D3. offs < 0, errno = ENXIO: either start is in a trailing hole
+ *  or start is beyond EOF
+ * If the latter happens, the file has been truncated behind
+ * our back since we opened it.  All bets are off then.
+ * Treating like a trailing hole is simplest.
+ * D4. offs < 0, errno != ENXIO: we learned nothing
+ */
+offs = glfs_lseek(s->fd, start, SEEK_DATA);
+if (offs < 0) {
+return -errno;  /* D3 or D4 */
+}
+assert(offs >= start);
+
+if (offs > start) {
+/* D2: in hole, next data at offs */
+*hole = start;
+*data = offs;
+return 0;
+}
+
+/* D1: in data, end not yet known */
+
+/*
+ * SEEK_HOLE cases:
+ * H1. offs == start: start is in a hole
+ * If this happens here, a hole has been dug behind our back
+ * since the previous lseek().
+ * H2. offs > start: either start is in data, next hole at offs,
+ *   or start is in trailing hole, EOF at offs
+ * Linux treats trailing holes like any other hole: offs ==
+ * start.  Solaris seeks to EOF instead: offs > start (blech).
+ * If that happens here, a hole has been dug behind our back
+ * since the previous lseek().
+ * H3. offs < 0, errno = ENXIO: start is beyond EOF
+ * If this happens, the file has been truncated behind our
+ * back since we opened it.  Treat it like a trailing hole.
+ * H4. offs < 0, errno != ENXIO: we learned nothing
+ * Pretend we know nothing at all, i.e. "forget" about D1.
+ */
+offs = glfs_lseek(s->fd, start, SEEK_HOLE);
+if (offs < 0) {
+return -errno;  /* D1 and (H3 or H4) */
+}
+assert(offs >= start);
+
+if (offs > start) {
+/*
+ * D1 and H2: either in data, next hole at offs, or it was in
+ * data but is now in a trailing hole.  In the latter case,
+ * all bets are off.  Treating it as if it there was data all
+ * the way to EOF is safe, so simply do that.
+ */
+*data = start;
+*hole = offs;
+return 0;
+}
+
+/* D1 and H1 */
+return -EBUSY;
+}
+
+/*
+ * Returns the allocation status of the specified sectors.
+ *
+ * If 'sector_num' is beyond the end of the disk image the return value is 0
+ * and 'pnum' is set to 0.
+ *
+ * 'pnum' is set to the number of sectors (including and immediately following
+ * the specified sector) that are known to be in the same
+ * allocated/unallocated state.
+ *
+ * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
+ * beyond the end of the disk image it will be clamped.
+ *
+ * (Based on raw_co_get_block_status() from raw-posix.c.)
+ */
+static int64_t coroutine_fn qemu_gluster_co_get_block_status(
+BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum)
+{
+BDRVGlusterState *s = bs->opaque;
+off_t start, data = 0, hole = 0;
+int64_t total_size;
+int ret = -EINVAL;
+
+if (!s->fd) {
+return ret;
+}
+
+start = sector_num * BDRV_SECTOR_SIZE;
+total_size = 

Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain

2016-03-07 Thread Kevin Wolf
Am 23.02.2016 um 14:54 hat Paolo Bonzini geschrieben:
> 
> 
> On 23/02/2016 13:49, Fam Zheng wrote:
> > On Tue, 02/23 11:43, Paolo Bonzini wrote:
> >>
> >>
> >> On 23/02/2016 06:57, Fam Zheng wrote:
> >> +qed_cancel_need_check_timer(s);
> >> +qed_need_check_timer_cb(s);
> >> +}
> >
> > What if an allocating write is queued (the else branch case)? Its 
> > completion
> > will be in bdrv_drain and it could arm the need_check_timer which is 
> > wrong.
> >
> > We need to drain the allocating_write_reqs queue before checking the 
> > timer.
> 
>  You're right, but how?  That's what bdrv_drain(bs) does, it's a
>  chicken-and-egg problem.
> >>>
> >>> Maybe use an aio_poll loop before the if?
> >>
> >> That would not change the fact that you're reimplementing bdrv_drain
> >> inside bdrv_qed_drain.
> > 
> > But it fulfills the contract of .bdrv_drain. This is the easy way, the hard 
> > way
> > would be iterating through the allocating_write_reqs list and process reqs 
> > one
> > by one synchronously, which still involves aio_poll indirectly.
> 
> The easy way would be better then.
> 
> Stefan, any second opinion?

What's the status here? It would be good to have qed not segfaulting all
the time.

Kevin



Re: [Qemu-block] strange crash in tracked_request_begin

2016-03-07 Thread Stefan Hajnoczi
On Mon, Mar 07, 2016 at 01:29:08PM +0100, Christian Borntraeger wrote:
> Folks,
> 
> I had a crash of a qemu guest in tracked_request_begin.
> The testcase was a guest with ramdisk/kernel that reboots in a 
> loop. (about 10 times per second) with a single null-co disk 
> attached. No idea how to reproduce this, seems to be a lucky hit.
> 
> (gdb) bt
> #0  0x101db5ba in tracked_request_begin (req=req@entry=0x3ff90f1bdc0, 
> bs=bs@entry=0x42a39190, offset=offset@entry=0, bytes=bytes@entry=4096, 
> type=type@entry=BDRV_TRACKED_READ)
> at /home/cborntra/REPOS/qemu/block/io.c:390
> #1  0x101de91e in bdrv_co_do_preadv (bs=0x42a39190, offset=0, 
> bytes=4096, qiov=0x3ff7400cbd8, flags=, flags@entry=(unknown: 
> 0))
> at /home/cborntra/REPOS/qemu/block/io.c:1001
> #2  0x101dfc3e in bdrv_co_do_readv (flags=(unknown: 0), 
> qiov=, nb_sectors=, sector_num=, 
> bs=)
> at /home/cborntra/REPOS/qemu/block/io.c:1024
> #3  bdrv_co_do_rw (opaque=0x3ff7400e370) at 
> /home/cborntra/REPOS/qemu/block/io.c:2173
> #4  0x1022d8f6 in coroutine_trampoline (i0=, 
> i1=-1946150928) at /home/cborntra/REPOS/qemu/util/coroutine-ucontext.c:79
> #5  0x03ff95ed150a in __makecontext_ret () from /lib64/libc.so.6
> 
> looking at the code we are at
> 
> QLIST_INSERT_HEAD(>tracked_requests, req, list);
> which translates to
> 
> if (((req)->list.le_next = (>tracked_requests)->lh_first) != NULL) 
> (>tracked_requests)->lh_first->list.le_prev = &(req)->list.le_next;
> (>tracked_requests)->lh_first = (req);   
> (req)->list.le_prev = &(>tracked_requests)->lh_first;
> 
> gdb says, that (>tracked_requests)->lh_first) is zero in the corefile
> (gdb) print /x bs->tracked_requests
> $6 = {lh_first = 0x0}
> 
> Now looking at the code I am asking myself if this can happen in parallel
> to another code that touches tracked_requests, because gcc seems to read
> >tracked_requests)->lh_first twice (first to check the value, then
> to use it as pointer)

tracked_requests is protected by AioContext.  Perhaps something is doing
I/O without acquiring AioContext?

Luckily there is only 1 place where items are added and removed from
tracked_requests.  This might make debugging somewhat easier.

> 
> 388   qemu_co_queue_init(>wait_queue);
>0x101db594 <+76>:  la  %r2,72(%r13)
>0x101db598 <+80>:  brasl   %r14,0x1022cdc0 
> 
> 389   
> 390   QLIST_INSERT_HEAD(>tracked_requests, req, list);
>0x101db59e <+86>:  lg  %r1,12744(%r12) # r1 = 
> (>tracked_requests)->lh_first)
>0x101db5a4 <+92>:  stg %r1,48(%r13)# 
> (req)->list.le_next = r1
>0x101db5aa <+98>:  cgij%r1,0,8,0x101db5c0 ---+ # if r1==0 goto
>0x101db5b0 <+104>: lg  %r1,12744(%r12)   | # r1 = 
> (>tracked_requests)->lh_first) (again!!)
>0x101db5b6 <+110>: la  %r2,48(%r13)  | 
> => 0x101db5ba <+114>: stg %r2,56(%r1)   | # r1==0 bang
>0x101db5c0 <+120>: stg %r13,12744(%r12)<-+
>0x101db5c6 <+126>: lay %r12,12744(%r12)
>0x101db5cc <+132>: stg %r12,56(%r13)
> 
> 
> Christian
> 


signature.asc
Description: PGP signature


Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-03-07 Thread Max Reitz
On 07.03.2016 17:02, Eric Blake wrote:
> On 03/05/2016 11:13 AM, Max Reitz wrote:
> 
>>> +index = atoi(child->name + 9);
>>
>> Optional: Assert absence of an error:
>>
> 
> Indeed, atoi() is worthless, because it cannot do error detection.
> 
>> unsigned long index;
>> char *endptr;
>>
>> index = strtoul(child->name + 9, , 10);
>> assert(index >= 0 && !*endptr);
> 
> Still incorrect; you aren't handling errno properly for detecting all
> errors.  Even better is to use qemu_strtoul(), which already handles
> proper error detection.

Yeah, I keep forgetting that it returns ULONG_MAX on range error...

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-03-07 Thread Eric Blake
On 03/05/2016 11:13 AM, Max Reitz wrote:

>> +index = atoi(child->name + 9);
> 
> Optional: Assert absence of an error:
> 

Indeed, atoi() is worthless, because it cannot do error detection.

> unsigned long index;
> char *endptr;
> 
> index = strtoul(child->name + 9, , 10);
> assert(index >= 0 && !*endptr);

Still incorrect; you aren't handling errno properly for detecting all
errors.  Even better is to use qemu_strtoul(), which already handles
proper error detection.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH] block: Fix cache mode defaults in bds_tree_init()

2016-03-07 Thread Eric Blake
On 03/07/2016 06:42 AM, Kevin Wolf wrote:
> Without setting explicit defaults in the options, blockdev-add without
> an ID ended up defaulting to writethrough. It should be writeback as
> documented.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  blockdev.c | 7 +++
>  1 file changed, 7 insertions(+)
> 

Reviewed-by: Eric Blake 

> diff --git a/blockdev.c b/blockdev.c
> index eecd78d..1824cae 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -675,6 +675,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, 
> Error **errp)
>  goto fail;
>  }
>  
> +/* bdrv_open() defaults to the values in bdrv_flags (for compatibility
> + * with other callers) rather than what we want as the real defaults.
> + * Apply the defaults here instead. */
> +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on");
> +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
> +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
> +
>  if (runstate_check(RUN_STATE_INMIGRATE)) {
>  bdrv_flags |= BDRV_O_INACTIVE;
>  }
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-block] strange crash in tracked_request_begin

2016-03-07 Thread Christian Borntraeger
Folks,

I had a crash of a qemu guest in tracked_request_begin.
The testcase was a guest with ramdisk/kernel that reboots in a 
loop. (about 10 times per second) with a single null-co disk 
attached. No idea how to reproduce this, seems to be a lucky hit.

(gdb) bt
#0  0x101db5ba in tracked_request_begin (req=req@entry=0x3ff90f1bdc0, 
bs=bs@entry=0x42a39190, offset=offset@entry=0, bytes=bytes@entry=4096, 
type=type@entry=BDRV_TRACKED_READ)
at /home/cborntra/REPOS/qemu/block/io.c:390
#1  0x101de91e in bdrv_co_do_preadv (bs=0x42a39190, offset=0, 
bytes=4096, qiov=0x3ff7400cbd8, flags=, flags@entry=(unknown: 0))
at /home/cborntra/REPOS/qemu/block/io.c:1001
#2  0x101dfc3e in bdrv_co_do_readv (flags=(unknown: 0), qiov=, nb_sectors=, sector_num=, bs=)
at /home/cborntra/REPOS/qemu/block/io.c:1024
#3  bdrv_co_do_rw (opaque=0x3ff7400e370) at 
/home/cborntra/REPOS/qemu/block/io.c:2173
#4  0x1022d8f6 in coroutine_trampoline (i0=, 
i1=-1946150928) at /home/cborntra/REPOS/qemu/util/coroutine-ucontext.c:79
#5  0x03ff95ed150a in __makecontext_ret () from /lib64/libc.so.6

looking at the code we are at

QLIST_INSERT_HEAD(>tracked_requests, req, list);
which translates to

if (((req)->list.le_next = (>tracked_requests)->lh_first) != NULL) 
(>tracked_requests)->lh_first->list.le_prev = &(req)->list.le_next;
(>tracked_requests)->lh_first = (req);   
(req)->list.le_prev = &(>tracked_requests)->lh_first;

gdb says, that (>tracked_requests)->lh_first) is zero in the corefile
(gdb) print /x bs->tracked_requests
$6 = {lh_first = 0x0}

Now looking at the code I am asking myself if this can happen in parallel
to another code that touches tracked_requests, because gcc seems to read
>tracked_requests)->lh_first twice (first to check the value, then
to use it as pointer)

388 qemu_co_queue_init(>wait_queue);
   0x101db594 <+76>:la  %r2,72(%r13)
   0x101db598 <+80>:brasl   %r14,0x1022cdc0 

389 
390 QLIST_INSERT_HEAD(>tracked_requests, req, list);
   0x101db59e <+86>:lg  %r1,12744(%r12) # r1 = 
(>tracked_requests)->lh_first)
   0x101db5a4 <+92>:stg %r1,48(%r13)# 
(req)->list.le_next = r1
   0x101db5aa <+98>:cgij%r1,0,8,0x101db5c0 ---+ # if r1==0 goto
   0x101db5b0 <+104>:   lg  %r1,12744(%r12)   | # r1 = 
(>tracked_requests)->lh_first) (again!!)
   0x101db5b6 <+110>:   la  %r2,48(%r13)  | 
=> 0x101db5ba <+114>:   stg %r2,56(%r1)   | # r1==0 bang
   0x101db5c0 <+120>:   stg %r13,12744(%r12)<-+
   0x101db5c6 <+126>:   lay %r12,12744(%r12)
   0x101db5cc <+132>:   stg %r12,56(%r13)


Christian




[Qemu-block] [PATCH] block: Fix cache mode defaults in bds_tree_init()

2016-03-07 Thread Kevin Wolf
Without setting explicit defaults in the options, blockdev-add without
an ID ended up defaulting to writethrough. It should be writeback as
documented.

Signed-off-by: Kevin Wolf 
---
 blockdev.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index eecd78d..1824cae 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -675,6 +675,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, 
Error **errp)
 goto fail;
 }
 
+/* bdrv_open() defaults to the values in bdrv_flags (for compatibility
+ * with other callers) rather than what we want as the real defaults.
+ * Apply the defaults here instead. */
+qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on");
+qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off");
+qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off");
+
 if (runstate_check(RUN_STATE_INMIGRATE)) {
 bdrv_flags |= BDRV_O_INACTIVE;
 }
-- 
1.8.3.1




Re: [Qemu-block] strange crash in tracked_request_begin

2016-03-07 Thread Paolo Bonzini


On 07/03/2016 13:29, Christian Borntraeger wrote:
> Folks,
> 
> I had a crash of a qemu guest in tracked_request_begin.
> The testcase was a guest with ramdisk/kernel that reboots in a 
> loop. (about 10 times per second) with a single null-co disk 
> attached.

Does it use a separate iothread?

Paolo



[Qemu-block] [PATCH] block: Fix snapshot=on cache modes

2016-03-07 Thread Kevin Wolf
Since commit 91a097e, we end up with a somewhat weird cache mode
configuration with snapshot=on: The commit broke the cache mode
inheritance for the snapshot overlay so that it is opened as
writethrough instead of unsafe now. The following bdrv_append() call to
put it on top of the tree swaps the WCE flag with the snapshot's backing
file (i.e. the originally given file), so what we eventually get is
cache=writeback on the temporary overlay and
cache=writethrough,cache.no-flush=on on the real image file.

This patch changes things so that the temporary overlay gets
cache=unsafe again like it used to, and the real images get whatever the
user specified. This means that cache.direct is now respected even with
snapshot=on, and in the case of committing changes, the final flush is
no longer ignored except explicitly requested by the user.

Signed-off-by: Kevin Wolf 
---
 block.c   | 34 --
 blockdev.c|  7 ---
 include/block/block.h |  1 -
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/block.c b/block.c
index ba24b8e..e3fe8ed 100644
--- a/block.c
+++ b/block.c
@@ -687,13 +687,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags)
 }
 
 /*
- * Returns the flags that a temporary snapshot should get, based on the
- * originally requested flags (the originally requested image will have flags
- * like a backing file)
+ * Returns the options and flags that a temporary snapshot should get, based on
+ * the originally requested flags (the originally requested image will have
+ * flags like a backing file)
  */
-static int bdrv_temp_snapshot_flags(int flags)
+static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options,
+   int parent_flags, QDict *parent_options)
 {
-return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+*child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY;
+
+/* For temporary files, unconditional cache=unsafe is fine */
+qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on");
+qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off");
+qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on");
 }
 
 /*
@@ -1424,13 +1430,13 @@ done:
 return c;
 }
 
-int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp)
+static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags,
+ QDict *snapshot_options, Error **errp)
 {
 /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */
 char *tmp_filename = g_malloc0(PATH_MAX + 1);
 int64_t total_size;
 QemuOpts *opts = NULL;
-QDict *snapshot_options;
 BlockDriverState *bs_snapshot;
 Error *local_err = NULL;
 int ret;
@@ -1465,7 +1471,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int 
flags, Error **errp)
 }
 
 /* Prepare a new options QDict for the temporary file */
-snapshot_options = qdict_new();
 qdict_put(snapshot_options, "file.driver",
   qstring_from_str("file"));
 qdict_put(snapshot_options, "file.filename",
@@ -1477,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int 
flags, Error **errp)
 
 ret = bdrv_open(_snapshot, NULL, NULL, snapshot_options,
 flags, _err);
+snapshot_options = NULL;
 if (ret < 0) {
 error_propagate(errp, local_err);
 goto out;
@@ -1485,6 +1491,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int 
flags, Error **errp)
 bdrv_append(bs_snapshot, bs);
 
 out:
+QDECREF(snapshot_options);
 g_free(tmp_filename);
 return ret;
 }
@@ -1516,6 +1523,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
const char *filename,
 const char *drvname;
 const char *backing;
 Error *local_err = NULL;
+QDict *snapshot_options = NULL;
 int snapshot_flags = 0;
 
 assert(pbs);
@@ -1607,7 +1615,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
const char *filename,
 flags |= BDRV_O_ALLOW_RDWR;
 }
 if (flags & BDRV_O_SNAPSHOT) {
-snapshot_flags = bdrv_temp_snapshot_flags(flags);
+snapshot_options = qdict_new();
+bdrv_temp_snapshot_options(_flags, snapshot_options,
+   flags, options);
 bdrv_backing_options(, options, flags, options);
 }
 
@@ -1709,7 +1719,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, 
const char *filename,
 /* For snapshot=on, create a temporary qcow2 overlay. bs points to the
  * temporary snapshot afterwards. */
 if (snapshot_flags) {
-ret = bdrv_append_temp_snapshot(bs, snapshot_flags, _err);
+ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options,
+_err);
+snapshot_options = NULL;
 if (local_err) {
 goto