Re: [Qemu-block] [Qemu-devel] [PATCH] vmdk: Switch to heap arrays for vmdk_write_cid
On Tue, 03/08 15:10, Peter Xu wrote: > On Tue, Mar 08, 2016 at 02:18:35PM +0800, Fam Zheng wrote: > > It is only called once for each opened image, so we can do it the easy > > way. > > > > Signed-off-by: Fam Zheng> > --- > > block/vmdk.c | 25 ++--- > > 1 file changed, 14 insertions(+), 11 deletions(-) > > > > diff --git a/block/vmdk.c b/block/vmdk.c > > index a8db5d9..1ec2452 100644 > > --- a/block/vmdk.c > > +++ b/block/vmdk.c > > @@ -274,36 +274,39 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, > > int parent) > > > > static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) > > If to use heap in write, do we need to do the same for e.g., > vmdk_parent_open() and vmdk_read_cid(), to make all things at least > aligned? Yes, let's change them altogether. Fam
[Qemu-block] [PATCH] vmdk: Switch to heap arrays for vmdk_write_cid
It is only called once for each opened image, so we can do it the easy way. Signed-off-by: Fam Zheng--- block/vmdk.c | 25 ++--- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a8db5d9..1ec2452 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -274,36 +274,39 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) { -char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; +char *desc, *tmp_desc; char *p_name, *tmp_str; BDRVVmdkState *s = bs->opaque; -int ret; +int ret = 0; +desc = g_malloc0(DESC_SIZE); +tmp_desc = g_malloc0(DESC_SIZE); ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { -return ret; +goto out; } desc[DESC_SIZE - 1] = '\0'; tmp_str = strstr(desc, "parentCID"); if (tmp_str == NULL) { -return -EINVAL; +ret = -EINVAL; +goto out; } -pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); +pstrcpy(tmp_desc, DESC_SIZE, tmp_str); p_name = strstr(desc, "CID"); if (p_name != NULL) { p_name += sizeof("CID"); -snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); -pstrcat(desc, sizeof(desc), tmp_desc); +snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid); +pstrcat(desc, DESC_SIZE, tmp_desc); } ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); -if (ret < 0) { -return ret; -} -return 0; +out: +g_free(desc); +g_free(tmp_desc); +return ret; } static int vmdk_is_cid_valid(BlockDriverState *bs) -- 2.4.3
Re: [Qemu-block] [PATCH v4 08/26] crypto: add support for the twofish cipher algorithm
On Mon, 02/29 12:00, Daniel P. Berrange wrote: > New cipher algorithms 'twofish-128', 'twofish-192' and > 'twofish-256' are defined for the Twofish algorithm. > The gcrypt backend does not support 'twofish-192'. > > The nettle and gcrypt cipher backends are updated to > support the new cipher and a test vector added to the > cipher test suite. The new algorithm is enabled in the > LUKS block encryption driver. > > Signed-off-by: Daniel P. Berrange> --- > crypto/cipher-gcrypt.c | 12 > crypto/cipher-nettle.c | 30 ++ > crypto/cipher.c| 6 ++ > qapi/crypto.json | 6 +- > tests/test-crypto-cipher.c | 29 + > 5 files changed, 82 insertions(+), 1 deletion(-) > > diff --git a/crypto/cipher-gcrypt.c b/crypto/cipher-gcrypt.c > index a667d59..ce26456 100644 > --- a/crypto/cipher-gcrypt.c > +++ b/crypto/cipher-gcrypt.c > @@ -33,6 +33,8 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg) > case QCRYPTO_CIPHER_ALG_SERPENT_128: > case QCRYPTO_CIPHER_ALG_SERPENT_192: > case QCRYPTO_CIPHER_ALG_SERPENT_256: > +case QCRYPTO_CIPHER_ALG_TWOFISH_128: > +case QCRYPTO_CIPHER_ALG_TWOFISH_256: > return true; > default: > return false; > @@ -104,6 +106,14 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm > alg, > gcryalg = GCRY_CIPHER_SERPENT256; > break; > > +case QCRYPTO_CIPHER_ALG_TWOFISH_128: > +gcryalg = GCRY_CIPHER_TWOFISH128; > +break; > + > +case QCRYPTO_CIPHER_ALG_TWOFISH_256: > +gcryalg = GCRY_CIPHER_TWOFISH; > +break; > + > default: > error_setg(errp, "Unsupported cipher algorithm %d", alg); > return NULL; > @@ -140,6 +150,8 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm > alg, > case QCRYPTO_CIPHER_ALG_SERPENT_128: > case QCRYPTO_CIPHER_ALG_SERPENT_192: > case QCRYPTO_CIPHER_ALG_SERPENT_256: > +case QCRYPTO_CIPHER_ALG_TWOFISH_128: > +case QCRYPTO_CIPHER_ALG_TWOFISH_256: > ctx->blocksize = 16; > break; > case QCRYPTO_CIPHER_ALG_CAST5_128: > diff --git a/crypto/cipher-nettle.c b/crypto/cipher-nettle.c > index 74b55ab..9b057fd 100644 > --- a/crypto/cipher-nettle.c > +++ b/crypto/cipher-nettle.c > @@ -25,6 +25,7 @@ > #include > #include > #include > +#include > > #if CONFIG_NETTLE_VERSION_MAJOR < 3 > typedef nettle_crypt_func nettle_cipher_func; > @@ -89,6 +90,18 @@ static void serpent_decrypt_wrapper(cipher_ctx_t ctx, > cipher_length_t length, > serpent_decrypt(ctx, length, dst, src); > } > > +static void twofish_encrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length, > +uint8_t *dst, const uint8_t *src) > +{ > +twofish_encrypt(ctx, length, dst, src); > +} > + > +static void twofish_decrypt_wrapper(cipher_ctx_t ctx, cipher_length_t length, > +uint8_t *dst, const uint8_t *src) > +{ > +twofish_decrypt(ctx, length, dst, src); > +} > + > typedef struct QCryptoCipherNettle QCryptoCipherNettle; > struct QCryptoCipherNettle { > void *ctx_encrypt; > @@ -110,6 +123,9 @@ bool qcrypto_cipher_supports(QCryptoCipherAlgorithm alg) > case QCRYPTO_CIPHER_ALG_SERPENT_128: > case QCRYPTO_CIPHER_ALG_SERPENT_192: > case QCRYPTO_CIPHER_ALG_SERPENT_256: > +case QCRYPTO_CIPHER_ALG_TWOFISH_128: > +case QCRYPTO_CIPHER_ALG_TWOFISH_192: > +case QCRYPTO_CIPHER_ALG_TWOFISH_256: > return true; > default: > return false; > @@ -200,6 +216,20 @@ QCryptoCipher *qcrypto_cipher_new(QCryptoCipherAlgorithm > alg, > ctx->blocksize = SERPENT_BLOCK_SIZE; > break; > > +case QCRYPTO_CIPHER_ALG_TWOFISH_128: > +case QCRYPTO_CIPHER_ALG_TWOFISH_192: > +case QCRYPTO_CIPHER_ALG_TWOFISH_256: > +ctx->ctx_encrypt = g_new0(struct twofish_ctx, 1); > +ctx->ctx_decrypt = NULL; /* 1 ctx can do both */ > + > +twofish_set_key(ctx->ctx_encrypt, nkey, key); > + > +ctx->alg_encrypt = twofish_encrypt_wrapper; > +ctx->alg_decrypt = twofish_decrypt_wrapper; > + > +ctx->blocksize = TWOFISH_BLOCK_SIZE; > +break; > + > default: > error_setg(errp, "Unsupported cipher algorithm %d", alg); > goto error; > diff --git a/crypto/cipher.c b/crypto/cipher.c > index 0f6fe98..89fa5a2 100644 > --- a/crypto/cipher.c > +++ b/crypto/cipher.c > @@ -31,6 +31,9 @@ static size_t alg_key_len[QCRYPTO_CIPHER_ALG__MAX] = { > [QCRYPTO_CIPHER_ALG_SERPENT_128] = 16, > [QCRYPTO_CIPHER_ALG_SERPENT_192] = 24, > [QCRYPTO_CIPHER_ALG_SERPENT_256] = 32, > +[QCRYPTO_CIPHER_ALG_TWOFISH_128] = 16, > +[QCRYPTO_CIPHER_ALG_TWOFISH_192] = 24, > +[QCRYPTO_CIPHER_ALG_TWOFISH_256] = 32, > }; > > static size_t alg_block_len[QCRYPTO_CIPHER_ALG__MAX] = { > @@ -42,6
[Qemu-block] [PATCH v4 15/15] block: More operations for meta dirty bitmap
Callers can create an iterator of meta bitmap with bdrv_dirty_meta_iter_new(), then use the bdrv_dirty_iter_* operations on it. Meta iterators are also counted by bitmap->active_iterators. Also add a couple of functions to retrieve granularity and count. Signed-off-by: Fam Zheng--- block/dirty-bitmap.c | 19 +++ include/block/dirty-bitmap.h | 3 +++ 2 files changed, 22 insertions(+) diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 3aef91a..3992eb5 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -393,6 +393,11 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); } +uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap) +{ +return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->meta); +} + BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, uint64_t first_sector) { @@ -403,6 +408,15 @@ BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, return iter; } +BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap) +{ +BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1); +hbitmap_iter_init(>hbi, bitmap->meta, 0); +iter->bitmap = bitmap; +bitmap->active_iterators++; +return iter; +} + void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter) { if (!iter) { @@ -514,3 +528,8 @@ int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) { return hbitmap_count(bitmap->bitmap); } + +int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap) +{ +return hbitmap_count(bitmap->meta); +} diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 40a09c0..3cbed02 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -30,6 +30,7 @@ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap); BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs); uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); +uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap); @@ -47,12 +48,14 @@ int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector, int nb_sectors); +BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap); BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, uint64_t first_sector); void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num); int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap); void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, -- 2.4.3
[Qemu-block] [PATCH v4 12/15] hbitmap: serialization
From: Vladimir Sementsov-OgievskiyFunctions to serialize / deserialize(restore) HBitmap. HBitmap should be saved to linear sequence of bits independently of endianness and bitmap array element (unsigned long) size. Therefore Little Endian is chosen. These functions are appropriate for dirty bitmap migration, restoring the bitmap in several steps is available. To save performance, every step writes only the last level of the bitmap. All other levels are restored by hbitmap_deserialize_finish() as a last step of restoring. So, HBitmap is inconsistent while restoring. Signed-off-by: Vladimir Sementsov-Ogievskiy [Fix left shift operand to 1UL; add "finish" parameter. - Fam] Signed-off-by: Fam Zheng --- include/qemu/hbitmap.h | 79 util/hbitmap.c | 137 + 2 files changed, 216 insertions(+) diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index f8ed058..26cac7d 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -146,6 +146,85 @@ void hbitmap_reset_all(HBitmap *hb); bool hbitmap_get(const HBitmap *hb, uint64_t item); /** + * hbitmap_serialization_granularity: + * @hb: HBitmap to operate on. + * + * Granularity of serialization chunks, used by other serialization functions. + * For every chunk: + * 1. Chunk start should be aligned to this granularity. + * 2. Chunk size should be aligned too, except for last chunk (for which + * start + count == hb->size) + */ +uint64_t hbitmap_serialization_granularity(const HBitmap *hb); + +/** + * hbitmap_serialization_size: + * @hb: HBitmap to operate on. + * @start: Starting bit + * @count: Number of bits + * + * Return number of bytes hbitmap_(de)serialize_part needs + */ +uint64_t hbitmap_serialization_size(const HBitmap *hb, +uint64_t start, uint64_t count); + +/** + * hbitmap_serialize_part + * @hb: HBitmap to operate on. + * @buf: Buffer to store serialized bitmap. + * @start: First bit to store. + * @count: Number of bits to store. + * + * Stores HBitmap data corresponding to given region. The format of saved data + * is linear sequence of bits, so it can be used by hbitmap_deserialize_part + * independently of endianness and size of HBitmap level array elements + */ +void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf, +uint64_t start, uint64_t count); + +/** + * hbitmap_deserialize_part + * @hb: HBitmap to operate on. + * @buf: Buffer to restore bitmap data from. + * @start: First bit to restore. + * @count: Number of bits to restore. + * @finish: Whether to call hbitmap_deserialize_finish automatically. + * + * Restores HBitmap data corresponding to given region. The format is the same + * as for hbitmap_serialize_part. + * + * If @finish is false, caller must call hbitmap_serialize_finish before using + * the bitmap. + */ +void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf, + uint64_t start, uint64_t count, + bool finish); + +/** + * hbitmap_deserialize_zeroes + * @hb: HBitmap to operate on. + * @start: First bit to restore. + * @count: Number of bits to restore. + * @finish: Whether to call hbitmap_deserialize_finish automatically. + * + * Fills the bitmap with zeroes. + * + * If @finish is false, caller must call hbitmap_serialize_finish before using + * the bitmap. + */ +void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count, +bool finish); + +/** + * hbitmap_deserialize_finish + * @hb: HBitmap to operate on. + * + * Repair HBitmap after calling hbitmap_deserialize_data. Actually, all HBitmap + * layers are restored here. + */ +void hbitmap_deserialize_finish(HBitmap *hb); + +/** * hbitmap_free: * @hb: HBitmap to operate on. * diff --git a/util/hbitmap.c b/util/hbitmap.c index 2d3d04c..5f02c17 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -395,6 +395,143 @@ bool hbitmap_get(const HBitmap *hb, uint64_t item) return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0; } +uint64_t hbitmap_serialization_granularity(const HBitmap *hb) +{ +/* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit + * hosts. */ +return 64 << hb->granularity; +} + +/* Start should be aligned to serialization granularity, chunk size should be + * aligned to serialization granularity too, except for last chunk. + */ +static void serialization_chunk(const HBitmap *hb, +uint64_t start, uint64_t count, +unsigned long **first_el, size_t *el_count) +{ +uint64_t last = start + count - 1; +uint64_t gran = hbitmap_serialization_granularity(hb); + +assert((start & (gran - 1)) == 0); +assert((last >> hb->granularity) < hb->size); +if ((last >> hb->granularity) !=
[Qemu-block] [PATCH v4 13/15] block: BdrvDirtyBitmap serialization interface
From: Vladimir Sementsov-OgievskiySeveral functions to provide necessary access to BdrvDirtyBitmap for block-migration.c Signed-off-by: Vladimir Sementsov-Ogievskiy [Add the "finish" parameters. - Fam] Signed-off-by: Fam Zheng Reviewed-by: John Snow --- block/dirty-bitmap.c | 37 + include/block/dirty-bitmap.h | 14 ++ 2 files changed, 51 insertions(+) diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 295788d..3aef91a 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -453,6 +453,43 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) hbitmap_free(tmp); } +uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, + uint64_t start, uint64_t count) +{ +return hbitmap_serialization_size(bitmap->bitmap, start, count); +} + +uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap) +{ +return hbitmap_serialization_granularity(bitmap->bitmap); +} + +void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap, + uint8_t *buf, uint64_t start, + uint64_t count) +{ +hbitmap_serialize_part(bitmap->bitmap, buf, start, count); +} + +void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap, +uint8_t *buf, uint64_t start, +uint64_t count, bool finish) +{ +hbitmap_deserialize_part(bitmap->bitmap, buf, start, count, finish); +} + +void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, + uint64_t start, uint64_t count, + bool finish) +{ +hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish); +} + +void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap) +{ +hbitmap_deserialize_finish(bitmap->bitmap); +} + void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index caa4d82..40a09c0 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -55,4 +55,18 @@ void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num); int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); +uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, + uint64_t start, uint64_t count); +uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap, + uint8_t *buf, uint64_t start, + uint64_t count); +void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap, +uint8_t *buf, uint64_t start, +uint64_t count, bool finish); +void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, + uint64_t start, uint64_t count, + bool finish); +void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap); + #endif -- 2.4.3
[Qemu-block] [PATCH v4 06/15] block: Hide HBitmap in block dirty bitmap interface
HBitmap is an implementation detail of block dirty bitmap that should be hidden from users. Introduce a BdrvDirtyBitmapIter to encapsulate the underlying HBitmapIter. A small difference in the interface is, before, an HBitmapIter is initialized in place, now the new BdrvDirtyBitmapIter must be dynamically allocated because the structure definition is in block.c. Two current users are converted too. Signed-off-by: Fam Zheng--- block/backup.c | 14 -- block/dirty-bitmap.c | 39 +-- block/mirror.c | 15 +-- include/block/dirty-bitmap.h | 7 +-- include/qemu/typedefs.h | 1 + 5 files changed, 56 insertions(+), 20 deletions(-) diff --git a/block/backup.c b/block/backup.c index ab3e345..f5a5e03 100644 --- a/block/backup.c +++ b/block/backup.c @@ -330,14 +330,14 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) int64_t last_cluster = -1; int64_t sectors_per_cluster = cluster_size_sectors(job); BlockDriverState *bs = job->common.bs; -HBitmapIter hbi; +BdrvDirtyBitmapIter *dbi; granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); clusters_per_iter = MAX((granularity / job->cluster_size), 1); -bdrv_dirty_iter_init(job->sync_bitmap, ); +dbi = bdrv_dirty_iter_new(job->sync_bitmap, 0); /* Find the next dirty sector(s) */ -while ((sector = hbitmap_iter_next()) != -1) { +while ((sector = bdrv_dirty_iter_next(dbi)) != -1) { cluster = sector / sectors_per_cluster; /* Fake progress updates for any clusters we skipped */ @@ -349,7 +349,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) for (end = cluster + clusters_per_iter; cluster < end; cluster++) { do { if (yield_and_check(job)) { -return ret; +goto out; } ret = backup_do_cow(bs, cluster * sectors_per_cluster, sectors_per_cluster, _is_read, @@ -357,7 +357,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) if ((ret < 0) && backup_error_action(job, error_is_read, -ret) == BLOCK_ERROR_ACTION_REPORT) { -return ret; +goto out; } } while (ret < 0); } @@ -365,7 +365,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) /* If the bitmap granularity is smaller than the backup granularity, * we need to advance the iterator pointer to the next cluster. */ if (granularity < job->cluster_size) { -bdrv_set_dirty_iter(, cluster * sectors_per_cluster); +bdrv_set_dirty_iter(dbi, cluster * sectors_per_cluster); } last_cluster = cluster - 1; @@ -377,6 +377,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) job->common.offset += ((end - last_cluster - 1) * job->cluster_size); } +out: +bdrv_dirty_iter_free(dbi); return ret; } diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 556e1d1..16f73b2 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -42,9 +42,15 @@ struct BdrvDirtyBitmap { char *name; /* Optional non-empty unique ID */ int64_t size; /* Size of the bitmap (Number of sectors) */ bool disabled; /* Bitmap is read-only */ +int active_iterators; /* How many iterators are active */ QLIST_ENTRY(BdrvDirtyBitmap) list; }; +struct BdrvDirtyBitmapIter { +HBitmapIter hbi; +BdrvDirtyBitmap *bitmap; +}; + BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) { BdrvDirtyBitmap *bm; @@ -212,6 +218,7 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) QLIST_FOREACH(bitmap, >dirty_bitmaps, list) { assert(!bdrv_dirty_bitmap_frozen(bitmap)); +assert(!bitmap->active_iterators); hbitmap_truncate(bitmap->bitmap, size); bitmap->size = size; } @@ -224,6 +231,7 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bm, *next; QLIST_FOREACH_SAFE(bm, >dirty_bitmaps, list, next) { if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { +assert(!bitmap->active_iterators); assert(!bdrv_dirty_bitmap_frozen(bm)); QLIST_REMOVE(bm, list); hbitmap_free(bm->bitmap); @@ -320,9 +328,29 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); } -void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, + uint64_t
[Qemu-block] [PATCH v4 10/15] block: Add two dirty bitmap getters
For dirty bitmap users to get the size and the name of a BdrvDirtyBitmap. Signed-off-by: Fam ZhengReviewed-by: John Snow --- block/dirty-bitmap.c | 10 ++ include/block/dirty-bitmap.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index a2413c0..50bb9e0 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -154,6 +154,16 @@ void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, hbitmap_reset(bitmap->meta, sector, nb_sectors); } +int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap) +{ +return bitmap->size; +} + +const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap) +{ +return bitmap->name; +} + bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { return bitmap->successor; diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 50e0fca..caa4d82 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -32,6 +32,8 @@ uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs); uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap); bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap); +const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap); +int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap); DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap); int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector); -- 2.4.3
[Qemu-block] [PATCH v4 07/15] HBitmap: Introduce "meta" bitmap to track bit changes
Upon each bit toggle, the corresponding bit in the meta bitmap will be set. Signed-off-by: Fam ZhengReviewed-by: John Snow --- block/dirty-bitmap.c | 2 +- include/qemu/hbitmap.h | 17 + util/hbitmap.c | 66 ++ 3 files changed, 69 insertions(+), 16 deletions(-) diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 16f73b2..0a188f2 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -231,7 +231,7 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bm, *next; QLIST_FOREACH_SAFE(bm, >dirty_bitmaps, list, next) { if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { -assert(!bitmap->active_iterators); +assert(!bm->active_iterators); assert(!bdrv_dirty_bitmap_frozen(bm)); QLIST_REMOVE(bm, list); hbitmap_free(bm->bitmap); diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h index e29188c..f8ed058 100644 --- a/include/qemu/hbitmap.h +++ b/include/qemu/hbitmap.h @@ -178,6 +178,23 @@ void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first); */ unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi); +/* hbitmap_create_meta: + * Create a "meta" hbitmap to track dirtiness of the bits in this HBitmap. + * The caller owns the created bitmap and must call hbitmap_free_meta(hb) to + * free it. + * + * @hb: The HBitmap to operate on. + * @chunk_size: How many bits in @hb does one bit in the meta track. + */ +HBitmap *hbitmap_create_meta(HBitmap *hb, int chunk_size); + +/* hbitmap_free_meta: + * Free the meta bitmap of @hb. + * + * @hb: The HBitmap whose meta bitmap should be freed. + */ +void hbitmap_free_meta(HBitmap *hb); + /** * hbitmap_iter_next: * @hbi: HBitmapIter to operate on. diff --git a/util/hbitmap.c b/util/hbitmap.c index b22b87d..2d3d04c 100644 --- a/util/hbitmap.c +++ b/util/hbitmap.c @@ -79,6 +79,9 @@ struct HBitmap { */ int granularity; +/* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */ +HBitmap *meta; + /* A number of progressively less coarse bitmaps (i.e. level 0 is the * coarsest). Each bit in level N represents a word in level N+1 that * has a set bit, except the last level where each bit represents the @@ -210,25 +213,27 @@ static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last) } /* Setting starts at the last layer and propagates up if an element - * changes from zero to non-zero. + * changes. */ static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last) { unsigned long mask; -bool changed; +unsigned long old; assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); assert(start <= last); mask = 2UL << (last & (BITS_PER_LONG - 1)); mask -= 1UL << (start & (BITS_PER_LONG - 1)); -changed = (*elem == 0); +old = *elem; *elem |= mask; -return changed; +return old != *elem; } -/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */ -static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last) +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... + * Returns true if at least one bit is changed. */ +static bool hb_set_between(HBitmap *hb, int level, uint64_t start, + uint64_t last) { size_t pos = start >> BITS_PER_LEVEL; size_t lastpos = last >> BITS_PER_LEVEL; @@ -257,22 +262,27 @@ static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last if (level > 0 && changed) { hb_set_between(hb, level - 1, pos, lastpos); } +return changed; } void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count) { /* Compute range in the last layer. */ +uint64_t first, n; uint64_t last = start + count - 1; trace_hbitmap_set(hb, start, count, start >> hb->granularity, last >> hb->granularity); -start >>= hb->granularity; +first = start >> hb->granularity; last >>= hb->granularity; -count = last - start + 1; +n = last - first + 1; -hb->count += count - hb_count_between(hb, start, last); -hb_set_between(hb, HBITMAP_LEVELS - 1, start, last); +hb->count += n - hb_count_between(hb, first, last); +if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) && +hb->meta) { +hbitmap_set(hb->meta, start, count); +} } /* Resetting works the other way round: propagate up if the new @@ -293,8 +303,10 @@ static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t l return blanked; } -/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */ -static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t last) +/* The recursive workhorse (the depth is
[Qemu-block] [PATCH v4 08/15] tests: Add test code for meta bitmap
Signed-off-by: Fam ZhengReviewed-by: John Snow --- tests/test-hbitmap.c | 116 +++ 1 file changed, 116 insertions(+) diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c index abe1427..c00c2b5 100644 --- a/tests/test-hbitmap.c +++ b/tests/test-hbitmap.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include #include "qemu/hbitmap.h" +#include "block/block.h" #define LOG_BITS_PER_LONG (BITS_PER_LONG == 32 ? 5 : 6) @@ -21,6 +22,7 @@ typedef struct TestHBitmapData { HBitmap *hb; +HBitmap *meta; unsigned long *bits; size_t size; size_t old_size; @@ -92,6 +94,14 @@ static void hbitmap_test_init(TestHBitmapData *data, } } +static void hbitmap_test_init_meta(TestHBitmapData *data, + uint64_t size, int granularity, + int meta_chunk) +{ +hbitmap_test_init(data, size, granularity); +data->meta = hbitmap_create_meta(data->hb, meta_chunk); +} + static inline size_t hbitmap_test_array_size(size_t bits) { size_t n = (bits + BITS_PER_LONG - 1) / BITS_PER_LONG; @@ -134,6 +144,9 @@ static void hbitmap_test_teardown(TestHBitmapData *data, const void *unused) { if (data->hb) { +if (data->meta) { +hbitmap_free_meta(data->hb); +} hbitmap_free(data->hb); data->hb = NULL; } @@ -635,6 +648,103 @@ static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data, hbitmap_test_truncate(data, size, -diff, 0); } +static void hbitmap_check_meta(TestHBitmapData *data, + int64_t start, int count) +{ +int64_t i; + +for (i = 0; i < data->size; i++) { +if (i >= start && i < start + count) { +g_assert(hbitmap_get(data->meta, i)); +} else { +g_assert(!hbitmap_get(data->meta, i)); +} +} +} + +static void hbitmap_test_meta(TestHBitmapData *data, + int64_t start, int count, + int64_t check_start, int check_count) +{ +hbitmap_reset_all(data->hb); +hbitmap_reset_all(data->meta); + +/* Test "unset" -> "unset" will not update meta. */ +hbitmap_reset(data->hb, start, count); +hbitmap_check_meta(data, 0, 0); + +/* Test "unset" -> "set" will update meta */ +hbitmap_set(data->hb, start, count); +hbitmap_check_meta(data, check_start, check_count); + +/* Test "set" -> "set" will not update meta */ +hbitmap_reset_all(data->meta); +hbitmap_set(data->hb, start, count); +hbitmap_check_meta(data, 0, 0); + +/* Test "set" -> "unset" will update meta */ +hbitmap_reset_all(data->meta); +hbitmap_reset(data->hb, start, count); +hbitmap_check_meta(data, check_start, check_count); +} + +static void hbitmap_test_meta_do(TestHBitmapData *data, int chunk_size) +{ +uint64_t size = chunk_size * 100; +hbitmap_test_init_meta(data, size, 0, chunk_size); + +hbitmap_test_meta(data, 0, 1, 0, chunk_size); +hbitmap_test_meta(data, 0, chunk_size, 0, chunk_size); +hbitmap_test_meta(data, chunk_size - 1, 1, 0, chunk_size); +hbitmap_test_meta(data, chunk_size - 1, 2, 0, chunk_size * 2); +hbitmap_test_meta(data, chunk_size - 1, chunk_size + 1, 0, chunk_size * 2); +hbitmap_test_meta(data, chunk_size - 1, chunk_size + 2, 0, chunk_size * 3); +hbitmap_test_meta(data, 7 * chunk_size - 1, chunk_size + 2, + 6 * chunk_size, chunk_size * 3); +hbitmap_test_meta(data, size - 1, 1, size - chunk_size, chunk_size); +hbitmap_test_meta(data, 0, size, 0, size); +} + +static void test_hbitmap_meta_byte(TestHBitmapData *data, const void *unused) +{ +hbitmap_test_meta_do(data, BITS_PER_BYTE); +} + +static void test_hbitmap_meta_word(TestHBitmapData *data, const void *unused) +{ +hbitmap_test_meta_do(data, BITS_PER_LONG); +} + +static void test_hbitmap_meta_sector(TestHBitmapData *data, const void *unused) +{ +hbitmap_test_meta_do(data, BDRV_SECTOR_SIZE * BITS_PER_BYTE); +} + +/** + * Create an HBitmap and test set/unset. + */ +static void test_hbitmap_meta_one(TestHBitmapData *data, const void *unused) +{ +int i; +int64_t offsets[] = { +0, 1, L1 - 1, L1, L1 + 1, L2 - 1, L2, L2 + 1, L3 - 1, L3, L3 + 1 +}; + +hbitmap_test_init_meta(data, L3 * 2, 0, 1); +for (i = 0; i < ARRAY_SIZE(offsets); i++) { +hbitmap_test_meta(data, offsets[i], 1, offsets[i], 1); +hbitmap_test_meta(data, offsets[i], L1, offsets[i], L1); +hbitmap_test_meta(data, offsets[i], L2, offsets[i], L2); +} +} + +static void test_hbitmap_meta_zero(TestHBitmapData *data, const void *unused) +{ +hbitmap_test_init_meta(data, 0, 0, 1); + +hbitmap_check_meta(data, 0, 0); +} + static void hbitmap_test_add(const char *testpath,
[Qemu-block] [PATCH v4 09/15] block: Support meta dirty bitmap
The added group of operations enables tracking of the changed bits in the dirty bitmap. Signed-off-by: Fam Zheng--- block/dirty-bitmap.c | 52 include/block/dirty-bitmap.h | 9 2 files changed, 61 insertions(+) diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c index 0a188f2..a2413c0 100644 --- a/block/dirty-bitmap.c +++ b/block/dirty-bitmap.c @@ -38,6 +38,7 @@ */ struct BdrvDirtyBitmap { HBitmap *bitmap;/* Dirty sector bitmap implementation */ +HBitmap *meta; /* Meta dirty bitmap */ BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ char *name; /* Optional non-empty unique ID */ int64_t size; /* Size of the bitmap (Number of sectors) */ @@ -103,6 +104,56 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, return bitmap; } +/* bdrv_create_meta_dirty_bitmap + * + * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. I.e. + * when a dirty status bit in @bitmap is changed (either from reset to set or + * the other way around), its respective meta dirty bitmap bit will be marked + * dirty as well. + * + * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap. + * @chunk_size: how many bytes of bitmap data does each bit in the meta bitmap + * track. + */ +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int chunk_size) +{ +assert(!bitmap->meta); +bitmap->meta = hbitmap_create_meta(bitmap->bitmap, + chunk_size * BITS_PER_BYTE); +} + +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ +assert(bitmap->meta); +hbitmap_free_meta(bitmap->bitmap); +bitmap->meta = NULL; +} + +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) +{ +uint64_t i; +int gran = bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS; + +/* To optimize: we can make hbitmap to internally check the range in a + * coarse level, or at least do it word by word. */ +for (i = sector; i < sector + nb_sectors; i += gran) { +if (hbitmap_get(bitmap->meta, i)) { +return true; +} +} +return false; +} + +void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors) +{ +hbitmap_reset(bitmap->meta, sector, nb_sectors); +} + bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) { return bitmap->successor; @@ -233,6 +284,7 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { assert(!bm->active_iterators); assert(!bdrv_dirty_bitmap_frozen(bm)); +assert(!bm->meta); QLIST_REMOVE(bm, list); hbitmap_free(bm->bitmap); g_free(bm->name); diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h index 2ea601b..50e0fca 100644 --- a/include/block/dirty-bitmap.h +++ b/include/block/dirty-bitmap.h @@ -8,6 +8,9 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, Error **errp); +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int chunk_size); +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap); int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, Error **errp); @@ -36,6 +39,12 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector, int nr_sectors); +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors); +void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, int64_t sector, + int nb_sectors); BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, uint64_t first_sector); void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); -- 2.4.3
[Qemu-block] [PATCH v4 03/15] typedefs: Add BdrvDirtyBitmap
Following patches to refactor and move block dirty bitmap code could use this. Signed-off-by: Fam ZhengReviewed-by: John Snow --- include/block/block.h | 1 - include/qemu/typedefs.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/block/block.h b/include/block/block.h index 00827f7..5ee9b8f 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -476,7 +476,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t size); void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, const char *name, diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 9a5ead6..fd039e0 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -10,6 +10,7 @@ typedef struct AddressSpace AddressSpace; typedef struct AioContext AioContext; typedef struct AllwinnerAHCIState AllwinnerAHCIState; typedef struct AudioState AudioState; +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; typedef struct BlockBackend BlockBackend; typedef struct BlockBackendRootState BlockBackendRootState; typedef struct BlockDriverState BlockDriverState; -- 2.4.3
[Qemu-block] [PATCH v4 02/15] block: Include hbitmap.h in block.h
Signed-off-by: Fam ZhengReviewed-by: John Snow --- include/block/block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/block/block.h b/include/block/block.h index 1c4f4d8..00827f7 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -8,6 +8,7 @@ #include "block/accounting.h" #include "qapi/qmp/qobject.h" #include "qapi-types.h" +#include "qemu/hbitmap.h" /* block.c */ typedef struct BlockDriver BlockDriver; @@ -475,7 +476,6 @@ void *qemu_try_blockalign(BlockDriverState *bs, size_t size); void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); -struct HBitmapIter; typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, uint32_t granularity, -- 2.4.3
[Qemu-block] [PATCH v4 05/15] block: Remove unused typedef of BlockDriverDirtyHandler
Signed-off-by: Fam ZhengReviewed-by: John Snow --- include/block/block.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index 3fbd70d..ff1933a 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -322,8 +322,6 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp); /* async block I/O */ -typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector, - int sector_num); BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, BlockCompletionFunc *cb, void *opaque); -- 2.4.3
[Qemu-block] [PATCH v4 00/15] Dirty bitmap changes for migration/persistence work
v4: Rebase. Add rev-by from John in patches 1-5, 7, 8. Remove BdrvDirtyBitmap typedef from dirty-bitmap.h in patch 4. [Max] Add assertion on bm->meta in patch 9. [John] Two major features are added to block dirty bitmap (and underlying HBitmap) in this series: meta bitmap and serialization, together with all other supportive patches. Both operations are common in dirty bitmap migration and persistence: they need to find whether and which part of the dirty bitmap in question has changed with meta dirty bitmap, and they need to write it to the target with serialization. Fam Zheng (13): backup: Use Bitmap to replace "s->bitmap" block: Include hbitmap.h in block.h typedefs: Add BdrvDirtyBitmap block: Move block dirty bitmap code to separate files block: Remove unused typedef of BlockDriverDirtyHandler block: Hide HBitmap in block dirty bitmap interface HBitmap: Introduce "meta" bitmap to track bit changes tests: Add test code for meta bitmap block: Support meta dirty bitmap block: Add two dirty bitmap getters block: Assert that bdrv_release_dirty_bitmap succeeded tests: Add test code for hbitmap serialization block: More operations for meta dirty bitmap Vladimir Sementsov-Ogievskiy (2): hbitmap: serialization block: BdrvDirtyBitmap serialization interface block.c | 360 - block/Makefile.objs | 2 +- block/backup.c | 25 +- block/dirty-bitmap.c | 535 +++ block/mirror.c | 15 +- include/block/block.h| 40 +--- include/block/dirty-bitmap.h | 75 ++ include/qemu/hbitmap.h | 96 include/qemu/typedefs.h | 2 + tests/test-hbitmap.c | 255 + util/hbitmap.c | 203 ++-- 11 files changed, 1177 insertions(+), 431 deletions(-) create mode 100644 block/dirty-bitmap.c create mode 100644 include/block/dirty-bitmap.h -- 2.4.3
[Qemu-block] [PATCH v4 01/15] backup: Use Bitmap to replace "s->bitmap"
"s->bitmap" tracks done sectors, we only check bit states without using any iterator which HBitmap is good for. Switch to "Bitmap" which is simpler and more memory efficient. Meanwhile, rename it to done_bitmap, to reflect the intention. Signed-off-by: Fam ZhengReviewed-by: John Snow --- block/backup.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/backup.c b/block/backup.c index 0f1b1bc..ab3e345 100644 --- a/block/backup.c +++ b/block/backup.c @@ -20,6 +20,7 @@ #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "sysemu/block-backend.h" +#include "qemu/bitmap.h" #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) #define SLICE_TIME 1ULL /* ns */ @@ -42,7 +43,7 @@ typedef struct BackupBlockJob { BlockdevOnError on_target_error; CoRwlock flush_rwlock; uint64_t sectors_read; -HBitmap *bitmap; +unsigned long *done_bitmap; int64_t cluster_size; QLIST_HEAD(, CowRequest) inflight_reqs; } BackupBlockJob; @@ -116,7 +117,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, cow_request_begin(_request, job, start, end); for (; start < end; start++) { -if (hbitmap_get(job->bitmap, start)) { +if (test_bit(start, job->done_bitmap)) { trace_backup_do_cow_skip(job, start); continue; /* already copied */ } @@ -167,7 +168,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, goto out; } -hbitmap_set(job->bitmap, start, 1); +set_bit(start, job->done_bitmap); /* Publish progress, guest I/O counts as progress too. Note that the * offset field is an opaque progress value, it is not a disk offset. @@ -399,7 +400,7 @@ static void coroutine_fn backup_run(void *opaque) start = 0; end = DIV_ROUND_UP(job->common.len, job->cluster_size); -job->bitmap = hbitmap_alloc(end, 0); +job->done_bitmap = bitmap_new(end); bdrv_set_enable_write_cache(target, true); if (target->blk) { @@ -480,7 +481,7 @@ static void coroutine_fn backup_run(void *opaque) /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(>flush_rwlock); qemu_co_rwlock_unlock(>flush_rwlock); -hbitmap_free(job->bitmap); +g_free(job->done_bitmap); if (target->blk) { blk_iostatus_disable(target->blk); -- 2.4.3
Re: [Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE
On Mon, Mar 07, 2016 at 01:27:38PM -0500, Jeff Cody wrote: > On Mon, Mar 07, 2016 at 07:04:15PM +0100, Niels de Vos wrote: > > GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes > > it possible to detect sparse areas in files. > > > > Signed-off-by: Niels de Vos> > > > -- > > Tested by compiling and running "qemu-img map gluster://..." with a > > build of the current master branch of glusterfs. Using a Fedora > > cloud image (in raw format) shows many SEEK procudure calls going back > > and forth over the network. The output of "qemu map" matches the output > > when run against the image on the local filesystem. > > --- > > block/gluster.c | 159 > > > > configure | 25 + > > 2 files changed, 184 insertions(+) > > > > diff --git a/block/gluster.c b/block/gluster.c > > index 65077a0..1430010 100644 > > --- a/block/gluster.c > > +++ b/block/gluster.c > > @@ -677,6 +677,153 @@ static int > > qemu_gluster_has_zero_init(BlockDriverState *bs) > > return 0; > > } > > > > +#ifdef CONFIG_GLUSTERFS_SEEK_DATA > > Why do we need to make this a compile-time option? Version checking > is problematic; for instance, different distributions may have > backported bug fixes / features, that are not reflected by the > reported version number, etc.. Ideally, we can determine > functionality during runtime, and behave accordingly. This will not get backported to older Gluster versions, it required a protocol change. > If SEEK_DATA and SEEK_HOLE are not supported, > qemu_gluster_co_get_block_status can return that sectors are all > allocated (which is what happens in block/io.c anyway if the driver > doesn't support the function). Ok, good to know. > As long as glfs_lseek() will return error (e.g. EINVAL) for an invalid > whence value, we can handle it runtime. Does glfs_lseek() behave > sanely? Unfortunately older versions of libgfapi do not return EINVAL when SEEK_DATA/HOLE is used. It is something we'll need to fix in the stable releases. We can not assume that all users have installed a version of the library that handles SEEK_DATA/HOLE correctly (return EINVAL) when there is no support in the network protocol or on the server. To be sure that we don't get some undefined behaviour, the compile time check is needed. Niels > > +/* > > + * Find allocation range in @bs around offset @start. > > + * May change underlying file descriptor's file offset. > > + * If @start is not in a hole, store @start in @data, and the > > + * beginning of the next hole in @hole, and return 0. > > + * If @start is in a non-trailing hole, store @start in @hole and the > > + * beginning of the next non-hole in @data, and return 0. > > + * If @start is in a trailing hole or beyond EOF, return -ENXIO. > > + * If we can't find out, return a negative errno other than -ENXIO. > > + * > > + * (Shamefully copied from raw-posix.c, only miniscule adaptions.) > > + */ > > +static int find_allocation(BlockDriverState *bs, off_t start, > > + off_t *data, off_t *hole) > > +{ > > +BDRVGlusterState *s = bs->opaque; > > +off_t offs; > > + > > +/* > > + * SEEK_DATA cases: > > + * D1. offs == start: start is in data > > + * D2. offs > start: start is in a hole, next data at offs > > + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole > > + * or start is beyond EOF > > + * If the latter happens, the file has been truncated behind > > + * our back since we opened it. All bets are off then. > > + * Treating like a trailing hole is simplest. > > + * D4. offs < 0, errno != ENXIO: we learned nothing > > + */ > > +offs = glfs_lseek(s->fd, start, SEEK_DATA); > > +if (offs < 0) { > > +return -errno; /* D3 or D4 */ > > +} > > +assert(offs >= start); > > + > > +if (offs > start) { > > +/* D2: in hole, next data at offs */ > > +*hole = start; > > +*data = offs; > > +return 0; > > +} > > + > > +/* D1: in data, end not yet known */ > > + > > +/* > > + * SEEK_HOLE cases: > > + * H1. offs == start: start is in a hole > > + * If this happens here, a hole has been dug behind our back > > + * since the previous lseek(). > > + * H2. offs > start: either start is in data, next hole at offs, > > + * or start is in trailing hole, EOF at offs > > + * Linux treats trailing holes like any other hole: offs == > > + * start. Solaris seeks to EOF instead: offs > start (blech). > > + * If that happens here, a hole has been dug behind our back > > + * since the previous lseek(). > > + * H3. offs < 0, errno = ENXIO: start is beyond EOF > > + * If this happens, the file has been truncated behind our > > + * back since we opened it. Treat it like a
Re: [Qemu-block] [Qemu-devel] [PATCH v15 8/9] Implement new driver for block replication
On 03/05/2016 01:53 AM, Stefan Hajnoczi wrote: On Fri, Feb 05, 2016 at 12:18:07PM +0800, Changlong Xie wrote: +static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp) +{ +Error *local_err = NULL; +int ret; + +if (!s->secondary_disk->bs->job) { +error_setg(errp, "Backup job was cancelled unexpectedly"); +return; +} Do you have test cases for the replication driver? This error code path seems like something that should be exercised to make sure it works. Basic start/checkpoint/stop and checks that the active/secondary/hidden disks contain the expected data are also important. Probably the easiest way to do this would be a file called tests/test-replication.c that calls start/checkpoint/stop and read/write functions directly instead of running a guest. Will do that in next version. Thanks -Xie
Re: [Qemu-block] [Qemu-devel] [PATCH v15 8/9] Implement new driver for block replication
On 03/05/2016 01:39 AM, Stefan Hajnoczi wrote: On Fri, Feb 05, 2016 at 12:18:07PM +0800, Changlong Xie wrote: +static void replication_start(ReplicationState *rs, ReplicationMode mode, + Error **errp) +{ +BlockDriverState *bs = rs->opaque; +BDRVReplicationState *s = bs->opaque; +int64_t active_length, hidden_length, disk_length; +AioContext *aio_context; +Error *local_err = NULL; + +if (s->replication_state != BLOCK_REPLICATION_NONE) { Dereferencing bs and s is not thread-safe since the AioContext isn't acquired here. Unless you have other locking rules, the default is that everything in BlockDriverState and bs->opaque must be accessed with the AioContext acquired. Please apply this comment to the rest of the patch series, too. Functions that are not part of BlockDriver are probably called outside the AioContext and must acquire it before they are allowed to access anything else in bs. Will do implement in replication_start/stop/checkpoint/get_error callbacks. +s->top_bs = get_top_bs(bs); +if (!s->top_bs) { +error_setg(errp, "Cannot get the top block driver state to do" + " internal backup"); +return; +} Traversing the BDS graph using the parent pointer is not safe - you cannot stash the parent pointer because it changes if a parent node is inserted/removed. I suggest passing the drive name as an argument so that bdrv_lookup_bs() can be used when installing the op blocker. Then the BdrvChild.parent pointer patch and get_top_bs() can be deleted. ok Jeff Cody is currently working on a new op blocker system. Hopefully this system will allow you to install blockers on bs instead of on the drive's top BDS. But let's not worry about that for now and just use the drive name as an argument. Good news. Thanks +/* + * Must protect backup target if backup job was stopped/cancelled + * unexpectedly + */ +bdrv_ref(s->hidden_disk->bs); Where is the matching bdrv_unref() call?
Re: [Qemu-block] [PATCH v15 7/9] Introduce new APIs to do replication operation
On 03/05/2016 12:13 AM, Stefan Hajnoczi wrote: On Fri, Feb 05, 2016 at 12:18:06PM +0800, Changlong Xie wrote: diff --git a/replication.h b/replication.h new file mode 100644 index 000..faea649 --- /dev/null +++ b/replication.h @@ -0,0 +1,53 @@ +/* + * Replication filter + * + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 Intel Corporation + * Copyright (c) 2016 FUJITSU LIMITED + * + * Author: + * Wen Congyang+ * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef REPLICATION_H +#define REPLICATION_H + +#include "sysemu/sysemu.h" + +typedef struct ReplicationOps ReplicationOps; +typedef struct ReplicationState ReplicationState; +typedef void (*Start)(ReplicationState *rs, ReplicationMode mode, Error **errp); +typedef void (*Stop)(ReplicationState *rs, bool failover, Error **errp); +typedef void (*Checkpoint)(ReplicationState *rs, Error **errp); +typedef void (*GetError)(ReplicationState *rs, Error **errp); Hi Stefan Thanks for your comments. These typedefs are likely to collide with system headers. Please prefix the name. That said, I don't think Start, Stop, Checkpoint, GetError are necessary. Just declare the prototypes in struct ReplicationOps. No user passes single function pointers, they always pass ReplicationOps. Therefore it's not necessary to declare types for these functions at all. Will try that. + +struct ReplicationState { +void *opaque; +ReplicationOps *ops; +QLIST_ENTRY(ReplicationState) node; +}; + +struct ReplicationOps{ +Start start; +Checkpoint checkpoint; +GetError get_error; +Stop stop; +}; Please add doc comments that explain the semantics of these functions. For examples, see include/qom/object.h. This documentation should allow someone implementing a new replication listener to understand the constraints and the purpose of these functions. Surely Thanks -Xie
Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()
On 03/08/2016 12:02 AM, Max Reitz wrote: On 07.03.2016 17:02, Eric Blake wrote: On 03/05/2016 11:13 AM, Max Reitz wrote: +index = atoi(child->name + 9); Optional: Assert absence of an error: Indeed, atoi() is worthless, because it cannot do error detection. unsigned long index; char *endptr; index = strtoul(child->name + 9, , 10); assert(index >= 0 && !*endptr); Still incorrect; you aren't handling errno properly for detecting all errors. Even better is to use qemu_strtoul(), which already handles proper error detection. Yeah, I keep forgetting that it returns ULONG_MAX on range error... Yes, we should limit the range to INT_MAX. How do you like the following codes, i just steal it from xen_host_pci_get_value(). int rc; const char *endptr; unsigned long value; assert(!strncmp(child->name, "children.", 9)); rc = qemu_strtoul(child->name + 9, , 10, ); if (!rc) { assert(value <= INT_MAX); index = value; } else { error_setg_errno(errp, -rc, "Failed to parse value '%s'", child->name + 9); return; } Thanks -Xie Max
Re: [Qemu-block] [PATCH v3 04/15] block: Move block dirty bitmap code to separate files
On Mon, 03/07 19:49, Max Reitz wrote: > On 27.02.2016 10:20, Fam Zheng wrote: > > The only code change is making bdrv_dirty_bitmap_truncate public. It is > > used in block.c. > > > > Also two long lines (bdrv_get_dirty) are wrapped. > > > > Signed-off-by: Fam Zheng> > Reviewed-by: John Snow > > --- > > block.c | 360 > > block/Makefile.objs | 2 +- > > block/dirty-bitmap.c | 387 > > +++ > > include/block/block.h| 35 +--- > > include/block/dirty-bitmap.h | 45 + > > 5 files changed, 434 insertions(+), 395 deletions(-) > > create mode 100644 block/dirty-bitmap.c > > create mode 100644 include/block/dirty-bitmap.h > > [...] > > > diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h > > new file mode 100644 > > index 000..45c6046 > > --- /dev/null > > +++ b/include/block/dirty-bitmap.h > > @@ -0,0 +1,45 @@ > > +#ifndef BLOCK_DIRTY_BITMAP_H > > +#define BLOCK_DIRTY_BITMAP_H > > + > > +#include "qemu-common.h" > > +#include "qemu/hbitmap.h" > > + > > +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; > > Doesn't patch 3 make this superfluous? > > If so, I can remove it for you. > Removing while rebasing. Fam
Re: [Qemu-block] [PATCH v3 09/15] block: Support meta dirty bitmap
On Mon, 02/29 18:00, John Snow wrote: > > > On 02/27/2016 04:20 AM, Fam Zheng wrote: > > The added group of operations enables tracking of the changed bits in > > the dirty bitmap. > > > > Signed-off-by: Fam Zheng> > --- > > block/dirty-bitmap.c | 51 > > > > include/block/dirty-bitmap.h | 9 > > 2 files changed, 60 insertions(+) > > > > diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c > > index 16f73b2..5f19320 100644 > > --- a/block/dirty-bitmap.c > > +++ b/block/dirty-bitmap.c > > @@ -38,6 +38,7 @@ > > */ > > struct BdrvDirtyBitmap { > > HBitmap *bitmap;/* Dirty sector bitmap implementation */ > > +HBitmap *meta; /* Meta dirty bitmap */ > > BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status > > */ > > char *name; /* Optional non-empty unique ID */ > > int64_t size; /* Size of the bitmap (Number of sectors) > > */ > > @@ -103,6 +104,56 @@ BdrvDirtyBitmap > > *bdrv_create_dirty_bitmap(BlockDriverState *bs, > > return bitmap; > > } > > > > +/* bdrv_create_meta_dirty_bitmap > > + * > > + * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. > > I.e. > > + * when a dirty status bit in @bitmap is changed (either from reset to set > > or > > + * the other way around), its respective meta dirty bitmap bit will be > > marked > > + * dirty as well. > > + * > > + * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap. > > + * @chunk_size: how many bytes of bitmap data does each bit in the meta > > bitmap > > + * track. > > + */ > > +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, > > + int chunk_size) > > +{ > > +assert(!bitmap->meta); > > +bitmap->meta = hbitmap_create_meta(bitmap->bitmap, > > + chunk_size * BITS_PER_BYTE); > > +} > > + > > +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap) > > +{ > > +assert(bitmap->meta); > > +hbitmap_free_meta(bitmap->bitmap); > > +bitmap->meta = NULL; > > +} > > + > > +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, > > + BdrvDirtyBitmap *bitmap, int64_t sector, > > + int nb_sectors) > > +{ > > +uint64_t i; > > +int gran = bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS; > > + > > +/* To optimize: we can make hbitmap to internally check the range in a > > + * coarse level, or at least do it word by word. */ > > +for (i = sector; i < sector + nb_sectors; i += gran) { > > +if (hbitmap_get(bitmap->meta, i)) { > > +return true; > > +} > > +} > > +return false; > > +} > > + > > +void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, > > + BdrvDirtyBitmap *bitmap, int64_t sector, > > + int nb_sectors) > > +{ > > +hbitmap_reset(bitmap->meta, sector, nb_sectors); > > +} > > + > > bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) > > { > > return bitmap->successor; > > diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h > > index e1dbd8e..3b27742 100644 > > --- a/include/block/dirty-bitmap.h > > +++ b/include/block/dirty-bitmap.h > > @@ -9,6 +9,9 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState > > *bs, > >uint32_t granularity, > >const char *name, > >Error **errp); > > +void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap, > > + int chunk_size); > > +void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap); > > int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, > > BdrvDirtyBitmap *bitmap, > > Error **errp); > > @@ -37,6 +40,12 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, > > int64_t cur_sector, int nr_sectors); > > void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, > > int64_t cur_sector, int nr_sectors); > > +int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs, > > + BdrvDirtyBitmap *bitmap, int64_t sector, > > + int nb_sectors); > > +void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs, > > + BdrvDirtyBitmap *bitmap, int64_t sector, > > + int nb_sectors); > > BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap, > > uint64_t first_sector); > > void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); > > > > Do we need to amend the teardown in
Re: [Qemu-block] [PATCH v3 06/15] block: Hide HBitmap in block dirty bitmap interface
On Mon, 03/07 20:11, Max Reitz wrote: > On 27.02.2016 10:20, Fam Zheng wrote: > > HBitmap is an implementation detail of block dirty bitmap that should be > > hidden > > from users. Introduce a BdrvDirtyBitmapIter to encapsulate the underlying > > HBitmapIter. > > > > A small difference in the interface is, before, an HBitmapIter is > > initialized > > in place, now the new BdrvDirtyBitmapIter must be dynamically allocated > > because > > the structure definition is in block.c. > > > > Two current users are converted too. > > > > Signed-off-by: Fam Zheng> > Reviewed-by: John Snow > > --- > > block/backup.c | 14 -- > > block/dirty-bitmap.c | 39 +-- > > block/mirror.c | 14 -- > > include/block/dirty-bitmap.h | 7 +-- > > include/qemu/typedefs.h | 1 + > > 5 files changed, 55 insertions(+), 20 deletions(-) > > I tried my best at fixing up the rebase conflicts, but block/mirror.c > has just changed too much ("mirror: Rewrite mirror_iteration") to be > able to pretend these fixes are trivial. Therefore, I'm afraid this > patch will need a rebase. Doing it now. Thanks! Fam
Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()
On 03/08/2016 12:02 AM, Eric Blake wrote: On 03/05/2016 11:13 AM, Max Reitz wrote: +index = atoi(child->name + 9); Optional: Assert absence of an error: Indeed, atoi() is worthless, because it cannot do error detection. unsigned long index; char *endptr; index = strtoul(child->name + 9, , 10); assert(index >= 0 && !*endptr); Still incorrect; you aren't handling errno properly for detecting all errors. Even better is to use qemu_strtoul(), which already handles proper error detection. Will fix this in next version, thanks for pointing it out. Thanks -Xie
Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain
On 07/03/2016 21:56, Stefan Hajnoczi wrote: > I think the timer concept itself is troublesome. A radical approach but > limited to changing QED itself is to drop the timer and instead keep a > timestamp for the last allocating write request. Next time a write > request (allocating or rewriting) is about to complete, do the flush and > clear the need check flag as part of the write request (if 5 seconds > have passed since the timestamp). bdrv_qed_drain should be easy to fix in my new drain implementation, which is based on draining the parent before the BdrvChild-ren. It's just troublesome in the current one which alternates flushing and draining. I would just revert the patch that introduced bdrv_qed_drain for now, and reintroduce it later (note however that something was messed up in commit df9a681, "qed: Implement .bdrv_drain", 2015-11-12, because it includes some dirty bitmap stuff). Or perhaps the idea of making QED read-only has some merit... Paolo
Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain
By the way, I'll send a patch on Tuesday showing the timerless need check mechanism. It's a little too late tonight to start it. Stefan signature.asc Description: PGP signature
Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain
On Mon, Mar 07, 2016 at 05:57:41PM +0100, Kevin Wolf wrote: > Am 23.02.2016 um 14:54 hat Paolo Bonzini geschrieben: > > > > > > On 23/02/2016 13:49, Fam Zheng wrote: > > > On Tue, 02/23 11:43, Paolo Bonzini wrote: > > >> > > >> > > >> On 23/02/2016 06:57, Fam Zheng wrote: > > >> +qed_cancel_need_check_timer(s); > > >> +qed_need_check_timer_cb(s); > > >> +} > > > > > > What if an allocating write is queued (the else branch case)? Its > > > completion > > > will be in bdrv_drain and it could arm the need_check_timer which is > > > wrong. > > > > > > We need to drain the allocating_write_reqs queue before checking the > > > timer. > > > > You're right, but how? That's what bdrv_drain(bs) does, it's a > > chicken-and-egg problem. > > >>> > > >>> Maybe use an aio_poll loop before the if? > > >> > > >> That would not change the fact that you're reimplementing bdrv_drain > > >> inside bdrv_qed_drain. > > > > > > But it fulfills the contract of .bdrv_drain. This is the easy way, the > > > hard way > > > would be iterating through the allocating_write_reqs list and process > > > reqs one > > > by one synchronously, which still involves aio_poll indirectly. > > > > The easy way would be better then. > > > > Stefan, any second opinion? > > What's the status here? It would be good to have qed not segfaulting all > the time. I think the timer concept itself is troublesome. A radical approach but limited to changing QED itself is to drop the timer and instead keep a timestamp for the last allocating write request. Next time a write request (allocating or rewriting) is about to complete, do the flush and clear the need check flag as part of the write request (if 5 seconds have passed since the timestamp). Because the need check flag clearing is part of the write request completion, it will integrate properly with drain. Stefan signature.asc Description: PGP signature
Re: [Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE
[adding qemu-devel; ALL patches must cc qemu-devel even when sent to another list] On 03/07/2016 11:04 AM, Niels de Vos wrote: > GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes > it possible to detect sparse areas in files. > > Signed-off-by: Niels de Vos> > -- > Tested by compiling and running "qemu-img map gluster://..." with a > build of the current master branch of glusterfs. Using a Fedora > cloud image (in raw format) shows many SEEK procudure calls going back > and forth over the network. The output of "qemu map" matches the output > when run against the image on the local filesystem. > --- > block/gluster.c | 159 > > configure | 25 + > 2 files changed, 184 insertions(+) > > diff --git a/block/gluster.c b/block/gluster.c > index 65077a0..1430010 100644 > --- a/block/gluster.c > +++ b/block/gluster.c > @@ -677,6 +677,153 @@ static int qemu_gluster_has_zero_init(BlockDriverState > *bs) > return 0; > } > > +#ifdef CONFIG_GLUSTERFS_SEEK_DATA > +/* > + * Find allocation range in @bs around offset @start. > + * May change underlying file descriptor's file offset. > + * If @start is not in a hole, store @start in @data, and the > + * beginning of the next hole in @hole, and return 0. > + * If @start is in a non-trailing hole, store @start in @hole and the > + * beginning of the next non-hole in @data, and return 0. > + * If @start is in a trailing hole or beyond EOF, return -ENXIO. > + * If we can't find out, return a negative errno other than -ENXIO. > + * > + * (Shamefully copied from raw-posix.c, only miniscule adaptions.) > + */ > +static int find_allocation(BlockDriverState *bs, off_t start, > + off_t *data, off_t *hole) > +{ > +BDRVGlusterState *s = bs->opaque; > +off_t offs; > + > +/* > + * SEEK_DATA cases: > + * D1. offs == start: start is in data > + * D2. offs > start: start is in a hole, next data at offs > + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole > + * or start is beyond EOF > + * If the latter happens, the file has been truncated behind > + * our back since we opened it. All bets are off then. > + * Treating like a trailing hole is simplest. > + * D4. offs < 0, errno != ENXIO: we learned nothing > + */ > +offs = glfs_lseek(s->fd, start, SEEK_DATA); > +if (offs < 0) { > +return -errno; /* D3 or D4 */ > +} > +assert(offs >= start); > + > +if (offs > start) { > +/* D2: in hole, next data at offs */ > +*hole = start; > +*data = offs; > +return 0; > +} > + > +/* D1: in data, end not yet known */ > + > +/* > + * SEEK_HOLE cases: > + * H1. offs == start: start is in a hole > + * If this happens here, a hole has been dug behind our back > + * since the previous lseek(). > + * H2. offs > start: either start is in data, next hole at offs, > + * or start is in trailing hole, EOF at offs > + * Linux treats trailing holes like any other hole: offs == > + * start. Solaris seeks to EOF instead: offs > start (blech). > + * If that happens here, a hole has been dug behind our back > + * since the previous lseek(). > + * H3. offs < 0, errno = ENXIO: start is beyond EOF > + * If this happens, the file has been truncated behind our > + * back since we opened it. Treat it like a trailing hole. > + * H4. offs < 0, errno != ENXIO: we learned nothing > + * Pretend we know nothing at all, i.e. "forget" about D1. > + */ > +offs = glfs_lseek(s->fd, start, SEEK_HOLE); > +if (offs < 0) { > +return -errno; /* D1 and (H3 or H4) */ > +} > +assert(offs >= start); > + > +if (offs > start) { > +/* > + * D1 and H2: either in data, next hole at offs, or it was in > + * data but is now in a trailing hole. In the latter case, > + * all bets are off. Treating it as if it there was data all > + * the way to EOF is safe, so simply do that. > + */ > +*data = start; > +*hole = offs; > +return 0; > +} > + > +/* D1 and H1 */ > +return -EBUSY; > +} > + > +/* > + * Returns the allocation status of the specified sectors. > + * > + * If 'sector_num' is beyond the end of the disk image the return value is 0 > + * and 'pnum' is set to 0. > + * > + * 'pnum' is set to the number of sectors (including and immediately > following > + * the specified sector) that are known to be in the same > + * allocated/unallocated state. > + * > + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes > + * beyond the end of the disk image it will be clamped. > + * > + * (Based on raw_co_get_block_status() from raw-posix.c.) > + */ > +static
Re: [Qemu-block] [PATCH v3 06/15] block: Hide HBitmap in block dirty bitmap interface
On 27.02.2016 10:20, Fam Zheng wrote: > HBitmap is an implementation detail of block dirty bitmap that should be > hidden > from users. Introduce a BdrvDirtyBitmapIter to encapsulate the underlying > HBitmapIter. > > A small difference in the interface is, before, an HBitmapIter is initialized > in place, now the new BdrvDirtyBitmapIter must be dynamically allocated > because > the structure definition is in block.c. > > Two current users are converted too. > > Signed-off-by: Fam Zheng> Reviewed-by: John Snow > --- > block/backup.c | 14 -- > block/dirty-bitmap.c | 39 +-- > block/mirror.c | 14 -- > include/block/dirty-bitmap.h | 7 +-- > include/qemu/typedefs.h | 1 + > 5 files changed, 55 insertions(+), 20 deletions(-) I tried my best at fixing up the rebase conflicts, but block/mirror.c has just changed too much ("mirror: Rewrite mirror_iteration") to be able to pretend these fixes are trivial. Therefore, I'm afraid this patch will need a rebase. Max signature.asc Description: OpenPGP digital signature
Re: [Qemu-block] strange crash in tracked_request_begin
On 03/07/2016 06:01 PM, Stefan Hajnoczi wrote: > On Mon, Mar 07, 2016 at 01:29:08PM +0100, Christian Borntraeger wrote: >> Folks, >> >> I had a crash of a qemu guest in tracked_request_begin. >> The testcase was a guest with ramdisk/kernel that reboots in a >> loop. (about 10 times per second) with a single null-co disk >> attached. No idea how to reproduce this, seems to be a lucky hit. >> >> (gdb) bt >> #0 0x101db5ba in tracked_request_begin >> (req=req@entry=0x3ff90f1bdc0, bs=bs@entry=0x42a39190, offset=offset@entry=0, >> bytes=bytes@entry=4096, type=type@entry=BDRV_TRACKED_READ) >> at /home/cborntra/REPOS/qemu/block/io.c:390 >> #1 0x101de91e in bdrv_co_do_preadv (bs=0x42a39190, offset=0, >> bytes=4096, qiov=0x3ff7400cbd8, flags=, flags@entry=(unknown: >> 0)) >> at /home/cborntra/REPOS/qemu/block/io.c:1001 >> #2 0x101dfc3e in bdrv_co_do_readv (flags=(unknown: 0), >> qiov=, nb_sectors=, sector_num=> out>, bs=) >> at /home/cborntra/REPOS/qemu/block/io.c:1024 >> #3 bdrv_co_do_rw (opaque=0x3ff7400e370) at >> /home/cborntra/REPOS/qemu/block/io.c:2173 >> #4 0x1022d8f6 in coroutine_trampoline (i0=, >> i1=-1946150928) at /home/cborntra/REPOS/qemu/util/coroutine-ucontext.c:79 >> #5 0x03ff95ed150a in __makecontext_ret () from /lib64/libc.so.6 >> >> looking at the code we are at >> >> QLIST_INSERT_HEAD(>tracked_requests, req, list); >> which translates to >> >> if (((req)->list.le_next = (>tracked_requests)->lh_first) != NULL) >> (>tracked_requests)->lh_first->list.le_prev = &(req)->list.le_next; >> (>tracked_requests)->lh_first = (req); >> (req)->list.le_prev = &(>tracked_requests)->lh_first; >> >> gdb says, that (>tracked_requests)->lh_first) is zero in the corefile >> (gdb) print /x bs->tracked_requests >> $6 = {lh_first = 0x0} >> >> Now looking at the code I am asking myself if this can happen in parallel >> to another code that touches tracked_requests, because gcc seems to read >> >tracked_requests)->lh_first twice (first to check the value, then >> to use it as pointer) > > tracked_requests is protected by AioContext. Perhaps something is doing > I/O without acquiring AioContext? Hmm, the guest was rebooting, which resets all devices. Maybe something in that code is still not right? I will have a look. > > Luckily there is only 1 place where items are added and removed from > tracked_requests. This might make debugging somewhat easier. I have trouble reproducing the issue, which makes it hard :-/ >> >> 388 qemu_co_queue_init(>wait_queue); >>0x101db594 <+76>: la %r2,72(%r13) >>0x101db598 <+80>: brasl %r14,0x1022cdc0 >> >> 389 >> 390 QLIST_INSERT_HEAD(>tracked_requests, req, list); >>0x101db59e <+86>: lg %r1,12744(%r12) # r1 = >> (>tracked_requests)->lh_first) >>0x101db5a4 <+92>: stg %r1,48(%r13)# >> (req)->list.le_next = r1 >>0x101db5aa <+98>: cgij%r1,0,8,0x101db5c0 ---+ # if r1==0 goto >>0x101db5b0 <+104>:lg %r1,12744(%r12) | # r1 = >> (>tracked_requests)->lh_first) (again!!) >>0x101db5b6 <+110>:la %r2,48(%r13) | >> => 0x101db5ba <+114>:stg %r2,56(%r1) | # r1==0 >> bang >>0x101db5c0 <+120>:stg %r13,12744(%r12)<-+ >>0x101db5c6 <+126>:lay %r12,12744(%r12) >>0x101db5cc <+132>:stg %r12,56(%r13) >> >> >> Christian >>
Re: [Qemu-block] [PATCH v3 04/15] block: Move block dirty bitmap code to separate files
On 27.02.2016 10:20, Fam Zheng wrote: > The only code change is making bdrv_dirty_bitmap_truncate public. It is > used in block.c. > > Also two long lines (bdrv_get_dirty) are wrapped. > > Signed-off-by: Fam Zheng> Reviewed-by: John Snow > --- > block.c | 360 > block/Makefile.objs | 2 +- > block/dirty-bitmap.c | 387 > +++ > include/block/block.h| 35 +--- > include/block/dirty-bitmap.h | 45 + > 5 files changed, 434 insertions(+), 395 deletions(-) > create mode 100644 block/dirty-bitmap.c > create mode 100644 include/block/dirty-bitmap.h [...] > diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h > new file mode 100644 > index 000..45c6046 > --- /dev/null > +++ b/include/block/dirty-bitmap.h > @@ -0,0 +1,45 @@ > +#ifndef BLOCK_DIRTY_BITMAP_H > +#define BLOCK_DIRTY_BITMAP_H > + > +#include "qemu-common.h" > +#include "qemu/hbitmap.h" > + > +typedef struct BdrvDirtyBitmap BdrvDirtyBitmap; Doesn't patch 3 make this superfluous? If so, I can remove it for you. Max signature.asc Description: OpenPGP digital signature
Re: [Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE
On Mon, Mar 07, 2016 at 07:04:15PM +0100, Niels de Vos wrote: > GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes > it possible to detect sparse areas in files. > > Signed-off-by: Niels de Vos> > -- > Tested by compiling and running "qemu-img map gluster://..." with a > build of the current master branch of glusterfs. Using a Fedora > cloud image (in raw format) shows many SEEK procudure calls going back > and forth over the network. The output of "qemu map" matches the output > when run against the image on the local filesystem. > --- > block/gluster.c | 159 > > configure | 25 + > 2 files changed, 184 insertions(+) > > diff --git a/block/gluster.c b/block/gluster.c > index 65077a0..1430010 100644 > --- a/block/gluster.c > +++ b/block/gluster.c > @@ -677,6 +677,153 @@ static int qemu_gluster_has_zero_init(BlockDriverState > *bs) > return 0; > } > > +#ifdef CONFIG_GLUSTERFS_SEEK_DATA Why do we need to make this a compile-time option? Version checking is problematic; for instance, different distributions may have backported bug fixes / features, that are not reflected by the reported version number, etc.. Ideally, we can determine functionality during runtime, and behave accordingly. If SEEK_DATA and SEEK_HOLE are not supported, qemu_gluster_co_get_block_status can return that sectors are all allocated (which is what happens in block/io.c anyway if the driver doesn't support the function). As long as glfs_lseek() will return error (e.g. EINVAL) for an invalid whence value, we can handle it runtime. Does glfs_lseek() behave sanely? > +/* > + * Find allocation range in @bs around offset @start. > + * May change underlying file descriptor's file offset. > + * If @start is not in a hole, store @start in @data, and the > + * beginning of the next hole in @hole, and return 0. > + * If @start is in a non-trailing hole, store @start in @hole and the > + * beginning of the next non-hole in @data, and return 0. > + * If @start is in a trailing hole or beyond EOF, return -ENXIO. > + * If we can't find out, return a negative errno other than -ENXIO. > + * > + * (Shamefully copied from raw-posix.c, only miniscule adaptions.) > + */ > +static int find_allocation(BlockDriverState *bs, off_t start, > + off_t *data, off_t *hole) > +{ > +BDRVGlusterState *s = bs->opaque; > +off_t offs; > + > +/* > + * SEEK_DATA cases: > + * D1. offs == start: start is in data > + * D2. offs > start: start is in a hole, next data at offs > + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole > + * or start is beyond EOF > + * If the latter happens, the file has been truncated behind > + * our back since we opened it. All bets are off then. > + * Treating like a trailing hole is simplest. > + * D4. offs < 0, errno != ENXIO: we learned nothing > + */ > +offs = glfs_lseek(s->fd, start, SEEK_DATA); > +if (offs < 0) { > +return -errno; /* D3 or D4 */ > +} > +assert(offs >= start); > + > +if (offs > start) { > +/* D2: in hole, next data at offs */ > +*hole = start; > +*data = offs; > +return 0; > +} > + > +/* D1: in data, end not yet known */ > + > +/* > + * SEEK_HOLE cases: > + * H1. offs == start: start is in a hole > + * If this happens here, a hole has been dug behind our back > + * since the previous lseek(). > + * H2. offs > start: either start is in data, next hole at offs, > + * or start is in trailing hole, EOF at offs > + * Linux treats trailing holes like any other hole: offs == > + * start. Solaris seeks to EOF instead: offs > start (blech). > + * If that happens here, a hole has been dug behind our back > + * since the previous lseek(). > + * H3. offs < 0, errno = ENXIO: start is beyond EOF > + * If this happens, the file has been truncated behind our > + * back since we opened it. Treat it like a trailing hole. > + * H4. offs < 0, errno != ENXIO: we learned nothing > + * Pretend we know nothing at all, i.e. "forget" about D1. > + */ > +offs = glfs_lseek(s->fd, start, SEEK_HOLE); > +if (offs < 0) { > +return -errno; /* D1 and (H3 or H4) */ > +} > +assert(offs >= start); > + > +if (offs > start) { > +/* > + * D1 and H2: either in data, next hole at offs, or it was in > + * data but is now in a trailing hole. In the latter case, > + * all bets are off. Treating it as if it there was data all > + * the way to EOF is safe, so simply do that. > + */ > +*data = start; > +*hole = offs; > +return 0; > +} > + > +/* D1 and H1 */ > +return -EBUSY; > +} > + >
Re: [Qemu-block] [PATCH] block: Fix snapshot=on cache modes
On 07.03.2016 13:26, Kevin Wolf wrote: > Since commit 91a097e, we end up with a somewhat weird cache mode > configuration with snapshot=on: The commit broke the cache mode > inheritance for the snapshot overlay so that it is opened as > writethrough instead of unsafe now. The following bdrv_append() call to > put it on top of the tree swaps the WCE flag with the snapshot's backing > file (i.e. the originally given file), so what we eventually get is > cache=writeback on the temporary overlay and > cache=writethrough,cache.no-flush=on on the real image file. > > This patch changes things so that the temporary overlay gets > cache=unsafe again like it used to, and the real images get whatever the > user specified. This means that cache.direct is now respected even with > snapshot=on, and in the case of committing changes, the final flush is > no longer ignored except explicitly requested by the user. > > Signed-off-by: Kevin Wolf> --- > block.c | 34 -- > blockdev.c| 7 --- > include/block/block.h | 1 - > 3 files changed, 24 insertions(+), 18 deletions(-) > > diff --git a/block.c b/block.c > index ba24b8e..e3fe8ed 100644 > --- a/block.c > +++ b/block.c > @@ -687,13 +687,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) > } > > /* > - * Returns the flags that a temporary snapshot should get, based on the > - * originally requested flags (the originally requested image will have flags > - * like a backing file) > + * Returns the options and flags that a temporary snapshot should get, based > on > + * the originally requested flags (the originally requested image will have > + * flags like a backing file) > */ > -static int bdrv_temp_snapshot_flags(int flags) > +static void bdrv_temp_snapshot_options(int *child_flags, QDict > *child_options, > + int parent_flags, QDict > *parent_options) > { > -return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; > +*child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; > + > +/* For temporary files, unconditional cache=unsafe is fine */ > +qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on"); > +qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off"); > +qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on"); > } > > /* > @@ -1424,13 +1430,13 @@ done: > return c; > } > > -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) > +static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, > + QDict *snapshot_options, Error **errp) > { > /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ > char *tmp_filename = g_malloc0(PATH_MAX + 1); > int64_t total_size; > QemuOpts *opts = NULL; > -QDict *snapshot_options; > BlockDriverState *bs_snapshot; > Error *local_err = NULL; > int ret; > @@ -1465,7 +1471,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int > flags, Error **errp) > } > > /* Prepare a new options QDict for the temporary file */ This comment reads a bit weird now. Rest looks good and this is not exactly critical, so: Reviewed-by: Max Reitz > -snapshot_options = qdict_new(); > qdict_put(snapshot_options, "file.driver", >qstring_from_str("file")); > qdict_put(snapshot_options, "file.filename", > @@ -1477,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int > flags, Error **errp) > > ret = bdrv_open(_snapshot, NULL, NULL, snapshot_options, > flags, _err); > +snapshot_options = NULL; > if (ret < 0) { > error_propagate(errp, local_err); > goto out; > @@ -1485,6 +1491,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int > flags, Error **errp) > bdrv_append(bs_snapshot, bs); > > out: > +QDECREF(snapshot_options); > g_free(tmp_filename); > return ret; > } > @@ -1516,6 +1523,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, > const char *filename, > const char *drvname; > const char *backing; > Error *local_err = NULL; > +QDict *snapshot_options = NULL; > int snapshot_flags = 0; > > assert(pbs); > @@ -1607,7 +1615,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, > const char *filename, > flags |= BDRV_O_ALLOW_RDWR; > } > if (flags & BDRV_O_SNAPSHOT) { > -snapshot_flags = bdrv_temp_snapshot_flags(flags); > +snapshot_options = qdict_new(); > +bdrv_temp_snapshot_options(_flags, snapshot_options, > + flags, options); > bdrv_backing_options(, options, flags, options); > } > > @@ -1709,7 +1719,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, > const char *filename, >
[Qemu-block] [PATCH] block/gluster: add support for SEEK_DATA/SEEK_HOLE
GlusterFS 3.8 contains support for SEEK_DATA and SEEK_HOLE. This makes it possible to detect sparse areas in files. Signed-off-by: Niels de Vos-- Tested by compiling and running "qemu-img map gluster://..." with a build of the current master branch of glusterfs. Using a Fedora cloud image (in raw format) shows many SEEK procudure calls going back and forth over the network. The output of "qemu map" matches the output when run against the image on the local filesystem. --- block/gluster.c | 159 configure | 25 + 2 files changed, 184 insertions(+) diff --git a/block/gluster.c b/block/gluster.c index 65077a0..1430010 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -677,6 +677,153 @@ static int qemu_gluster_has_zero_init(BlockDriverState *bs) return 0; } +#ifdef CONFIG_GLUSTERFS_SEEK_DATA +/* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. + * If @start is not in a hole, store @start in @data, and the + * beginning of the next hole in @hole, and return 0. + * If @start is in a non-trailing hole, store @start in @hole and the + * beginning of the next non-hole in @data, and return 0. + * If @start is in a trailing hole or beyond EOF, return -ENXIO. + * If we can't find out, return a negative errno other than -ENXIO. + * + * (Shamefully copied from raw-posix.c, only miniscule adaptions.) + */ +static int find_allocation(BlockDriverState *bs, off_t start, + off_t *data, off_t *hole) +{ +BDRVGlusterState *s = bs->opaque; +off_t offs; + +/* + * SEEK_DATA cases: + * D1. offs == start: start is in data + * D2. offs > start: start is in a hole, next data at offs + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole + * or start is beyond EOF + * If the latter happens, the file has been truncated behind + * our back since we opened it. All bets are off then. + * Treating like a trailing hole is simplest. + * D4. offs < 0, errno != ENXIO: we learned nothing + */ +offs = glfs_lseek(s->fd, start, SEEK_DATA); +if (offs < 0) { +return -errno; /* D3 or D4 */ +} +assert(offs >= start); + +if (offs > start) { +/* D2: in hole, next data at offs */ +*hole = start; +*data = offs; +return 0; +} + +/* D1: in data, end not yet known */ + +/* + * SEEK_HOLE cases: + * H1. offs == start: start is in a hole + * If this happens here, a hole has been dug behind our back + * since the previous lseek(). + * H2. offs > start: either start is in data, next hole at offs, + * or start is in trailing hole, EOF at offs + * Linux treats trailing holes like any other hole: offs == + * start. Solaris seeks to EOF instead: offs > start (blech). + * If that happens here, a hole has been dug behind our back + * since the previous lseek(). + * H3. offs < 0, errno = ENXIO: start is beyond EOF + * If this happens, the file has been truncated behind our + * back since we opened it. Treat it like a trailing hole. + * H4. offs < 0, errno != ENXIO: we learned nothing + * Pretend we know nothing at all, i.e. "forget" about D1. + */ +offs = glfs_lseek(s->fd, start, SEEK_HOLE); +if (offs < 0) { +return -errno; /* D1 and (H3 or H4) */ +} +assert(offs >= start); + +if (offs > start) { +/* + * D1 and H2: either in data, next hole at offs, or it was in + * data but is now in a trailing hole. In the latter case, + * all bets are off. Treating it as if it there was data all + * the way to EOF is safe, so simply do that. + */ +*data = start; +*hole = offs; +return 0; +} + +/* D1 and H1 */ +return -EBUSY; +} + +/* + * Returns the allocation status of the specified sectors. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + * + * (Based on raw_co_get_block_status() from raw-posix.c.) + */ +static int64_t coroutine_fn qemu_gluster_co_get_block_status( +BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) +{ +BDRVGlusterState *s = bs->opaque; +off_t start, data = 0, hole = 0; +int64_t total_size; +int ret = -EINVAL; + +if (!s->fd) { +return ret; +} + +start = sector_num * BDRV_SECTOR_SIZE; +total_size =
Re: [Qemu-block] [PATCH] qed: fix bdrv_qed_drain
Am 23.02.2016 um 14:54 hat Paolo Bonzini geschrieben: > > > On 23/02/2016 13:49, Fam Zheng wrote: > > On Tue, 02/23 11:43, Paolo Bonzini wrote: > >> > >> > >> On 23/02/2016 06:57, Fam Zheng wrote: > >> +qed_cancel_need_check_timer(s); > >> +qed_need_check_timer_cb(s); > >> +} > > > > What if an allocating write is queued (the else branch case)? Its > > completion > > will be in bdrv_drain and it could arm the need_check_timer which is > > wrong. > > > > We need to drain the allocating_write_reqs queue before checking the > > timer. > > You're right, but how? That's what bdrv_drain(bs) does, it's a > chicken-and-egg problem. > >>> > >>> Maybe use an aio_poll loop before the if? > >> > >> That would not change the fact that you're reimplementing bdrv_drain > >> inside bdrv_qed_drain. > > > > But it fulfills the contract of .bdrv_drain. This is the easy way, the hard > > way > > would be iterating through the allocating_write_reqs list and process reqs > > one > > by one synchronously, which still involves aio_poll indirectly. > > The easy way would be better then. > > Stefan, any second opinion? What's the status here? It would be good to have qed not segfaulting all the time. Kevin
Re: [Qemu-block] strange crash in tracked_request_begin
On Mon, Mar 07, 2016 at 01:29:08PM +0100, Christian Borntraeger wrote: > Folks, > > I had a crash of a qemu guest in tracked_request_begin. > The testcase was a guest with ramdisk/kernel that reboots in a > loop. (about 10 times per second) with a single null-co disk > attached. No idea how to reproduce this, seems to be a lucky hit. > > (gdb) bt > #0 0x101db5ba in tracked_request_begin (req=req@entry=0x3ff90f1bdc0, > bs=bs@entry=0x42a39190, offset=offset@entry=0, bytes=bytes@entry=4096, > type=type@entry=BDRV_TRACKED_READ) > at /home/cborntra/REPOS/qemu/block/io.c:390 > #1 0x101de91e in bdrv_co_do_preadv (bs=0x42a39190, offset=0, > bytes=4096, qiov=0x3ff7400cbd8, flags=, flags@entry=(unknown: > 0)) > at /home/cborntra/REPOS/qemu/block/io.c:1001 > #2 0x101dfc3e in bdrv_co_do_readv (flags=(unknown: 0), > qiov=, nb_sectors=, sector_num=, > bs=) > at /home/cborntra/REPOS/qemu/block/io.c:1024 > #3 bdrv_co_do_rw (opaque=0x3ff7400e370) at > /home/cborntra/REPOS/qemu/block/io.c:2173 > #4 0x1022d8f6 in coroutine_trampoline (i0=, > i1=-1946150928) at /home/cborntra/REPOS/qemu/util/coroutine-ucontext.c:79 > #5 0x03ff95ed150a in __makecontext_ret () from /lib64/libc.so.6 > > looking at the code we are at > > QLIST_INSERT_HEAD(>tracked_requests, req, list); > which translates to > > if (((req)->list.le_next = (>tracked_requests)->lh_first) != NULL) > (>tracked_requests)->lh_first->list.le_prev = &(req)->list.le_next; > (>tracked_requests)->lh_first = (req); > (req)->list.le_prev = &(>tracked_requests)->lh_first; > > gdb says, that (>tracked_requests)->lh_first) is zero in the corefile > (gdb) print /x bs->tracked_requests > $6 = {lh_first = 0x0} > > Now looking at the code I am asking myself if this can happen in parallel > to another code that touches tracked_requests, because gcc seems to read > >tracked_requests)->lh_first twice (first to check the value, then > to use it as pointer) tracked_requests is protected by AioContext. Perhaps something is doing I/O without acquiring AioContext? Luckily there is only 1 place where items are added and removed from tracked_requests. This might make debugging somewhat easier. > > 388 qemu_co_queue_init(>wait_queue); >0x101db594 <+76>: la %r2,72(%r13) >0x101db598 <+80>: brasl %r14,0x1022cdc0 > > 389 > 390 QLIST_INSERT_HEAD(>tracked_requests, req, list); >0x101db59e <+86>: lg %r1,12744(%r12) # r1 = > (>tracked_requests)->lh_first) >0x101db5a4 <+92>: stg %r1,48(%r13)# > (req)->list.le_next = r1 >0x101db5aa <+98>: cgij%r1,0,8,0x101db5c0 ---+ # if r1==0 goto >0x101db5b0 <+104>: lg %r1,12744(%r12) | # r1 = > (>tracked_requests)->lh_first) (again!!) >0x101db5b6 <+110>: la %r2,48(%r13) | > => 0x101db5ba <+114>: stg %r2,56(%r1) | # r1==0 bang >0x101db5c0 <+120>: stg %r13,12744(%r12)<-+ >0x101db5c6 <+126>: lay %r12,12744(%r12) >0x101db5cc <+132>: stg %r12,56(%r13) > > > Christian > signature.asc Description: PGP signature
Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()
On 07.03.2016 17:02, Eric Blake wrote: > On 03/05/2016 11:13 AM, Max Reitz wrote: > >>> +index = atoi(child->name + 9); >> >> Optional: Assert absence of an error: >> > > Indeed, atoi() is worthless, because it cannot do error detection. > >> unsigned long index; >> char *endptr; >> >> index = strtoul(child->name + 9, , 10); >> assert(index >= 0 && !*endptr); > > Still incorrect; you aren't handling errno properly for detecting all > errors. Even better is to use qemu_strtoul(), which already handles > proper error detection. Yeah, I keep forgetting that it returns ULONG_MAX on range error... Max signature.asc Description: OpenPGP digital signature
Re: [Qemu-block] [PATCH v10 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()
On 03/05/2016 11:13 AM, Max Reitz wrote: >> +index = atoi(child->name + 9); > > Optional: Assert absence of an error: > Indeed, atoi() is worthless, because it cannot do error detection. > unsigned long index; > char *endptr; > > index = strtoul(child->name + 9, , 10); > assert(index >= 0 && !*endptr); Still incorrect; you aren't handling errno properly for detecting all errors. Even better is to use qemu_strtoul(), which already handles proper error detection. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-block] [Qemu-devel] [PATCH] block: Fix cache mode defaults in bds_tree_init()
On 03/07/2016 06:42 AM, Kevin Wolf wrote: > Without setting explicit defaults in the options, blockdev-add without > an ID ended up defaulting to writethrough. It should be writeback as > documented. > > Signed-off-by: Kevin Wolf> --- > blockdev.c | 7 +++ > 1 file changed, 7 insertions(+) > Reviewed-by: Eric Blake > diff --git a/blockdev.c b/blockdev.c > index eecd78d..1824cae 100644 > --- a/blockdev.c > +++ b/blockdev.c > @@ -675,6 +675,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, > Error **errp) > goto fail; > } > > +/* bdrv_open() defaults to the values in bdrv_flags (for compatibility > + * with other callers) rather than what we want as the real defaults. > + * Apply the defaults here instead. */ > +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on"); > +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off"); > +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off"); > + > if (runstate_check(RUN_STATE_INMIGRATE)) { > bdrv_flags |= BDRV_O_INACTIVE; > } > -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
[Qemu-block] strange crash in tracked_request_begin
Folks, I had a crash of a qemu guest in tracked_request_begin. The testcase was a guest with ramdisk/kernel that reboots in a loop. (about 10 times per second) with a single null-co disk attached. No idea how to reproduce this, seems to be a lucky hit. (gdb) bt #0 0x101db5ba in tracked_request_begin (req=req@entry=0x3ff90f1bdc0, bs=bs@entry=0x42a39190, offset=offset@entry=0, bytes=bytes@entry=4096, type=type@entry=BDRV_TRACKED_READ) at /home/cborntra/REPOS/qemu/block/io.c:390 #1 0x101de91e in bdrv_co_do_preadv (bs=0x42a39190, offset=0, bytes=4096, qiov=0x3ff7400cbd8, flags=, flags@entry=(unknown: 0)) at /home/cborntra/REPOS/qemu/block/io.c:1001 #2 0x101dfc3e in bdrv_co_do_readv (flags=(unknown: 0), qiov=, nb_sectors=, sector_num=, bs=) at /home/cborntra/REPOS/qemu/block/io.c:1024 #3 bdrv_co_do_rw (opaque=0x3ff7400e370) at /home/cborntra/REPOS/qemu/block/io.c:2173 #4 0x1022d8f6 in coroutine_trampoline (i0=, i1=-1946150928) at /home/cborntra/REPOS/qemu/util/coroutine-ucontext.c:79 #5 0x03ff95ed150a in __makecontext_ret () from /lib64/libc.so.6 looking at the code we are at QLIST_INSERT_HEAD(>tracked_requests, req, list); which translates to if (((req)->list.le_next = (>tracked_requests)->lh_first) != NULL) (>tracked_requests)->lh_first->list.le_prev = &(req)->list.le_next; (>tracked_requests)->lh_first = (req); (req)->list.le_prev = &(>tracked_requests)->lh_first; gdb says, that (>tracked_requests)->lh_first) is zero in the corefile (gdb) print /x bs->tracked_requests $6 = {lh_first = 0x0} Now looking at the code I am asking myself if this can happen in parallel to another code that touches tracked_requests, because gcc seems to read >tracked_requests)->lh_first twice (first to check the value, then to use it as pointer) 388 qemu_co_queue_init(>wait_queue); 0x101db594 <+76>:la %r2,72(%r13) 0x101db598 <+80>:brasl %r14,0x1022cdc0 389 390 QLIST_INSERT_HEAD(>tracked_requests, req, list); 0x101db59e <+86>:lg %r1,12744(%r12) # r1 = (>tracked_requests)->lh_first) 0x101db5a4 <+92>:stg %r1,48(%r13)# (req)->list.le_next = r1 0x101db5aa <+98>:cgij%r1,0,8,0x101db5c0 ---+ # if r1==0 goto 0x101db5b0 <+104>: lg %r1,12744(%r12) | # r1 = (>tracked_requests)->lh_first) (again!!) 0x101db5b6 <+110>: la %r2,48(%r13) | => 0x101db5ba <+114>: stg %r2,56(%r1) | # r1==0 bang 0x101db5c0 <+120>: stg %r13,12744(%r12)<-+ 0x101db5c6 <+126>: lay %r12,12744(%r12) 0x101db5cc <+132>: stg %r12,56(%r13) Christian
[Qemu-block] [PATCH] block: Fix cache mode defaults in bds_tree_init()
Without setting explicit defaults in the options, blockdev-add without an ID ended up defaulting to writethrough. It should be writeback as documented. Signed-off-by: Kevin Wolf--- blockdev.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/blockdev.c b/blockdev.c index eecd78d..1824cae 100644 --- a/blockdev.c +++ b/blockdev.c @@ -675,6 +675,13 @@ static BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp) goto fail; } +/* bdrv_open() defaults to the values in bdrv_flags (for compatibility + * with other callers) rather than what we want as the real defaults. + * Apply the defaults here instead. */ +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_WB, "on"); +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_DIRECT, "off"); +qdict_set_default_str(bs_opts, BDRV_OPT_CACHE_NO_FLUSH, "off"); + if (runstate_check(RUN_STATE_INMIGRATE)) { bdrv_flags |= BDRV_O_INACTIVE; } -- 1.8.3.1
Re: [Qemu-block] strange crash in tracked_request_begin
On 07/03/2016 13:29, Christian Borntraeger wrote: > Folks, > > I had a crash of a qemu guest in tracked_request_begin. > The testcase was a guest with ramdisk/kernel that reboots in a > loop. (about 10 times per second) with a single null-co disk > attached. Does it use a separate iothread? Paolo
[Qemu-block] [PATCH] block: Fix snapshot=on cache modes
Since commit 91a097e, we end up with a somewhat weird cache mode configuration with snapshot=on: The commit broke the cache mode inheritance for the snapshot overlay so that it is opened as writethrough instead of unsafe now. The following bdrv_append() call to put it on top of the tree swaps the WCE flag with the snapshot's backing file (i.e. the originally given file), so what we eventually get is cache=writeback on the temporary overlay and cache=writethrough,cache.no-flush=on on the real image file. This patch changes things so that the temporary overlay gets cache=unsafe again like it used to, and the real images get whatever the user specified. This means that cache.direct is now respected even with snapshot=on, and in the case of committing changes, the final flush is no longer ignored except explicitly requested by the user. Signed-off-by: Kevin Wolf--- block.c | 34 -- blockdev.c| 7 --- include/block/block.h | 1 - 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/block.c b/block.c index ba24b8e..e3fe8ed 100644 --- a/block.c +++ b/block.c @@ -687,13 +687,19 @@ int bdrv_parse_cache_flags(const char *mode, int *flags) } /* - * Returns the flags that a temporary snapshot should get, based on the - * originally requested flags (the originally requested image will have flags - * like a backing file) + * Returns the options and flags that a temporary snapshot should get, based on + * the originally requested flags (the originally requested image will have + * flags like a backing file) */ -static int bdrv_temp_snapshot_flags(int flags) +static void bdrv_temp_snapshot_options(int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options) { -return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; +*child_flags = (parent_flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; + +/* For temporary files, unconditional cache=unsafe is fine */ +qdict_set_default_str(child_options, BDRV_OPT_CACHE_WB, "on"); +qdict_set_default_str(child_options, BDRV_OPT_CACHE_DIRECT, "off"); +qdict_set_default_str(child_options, BDRV_OPT_CACHE_NO_FLUSH, "on"); } /* @@ -1424,13 +1430,13 @@ done: return c; } -int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) +static int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, + QDict *snapshot_options, Error **errp) { /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ char *tmp_filename = g_malloc0(PATH_MAX + 1); int64_t total_size; QemuOpts *opts = NULL; -QDict *snapshot_options; BlockDriverState *bs_snapshot; Error *local_err = NULL; int ret; @@ -1465,7 +1471,6 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) } /* Prepare a new options QDict for the temporary file */ -snapshot_options = qdict_new(); qdict_put(snapshot_options, "file.driver", qstring_from_str("file")); qdict_put(snapshot_options, "file.filename", @@ -1477,6 +1482,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) ret = bdrv_open(_snapshot, NULL, NULL, snapshot_options, flags, _err); +snapshot_options = NULL; if (ret < 0) { error_propagate(errp, local_err); goto out; @@ -1485,6 +1491,7 @@ int bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) bdrv_append(bs_snapshot, bs); out: +QDECREF(snapshot_options); g_free(tmp_filename); return ret; } @@ -1516,6 +1523,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, const char *drvname; const char *backing; Error *local_err = NULL; +QDict *snapshot_options = NULL; int snapshot_flags = 0; assert(pbs); @@ -1607,7 +1615,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, flags |= BDRV_O_ALLOW_RDWR; } if (flags & BDRV_O_SNAPSHOT) { -snapshot_flags = bdrv_temp_snapshot_flags(flags); +snapshot_options = qdict_new(); +bdrv_temp_snapshot_options(_flags, snapshot_options, + flags, options); bdrv_backing_options(, options, flags, options); } @@ -1709,7 +1719,9 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename, /* For snapshot=on, create a temporary qcow2 overlay. bs points to the * temporary snapshot afterwards. */ if (snapshot_flags) { -ret = bdrv_append_temp_snapshot(bs, snapshot_flags, _err); +ret = bdrv_append_temp_snapshot(bs, snapshot_flags, snapshot_options, +_err); +snapshot_options = NULL; if (local_err) { goto