On 03/23/2016 04:22 AM, Vladimir Sementsov-Ogievskiy wrote: > On 23.03.2016 00:49, John Snow wrote: >> >> On 03/15/2016 04:04 PM, Vladimir Sementsov-Ogievskiy wrote: >>> Add functions for load/store HBitmap to BDS, using clusters table: >>> Last level of the bitmap is splitted into chunks of 'cluster_size' >>> size. Each cell of the table contains offset in bds, to load/store >>> corresponding chunk. >>> >>> Also, >>> 0 in cell means all-zeroes-chunk (should not be saved) >>> 1 in cell means all-ones-chunk (should not be saved) >>> hbitmap_prepare_store() fills table with >>> 0 for all-zeroes chunks >>> 1 for all-ones chunks >>> 2 for others >>> >>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> >>> --- >>> block/dirty-bitmap.c | 23 +++++ >>> include/block/dirty-bitmap.h | 11 +++ >>> include/qemu/hbitmap.h | 12 +++ >>> util/hbitmap.c | 209 >>> +++++++++++++++++++++++++++++++++++++++++++ >>> 4 files changed, 255 insertions(+) >>> >>> diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c >>> index e68c177..816c6ee 100644 >>> --- a/block/dirty-bitmap.c >>> +++ b/block/dirty-bitmap.c >>> @@ -396,3 +396,26 @@ int64_t bdrv_get_dirty_count(BdrvDirtyBitmap >>> *bitmap) >>> { >>> return hbitmap_count(bitmap->bitmap); >>> } >>> + >>> +int bdrv_dirty_bitmap_load(BdrvDirtyBitmap *bitmap, BlockDriverState >>> *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size) >>> +{ >>> + return hbitmap_load(bitmap->bitmap, bs, table, table_size, >>> cluster_size); >>> +} >>> + >>> +int bdrv_dirty_bitmap_prepare_store(const BdrvDirtyBitmap *bitmap, >>> + uint32_t cluster_size, >>> + uint64_t *table, >>> + uint32_t *table_size) >>> +{ >>> + return hbitmap_prepare_store(bitmap->bitmap, cluster_size, >>> + table, table_size); >>> +} >>> + >>> +int bdrv_dirty_bitmap_store(const BdrvDirtyBitmap *bitmap, >>> BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size) >>> +{ >>> + return 
hbitmap_store(bitmap->bitmap, bs, table, table_size, >>> cluster_size); >>> +} >>> diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h >>> index 27515af..20cb540 100644 >>> --- a/include/block/dirty-bitmap.h >>> +++ b/include/block/dirty-bitmap.h >>> @@ -43,4 +43,15 @@ void bdrv_set_dirty_iter(struct HBitmapIter *hbi, >>> int64_t offset); >>> int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); >>> void bdrv_dirty_bitmap_truncate(BlockDriverState *bs); >>> +int bdrv_dirty_bitmap_load(BdrvDirtyBitmap *bitmap, >>> BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size); >>> +int bdrv_dirty_bitmap_prepare_store(const BdrvDirtyBitmap *bitmap, >>> + uint32_t cluster_size, >>> + uint64_t *table, >>> + uint32_t *table_size); >>> +int bdrv_dirty_bitmap_store(const BdrvDirtyBitmap *bitmap, >>> BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size); >>> + >>> #endif >>> diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h >>> index 6d1da4d..d83bb79 100644 >>> --- a/include/qemu/hbitmap.h >>> +++ b/include/qemu/hbitmap.h >>> @@ -241,5 +241,17 @@ static inline size_t >>> hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_c >>> return hbi->pos; >>> } >>> +typedef struct BlockDriverState BlockDriverState; >>> + >>> +int hbitmap_load(HBitmap *bitmap, BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size); >>> + >>> +int hbitmap_prepare_store(const HBitmap *bitmap, uint32_t cluster_size, >>> + uint64_t *table, uint32_t *table_size); >>> + >>> +int hbitmap_store(HBitmap *bitmap, BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size); >>> #endif >>> diff --git a/util/hbitmap.c b/util/hbitmap.c >>> index 28595fb..1960e4f 100644 >>> --- a/util/hbitmap.c >>> +++ b/util/hbitmap.c >>> @@ -15,6 +15,8 @@ >>> #include "qemu/host-utils.h" >>> #include "trace.h" >>> +#include 
"block/block.h" >>> + >>> /* HBitmaps provides an array of bits. The bits are stored as >>> usual in an >>> * array of unsigned longs, but HBitmap is also optimized to >>> provide fast >>> * iteration over set bits; going from one bit to the next is >>> O(logB n) >>> @@ -499,3 +501,210 @@ char *hbitmap_md5(const HBitmap *bitmap) >>> const guchar *data = (const guchar >>> *)bitmap->levels[HBITMAP_LEVELS - 1]; >>> return g_compute_checksum_for_data(G_CHECKSUM_MD5, data, size); >>> } >>> + >>> +/* hb_restore_levels() >>> + * Using the last level restore all other levels >>> + */ >>> +static void hb_restore_levels(HBitmap *bitmap) >>> +{ >>> + int64_t i, size, prev_size; >>> + int lev; >>> + >>> + /* restore levels starting from penultimate to zero level, assuming >>> + * that the last level is ok */ >>> + size = MAX((bitmap->size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, >>> 1); >>> + for (lev = HBITMAP_LEVELS - 1; lev-- > 0; ) { >>> + prev_size = size; >>> + size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); >>> + memset(bitmap->levels[lev], 0, size * sizeof(unsigned long)); >>> + >>> + for (i = 0; i < prev_size; ++i) { >>> + if (bitmap->levels[lev + 1][i]) { >>> + bitmap->levels[lev][i >> BITS_PER_LEVEL] |= >>> + 1UL << (i & (BITS_PER_LONG - 1)); >>> + } >>> + } >>> + } >>> + >>> + bitmap->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); >>> +} >>> + >>> +/* load_bitmap() >>> + * Load dirty bitmap from file, using table with cluster offsets. >>> + * Table entries are assumed to be in little endian format. 
>>> + */ >>> +int hbitmap_load(HBitmap *bitmap, BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size) >>> +{ >>> + uint32_t i; >>> + uint8_t *cur = (uint8_t *)bitmap->levels[HBITMAP_LEVELS - 1]; >>> + uint8_t *end = cur + ((bitmap->size + 7) >> 3); >>> + >>> + hbitmap_reset_all(bitmap); >>> + >>> + for (i = 0; i < table_size && cur < end; ++i) { >>> + uint64_t offset = table[i]; >>> + uint64_t count = MIN(cluster_size, end - cur); >>> + >>> + /* Zero offset means zero region, offset = 1 means filled >>> region. >>> + * Cluster is not allocated in both cases. */ >>> + if (offset == 1) { >>> + memset(cur, 0xff, count); >>> + } else if (offset) { >>> + int ret = bdrv_pread(bs, offset, cur, count); >>> + if (ret < 0) { >>> + return ret; >>> + } >>> + } >>> + >>> + cur += cluster_size; >>> + } >>> + >>> + cur = (uint8_t *)bitmap->levels[HBITMAP_LEVELS - 1]; >>> + while (cur < end) { >>> + if (BITS_PER_LONG == 32) { >>> + le32_to_cpus((uint32_t *)cur); >>> + } else { >>> + le64_to_cpus((uint64_t *)cur); >>> + } >>> + >>> + cur += sizeof(unsigned long); >>> + } >>> + >>> + hb_restore_levels(bitmap); >>> + >>> + return 0; >>> +} >>> + >>> +static bool buffer_is_all_ones(void *buf, size_t len) >>> +{ >>> + /* FIXME */ >>> + return false; >>> +} >>> + >>> +/* hbitmap_prepare_store() >>> + * Devide bitmap data into clusters, and then, >>> + * for zero cluster: table[i] = 0 >>> + * for all-ones cluster: table[i] = 1 >>> + * for other clusters: table[i] = 2 >>> + */ >>> +int hbitmap_prepare_store(const HBitmap *bitmap, >>> + uint32_t cluster_size, >>> + uint64_t *table, >>> + uint32_t *table_size) >>> +{ >>> + HBitmapIter hbi; >>> + hbitmap_iter_init(&hbi, bitmap, 0); >>> + uint64_t nb_bits_in_cl = (uint64_t)cluster_size << 3; >>> + uint32_t need_table_size = >>> + (bitmap->size + nb_bits_in_cl - 1) / nb_bits_in_cl; >>> + >>> + if (table == NULL && *table_size == 0) { >>> + *table_size = need_table_size; >>> + return 0; >>> + } >>> 
+ >>> + if (*table_size < need_table_size) { >>> + return -ENOMEM; >>> + } >>> + >>> + memset(table, 0, *table_size * sizeof(table[0])); >>> + >>> + for (;;) { >>> + unsigned long cur; >>> + size_t pos = hbitmap_iter_next_word(&hbi, &cur); >>> + size_t byte = pos * sizeof(unsigned long); >>> + uint64_t bit = byte << 3; >>> + uint64_t nbits = MIN(cluster_size << 3, bitmap->size - bit), >>> next_bit; >>> + size_t i = byte / cluster_size; >>> + >>> + if (pos == -1) { >>> + break; >>> + } >>> + >>> + if (pos % cluster_size != 0) { >>> + table[i] = 2; >>> + } else if (buffer_is_all_ones(&bitmap->levels[HBITMAP_LEVELS >>> - 1][pos], >>> + nbits >> 3)) { >>> + table[i] = 1; >>> + if (nbits & 7) { >>> + uint8_t last_byte = >>> + *(((uint8_t *)&bitmap->levels[HBITMAP_LEVELS >>> - 1][pos]) >>> + + (nbits >> 3)); >>> + if (last_byte != ((1 << (nbits & 7)) - 1)) { >>> + table[i] = 2; >>> + } >>> + } >>> + } else { >>> + table[i] = 2; >>> + } >>> + >>> + next_bit = (i + 1) * cluster_size << 3; >>> + >>> + if (next_bit >= bitmap->size) { >>> + break; >>> + } >>> + >>> + hbitmap_iter_init(&hbi, bitmap, next_bit << >>> bitmap->granularity); >>> + } >>> + >>> + return 0; >>> +} >>> + >>> +static void longs_to_le(unsigned long *longs, size_t count) >>> +{ >>> + unsigned long *end = longs + count; >>> + while (longs < end) { >>> + if (BITS_PER_LONG == 32) { >>> + cpu_to_le32s((uint32_t *)longs); >>> + } else { >>> + cpu_to_le64s((uint64_t *)longs); >>> + } >>> + >>> + longs++; >>> + } >>> +} >>> + >>> +/* store_bitmap() >>> + * update bitmap table by storing bitmap to it. 
>>> + * bitmap table entries are assumed to be in big endian format >>> + * On the error, the resulting bitmap table is valid for clearing, but >>> + * may contain invalid bitmap */ >>> +int hbitmap_store(HBitmap *bitmap, BlockDriverState *bs, >>> + const uint64_t *table, uint32_t table_size, >>> + uint32_t cluster_size) >>> +{ >>> + int i; >>> + uint8_t *cur = (uint8_t *)bitmap->levels[HBITMAP_LEVELS - 1]; >>> + uint8_t *end = cur + >>> + ((bitmap->size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL) * >>> (sizeof(long)); >>> + >>> + for (i = 0; i < table_size && cur < end; ++i) { >>> + uint64_t offset = table[i]; >>> + uint64_t count = MIN(cluster_size, end - cur); >>> + >>> + /* Zero offset means zero region, offset = 1 means filled >>> region. >>> + * Cluster is not allocated in both cases. */ >>> + if (offset > 1) { >>> + int ret; >>> + if (cpu_to_le16(1) == 1) { >>> + ret = bdrv_pwrite(bs, offset, cur, count); >> I suspect that the reason you're using bdrv_pwrite down in here is to >> avoid a buffered copy where hbitmap prepares a buffer and then the file >> format layer decides what to do with it, opting instead to allow the >> hbitmap layer itself to do a direct write. >> >> I don't think this is going to work, though -- hbitmaps are a generic >> utility and shouldn't have block layer access. >> >> I think the standard, naive design is the right one: >> >> (1) qcow2-bitmap.c calls >> bdrv_dirty_bitmap_serialize(bdrvdirtybitmap *bitmap, void *out); >> and serialization primitives for hbitmap are used to construct a buffer >> placed in *out. >> >> (2) qcow2-bitmap.c writes this buffer itself where it needs it, using >> bdrv_pwrite. >> >> If this proves to be too slow, we can optimize it later. > > May be just pass writing function to hbitmap_store, like int > (*pwrite_func)(void *opaque, int64_t offset, const void *buf, int bytes) > ? Serialization is superfluous here. It is needed for migration, where > we forced to work with data chunks protocol layer. 
But why split > the bitmap here, dance with granularity and block layer wrappers, and make an > additional copy of the data? >
I hate to say it, but I think this is premature optimization. If you can prove there is a meaningful difference between a buffered serialization and a zero-copy, I'll go along with it -- but I think until we are bench-marking quite large bitmaps, we won't see much of a difference. This should be easy enough to improve later if we need to. >> >>> + } else { >>> + void *tmp = g_memdup(cur, count); >>> + longs_to_le((unsigned long *)cur, >>> + count / sizeof(unsigned long)); >>> + ret = bdrv_pwrite(bs, offset, tmp, count); >>> + g_free(tmp); >>> + } >>> + >>> + if (ret < 0) { >>> + return ret; >>> + } >>> + } >>> + >>> + cur += cluster_size; >>> + } >>> + >>> + return 0; >>> +} >>> > >