[Qemu-devel] [PATCH 3/3] aio: always check paio_init result
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/raw-posix.c | 11 +-- 1 files changed, 5 insertions(+), 6 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 275b41e..3c6bd4b 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -230,13 +230,15 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } } +/* We're falling back to POSIX AIO in some cases so init always */ +if (paio_init() 0) { +goto out_free_buf; +} + #ifdef CONFIG_LINUX_AIO if ((bdrv_flags (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { -/* We're falling back to POSIX AIO in some cases */ -paio_init(); - s-aio_ctx = laio_init(); if (!s-aio_ctx) { goto out_free_buf; @@ -245,9 +247,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } else #endif { -if (paio_init() 0) { -goto out_free_buf; -} #ifdef CONFIG_LINUX_AIO s-use_aio = 0; #endif -- 1.7.1
[Qemu-devel] [PATCH 0/2] improve Linux AIO support
These patches avoid many fallbacks to POSIX AIO and enable Linux AIO even if nocache is not specified. Also add flush support with Linux AIO. Frediano Ziglio (2): linux aio: support flush operation aio: use Linux AIO even if nocache is not specified block/raw-posix.c | 30 +- linux-aio.c |3 +++ 2 files changed, 20 insertions(+), 13 deletions(-)
[Qemu-devel] [PATCH 1/2] linux aio: support flush operation
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/raw-posix.c |7 +++ linux-aio.c |3 +++ 2 files changed, 10 insertions(+), 0 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 3c6bd4b..27ae81e 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -628,6 +628,13 @@ static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs, if (fd_open(bs) 0) return NULL; +#ifdef CONFIG_LINUX_AIO +if (s-use_aio) { +return laio_submit(bs, s-aio_ctx, s-fd, 0, NULL, + 0, cb, opaque, QEMU_AIO_FLUSH); +} +#endif + return paio_submit(bs, s-fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); } diff --git a/linux-aio.c b/linux-aio.c index 68f4b3d..d07c435 100644 --- a/linux-aio.c +++ b/linux-aio.c @@ -215,6 +215,9 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, case QEMU_AIO_READ: io_prep_preadv(iocbs, fd, qiov-iov, qiov-niov, offset); break; +case QEMU_AIO_FLUSH: +io_prep_fdsync(iocbs, fd); +break; default: fprintf(stderr, %s: invalid AIO request type 0x%x.\n, __func__, type); -- 1.7.1
[Qemu-devel] [PATCH 2/2] aio: use Linux AIO even if nocache is not specified
Currently Linux AIO are used only if nocache is specified. Linux AIO works in all cases. The only problem is that currently Linux AIO does not align data so I add a test that use POSIX AIO in this case. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/raw-posix.c | 23 ++- 1 files changed, 10 insertions(+), 13 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index 27ae81e..078a256 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -236,21 +236,16 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } #ifdef CONFIG_LINUX_AIO -if ((bdrv_flags (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == - (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { +s-use_aio = 0; +if ((bdrv_flags BDRV_O_NATIVE_AIO)) { s-aio_ctx = laio_init(); if (!s-aio_ctx) { goto out_free_buf; } s-use_aio = 1; -} else -#endif -{ -#ifdef CONFIG_LINUX_AIO -s-use_aio = 0; -#endif } +#endif #ifdef CONFIG_XFS if (platform_test_xfs_fd(s-fd)) { @@ -592,14 +587,16 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, if (s-aligned_buf) { if (!qiov_is_aligned(bs, qiov)) { type |= QEMU_AIO_MISALIGNED; -#ifdef CONFIG_LINUX_AIO -} else if (s-use_aio) { -return laio_submit(bs, s-aio_ctx, s-fd, sector_num, qiov, - nb_sectors, cb, opaque, type); -#endif } } +#ifdef CONFIG_LINUX_AIO +if (s-use_aio !(type QEMU_AIO_MISALIGNED)) { +return laio_submit(bs, s-aio_ctx, s-fd, sector_num, qiov, + nb_sectors, cb, opaque, type); +} +#endif + return paio_submit(bs, s-fd, sector_num, qiov, nb_sectors, cb, opaque, type); } -- 1.7.1
Re: [Qemu-devel] [PATCH 1/2] linux aio: support flush operation
Il giorno 27/lug/2011, alle ore 20:31, Christoph Hellwig h...@lst.de ha scritto: Did you test this at all? Yes! Not at kernel level :-) Usually I trust documentation and man pages. On Wed, Jul 27, 2011 at 08:25:25PM +0200, Frediano Ziglio wrote: +case QEMU_AIO_FLUSH: +io_prep_fdsync(iocbs, fd); +break; Looks great, but doesn't work as expected. Hint: grep for aio_fsync in the linux kernel. Thanks. I'll try to port misaligned access to Linux AIO. Also I'll add some comments on code to avoid somebody doing the same mistake I did. Mainly however -k qemu-img and aio=native in blockdev options are silently ignored if nocache is not enabled. Also I notice that combining XFS, Linux AIO, O_DIRECT and O_DSYNC gives impressive performance but currently there is no way to specify all those flags together because nocache enables O_DIRECT while O_DSYNC is enabled with writethrough. Frediano
Re: [Qemu-devel] [PATCH 09/10] posix-aio-compat: Allow read after EOF
2011/7/26 Kevin Wolf kw...@redhat.com: In order to be able to transparently replace bdrv_read calls by bdrv_co_read, reading beyond EOF must produce zeros instead of short reads for AIO, too. Signed-off-by: Kevin Wolf kw...@redhat.com --- posix-aio-compat.c | 19 +++ 1 files changed, 19 insertions(+), 0 deletions(-) diff --git a/posix-aio-compat.c b/posix-aio-compat.c index 788d113..8dc00cb 100644 --- a/posix-aio-compat.c +++ b/posix-aio-compat.c @@ -198,6 +198,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb) return len; } +/* + * Read/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handles or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. + */ static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf) { ssize_t offset = 0; @@ -334,6 +340,19 @@ static void *aio_thread(void *unused) switch (aiocb-aio_type QEMU_AIO_TYPE_MASK) { case QEMU_AIO_READ: + ret = handle_aiocb_rw(aiocb); + if (ret = 0 ret aiocb-aio_nbytes aiocb-common.bs-growable) { + /* A short read means that we have reached EOF. Pad the buffer + * with zeros for bytes after EOF. */ + QEMUIOVector qiov; + + qemu_iovec_init_external(qiov, aiocb-aio_iov, + aiocb-aio_niov); + qemu_iovec_memset_skip(qiov, 0, aiocb-aio_nbytes - ret, ret); + + ret = aiocb-aio_nbytes; + } + break; case QEMU_AIO_WRITE: ret = handle_aiocb_rw(aiocb); break; -- 1.7.6 Still not tested but I think to know what does it solve :) I think Linux AIO require same attention. Frediano
[Qemu-devel] [PATCH 0/5] qcow: coroutines cleanup
These patches mostly cleanup some AIO code using coroutines. These patches apply to Kevin's repository, branch coroutine-block. Mostly they use stack instead of allocated AIO structure. Frediano Ziglio (5): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev qcow: remove old #undefined code block/qcow.c | 374 ++--- block/qcow2.c | 38 ++ 2 files changed, 129 insertions(+), 283 deletions(-)
[Qemu-devel] [PATCH 1/5] qcow: allocate QCowAIOCB structure using stack
instead of calling qemi_aio_get use stack Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 52 block/qcow2.c | 38 +++--- 2 files changed, 27 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 6447c2a..d19ef04 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -503,28 +503,12 @@ typedef struct QCowAIOCB { BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; -static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -if (acb-hd_aiocb) -bdrv_aio_cancel(acb-hd_aiocb); -qemu_aio_release(acb); -} - -static AIOPool qcow_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow_aio_cancel, -}; - static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write) +int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-hd_aiocb = NULL; acb-sector_num = sector_num; acb-qiov = qiov; @@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static int qcow_aio_read_cb(void *opaque) +static int qcow_aio_read_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_read_cb(acb); +ret = qcow_aio_read_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_iovec_from_buffer(acb.qiov, 
acb.orig_buf, acb.qiov-size); +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } -static int qcow_aio_write_cb(void *opaque) +static int qcow_aio_write_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; s-cluster_cache_offset = -1; /* disable compressed cache */ -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_write_cb(acb); +ret = qcow_aio_write_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index f07d550..edc068e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -388,17 +388,6 @@ typedef struct QCowAIOCB { QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; -static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -qemu_aio_release(acb); -} - -static AIOPool qcow2_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow2_aio_cancel, -}; - /* * Returns 0 when the request is completed successfully, 1 when there is still * a part left to do and -errno in error cases. 
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write) + void *opaque, int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; acb-is_write = is_write; @@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int
[Qemu-devel] [PATCH 2/5] qcow: QCowAIOCB field cleanup
remove unused field from this structure and put some of them in qcow_aio_read_cb and qcow_aio_write_cb Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 135 +++--- 1 files changed, 63 insertions(+), 72 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index d19ef04..e52df91 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -487,72 +487,61 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, #endif typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; uint8_t *buf; void *orig_buf; int nb_sectors; -int n; -uint64_t cluster_offset; -uint8_t *cluster_data; -struct iovec hd_iov; -bool is_write; -QEMUBH *bh; -QEMUIOVector hd_qiov; -BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int is_write, QCowAIOCB *acb) { -memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; -acb-hd_aiocb = NULL; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; if (qiov-niov 1) { acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); if (is_write) qemu_iovec_to_buffer(qiov, acb-buf); } else { +acb-orig_buf = NULL; acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -acb-n = 0; -acb-cluster_offset = 0; return acb; } static int qcow_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret; - -acb-hd_aiocb = NULL; +int ret, n = 0; +uint64_t cluster_offset = 0; +struct iovec hd_iov; +QEMUIOVector hd_qiov; redo: /* post process the read buffer */ -if (!acb-cluster_offset) { +if (!cluster_offset) { /* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* nothing to do */ } else { if (s-crypt_method) { encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -acb-n, 0, +n, 0, 
s-aes_decrypt_key); } } -acb-nb_sectors -= acb-n; -acb-sector_num += acb-n; -acb-buf += acb-n * 512; +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; if (acb-nb_sectors == 0) { /* request completed */ @@ -560,57 +549,57 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } /* prepare next AIO request */ -acb-cluster_offset = get_cluster_offset(bs, acb-sector_num 9, +cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -acb-n = s-cluster_sectors - index_in_cluster; -if (acb-n acb-nb_sectors) -acb-n = acb-nb_sectors; +n = s-cluster_sectors - index_in_cluster; +if (n acb-nb_sectors) +n = acb-nb_sectors; -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n * 512; -qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1); +hd_iov.iov_base = (void *)acb-buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -acb-n, acb-hd_qiov); +n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return -EIO; } } else { /* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * acb-n); +memset(acb-buf, 0, 512 * n); goto redo; } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, acb-cluster_offset) 0) { +if (decompress_cluster(bs, cluster_offset) 0) { return -EIO; } memcpy(acb-buf, - s-cluster_cache + index_in_cluster * 512, 512 * acb-n); + s-cluster_cache + index_in_cluster * 512, 512 * n); goto redo; } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } -acb-hd_iov.iov_base = (void *)acb-buf
[Qemu-devel] [PATCH 4/5] qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 287 - 1 files changed, 121 insertions(+), 166 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 015a472..2feb7d7 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -486,221 +486,176 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, } #endif -typedef struct QCowAIOCB { -BlockDriverState *bs; -int64_t sector_num; -QEMUIOVector *qiov; -uint8_t *buf; -void *orig_buf; -int nb_sectors; -} QCowAIOCB; - -static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, -int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write, QCowAIOCB *acb) -{ -acb-bs = bs; -acb-sector_num = sector_num; -acb-qiov = qiov; - -if (qiov-niov 1) { -acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); -if (is_write) -qemu_iovec_to_buffer(qiov, acb-buf); -} else { -acb-orig_buf = NULL; -acb-buf = (uint8_t *)qiov-iov-iov_base; -} -acb-nb_sectors = nb_sectors; -return acb; -} - -static int qcow_aio_read_cb(QCowAIOCB *acb) +static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) { -BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n; +int ret = 0, n; uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; +uint8_t *buf; +void *orig_buf; - redo: -if (acb-nb_sectors == 0) { -/* request completed */ -return 0; +if (qiov-niov 1) { +buf = orig_buf = qemu_blockalign(bs, qiov-size); +} else { +orig_buf = NULL; +buf = (uint8_t *)qiov-iov-iov_base; } -/* prepare next request */ -cluster_offset = get_cluster_offset(bs, acb-sector_num 9, - 0, 0, 0, 0); -index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -n = s-cluster_sectors - index_in_cluster; -if (n acb-nb_sectors) -n = acb-nb_sectors; +qemu_co_mutex_lock(s-lock); + +while (nb_sectors != 0) { +/* prepare next request */ +cluster_offset = get_cluster_offset(bs, sector_num 9, + 0, 0, 0, 0); +index_in_cluster = sector_num (s-cluster_sectors - 1); +n = 
s-cluster_sectors - index_in_cluster; +if (n nb_sectors) +n = nb_sectors; -if (!cluster_offset) { -if (bs-backing_hd) { -/* read from the base image */ -hd_iov.iov_base = (void *)acb-buf; +if (!cluster_offset) { +if (bs-backing_hd) { +/* read from the base image */ +hd_iov.iov_base = (void *)buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); +qemu_co_mutex_unlock(s-lock); +ret = bdrv_co_readv(bs-backing_hd, sector_num, +n, hd_qiov); +qemu_co_mutex_lock(s-lock); +if (ret 0) { +goto fail; +} +} else { +/* Note: in this case, no need to wait */ +memset(buf, 0, 512 * n); +} +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* add AIO support for compressed blocks ? */ +if (decompress_cluster(bs, cluster_offset) 0) { +goto fail; +} +memcpy(buf, + s-cluster_cache + index_in_cluster * 512, 512 * n); +} else { +if ((cluster_offset 511) != 0) { +goto fail; +} +hd_iov.iov_base = (void *)buf; hd_iov.iov_len = n * 512; qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); -ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, +ret = bdrv_co_readv(bs-file, +(cluster_offset 9) + index_in_cluster, n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { -return -EIO; +break; +} +if (s-crypt_method) { +encrypt_sectors(s, sector_num, buf, buf, +n, 0, +s-aes_decrypt_key); } -} else { -/* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * n); -} -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, cluster_offset) 0) { -return -EIO; -} -memcpy(acb-buf, - s-cluster_cache
[Qemu-devel] [PATCH 5/5] qcow: remove old #undefined code
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 63 -- 1 files changed, 0 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 2feb7d7..42331c7 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -190,24 +190,6 @@ static int qcow_set_key(BlockDriverState *bs, const char *key) return -1; if (AES_set_decrypt_key(keybuf, 128, s-aes_decrypt_key) != 0) return -1; -#if 0 -/* test */ -{ -uint8_t in[16]; -uint8_t out[16]; -uint8_t tmp[16]; -for(i=0;i16;i++) -in[i] = i; -AES_encrypt(in, tmp, s-aes_encrypt_key); -AES_decrypt(tmp, out, s-aes_decrypt_key); -for(i = 0; i 16; i++) -printf( %02x, tmp[i]); -printf(\n); -for(i = 0; i 16; i++) -printf( %02x, out[i]); -printf(\n); -} -#endif return 0; } @@ -441,51 +423,6 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) return 0; } -#if 0 - -static int qcow_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ -BDRVQcowState *s = bs-opaque; -int ret, index_in_cluster, n; -uint64_t cluster_offset; - -while (nb_sectors 0) { -cluster_offset = get_cluster_offset(bs, sector_num 9, 0, 0, 0, 0); -index_in_cluster = sector_num (s-cluster_sectors - 1); -n = s-cluster_sectors - index_in_cluster; -if (n nb_sectors) -n = nb_sectors; -if (!cluster_offset) { -if (bs-backing_hd) { -/* read from the base image */ -ret = bdrv_read(bs-backing_hd, sector_num, buf, n); -if (ret 0) -return -1; -} else { -memset(buf, 0, 512 * n); -} -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -if (decompress_cluster(bs, cluster_offset) 0) -return -1; -memcpy(buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -} else { -ret = bdrv_pread(bs-file, cluster_offset + index_in_cluster * 512, buf, n * 512); -if (ret != n * 512) -return -1; -if (s-crypt_method) { -encrypt_sectors(s, sector_num, buf, buf, n, 0, -s-aes_decrypt_key); -} -} -nb_sectors -= n; -sector_num += n; -buf += n * 512; -} -return 0; -} -#endif - static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num, int nb_sectors, QEMUIOVector *qiov) { -- 1.7.1
[Qemu-devel] [PATCH 3/5] qcow: move some blocks of code to avoid useless variable initialization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 53 ++--- 1 files changed, 26 insertions(+), 27 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index e52df91..015a472 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n = 0; -uint64_t cluster_offset = 0; +int ret, n; +uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -/* post process the read buffer */ -if (!cluster_offset) { -/* nothing to do */ -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -n, 0, -s-aes_decrypt_key); -} -} - -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ +/* prepare next request */ cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); @@ -572,7 +555,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } else { /* Note: in this case, no need to wait */ memset(acb-buf, 0, 512 * n); -goto redo; } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ @@ -581,7 +563,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } memcpy(acb-buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -goto redo; } else { if ((cluster_offset 511) != 0) { return -EIO; @@ -599,6 +580,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } } +/* post process the read buffer */ +if (!cluster_offset) { +/* nothing to do */ +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* nothing to do */ +} else { +if (s-crypt_method) { +encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, +n, 0, +s-aes_decrypt_key); +} +} + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } @@ -630,16 +628,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; -int ret, n = 0; +int ret, n; uint8_t *cluster_data = NULL; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; @@ -681,6 +675,11 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) if (ret 0) { return ret; } + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } -- 1.7.1
Re: [Qemu-devel] [RFC] qcow2: 2 way to improve performance updating refcount
2011/7/22 Kevin Wolf kw...@redhat.com: Am 21.07.2011 18:17, schrieb Frediano Ziglio: Hi, after a snapshot is taken currently many write operations are quite slow due to - refcount updates (decrement old and increment new ) - cluster allocation and file expansion - read-modify-write on partial clusters I found 2 way to improve refcount performance Method 1 - Lazy count Mainly do not take into account count for current snapshot, that is current snapshot counts as 0. This would require to add a current_snapshot in header and update refcount when current is changed. So for these operation - creating snapshot, performance are the same, just increment for old snapshot instead of the new one - normal write operations. As current snaphot counts as 0 there is not operations here so do not write any data - changing current snapshot, this is the worst case, you have to increment for the current snapshot and decrement for the new so it will take twice - deleting snapshot, if is the current just set current_snapshot to a dummy not existing value, if is not the current just decrement counters, no performance changes How would you do cluster allocation if you don't have refcounts any more that can tell you if a cluster is used or not? You have refcount, is only that current snapshot counts as 0. An example may help, start with A snapshot A counts as zero so all refcounts are 0, now we create a snapshot B and make it current so refcounts are 1 A --- B If you change a cluster in snapshot B counts are still 1. If you go back to A counters are increment (cause you leave B) and then decrement (cause you enter in A). Perhaps the problem is how to distinguish 0 from allocated in current and not allocated. Yes, with which I suppose above it's a problem, but we can easily use -1 as not allocated. If current and refcount 0 mark as -1, if not current we would have to increment counters of current, mark current as -1 than decrement for deleting, yes in this case you have twice the time. 
Method 2 - Read-only parent Here parents are readonly, instead of storing a refcount store a numeric id of the owner. If the owner is not current copy the cluster and change it. Considering this situation A --- B --- C B cannot be changed so in order to change B you have to create a new snapshot A --- B --- C \--- D and change D. It can take more space cause you have in this case an additional snapshot. Operations: - creating snapshot, really fast as you don't have to change any ownership - normal write operations. If owner is not the same allocate a new cluster and just store a new owner for new cluster. Also ownership for past-to-end cluster could be set all to current owner in order to collapse allocations - changing current snapshot, no changes required for owners - deleting snapshot. Only possible if you have no child or a single child. Will require to scan all l2 tables and merge and update owner. I think this has similar characteristics as we have with external snapshots (i.e. backing files). The advantage is that with applying it to internal snapshots is that when deleting a snapshot you don't have to copy around all the data. Probably this change could even be done transparently for the user, so that B still appears to be writeable, but in fact refers to D now. Anyway, have you checked how bad the refcount work really is? I think that writing the VM state takes a lot longer, so that optimising the refcount update may be the wrong approach, especially if it requires a format change. 
My results with qemu-img snapshot suggest that it's not worth it: kwolf@dhcp-5-188:~/images$ ~/source/qemu/qemu-img info scratch.qcow2 image: scratch.qcow2 file format: qcow2 virtual size: 8.0G (8589934592 bytes) disk size: 4.0G cluster_size: 65536 kwolf@dhcp-5-188:~/images$ time ~/source/qemu/qemu-img snapshot -c test scratch.qcow2 real 0m0.116s user 0m0.009s sys 0m0.040s kwolf@dhcp-5-188:~/images$ time ~/source/qemu/qemu-img snapshot -d test scratch.qcow2 real 0m0.084s user 0m0.011s sys 0m0.044s Kevin I'm not worried about time just taking snapshot more after taking snapshot during normal use. As you stated taking snapshot you can disable cache writethrough making it very fast but during normal operations you can't. Personally I'm pondering a log too to allow collapsing metadata updates. Even an external (another file) full log (with data) to try to reduce even overhead caused by read-modify-write during partial cluster updates and reduce file fragmentation. But as you can see from my patches I'm still exercising myself with Qemu code. Regards Frediano
Re: [Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup
2011/7/22 Kevin Wolf kw...@redhat.com: Am 20.07.2011 15:56, schrieb Frediano Ziglio: These patches mostly cleanup some AIO code using coroutines. These patches apply to Kevin's repository, branch coroutine-block. Mostly they use stack instead of allocated AIO structure. Frediano Ziglio (5): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization avoid dandling pointers qcow: small optimization initializing QCowAIOCB block/qcow.c | 210 + block/qcow2.c | 38 +++--- 2 files changed, 102 insertions(+), 146 deletions(-) Most of it looks good now. Did you include the RFC in the subject just because the coroutine work is in RFC state, too, or did you intend to tell me that I shouldn't merge yet? Kevin As these patches are first quite big patches I send (typo or small fixes do not counts) I just want to mark that I could write something really wrong. Just a way to avoid somebody having to send more patches and get more attention. Some projects are quite prone to merge even not that fine ones. I prefer to have some (a bit) pedantic comments and a real fix/improve. Now I removed the RFC from last update. The main reason is that I found your qemu-iotests repository which, I think should be merged to main repository, but it's just my opinion. Oh... qcow fails 004 test (even origin/coroutines-block) with a I/O error. I must say there are a lot of small hidden things that a developer should know about Qemu, for instance - mailing list follow some LKML rules as CC ML and send to maintainer to get more attention - you can use scripts/checkpatch.pl to check your patches before send I still have also to understand how to use git format-patch/send-email correctly and fluently :) Frediano
Re: [Qemu-devel] [RFC PATCH 4/5] avoid dandling pointers
2011/7/22 Kevin Wolf kw...@redhat.com: Am 20.07.2011 15:56, schrieb Frediano Ziglio: Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) Subject needs a qcow: ... Kevin Yes, now I removed that patch as with the argument on the stack it just makes little sense...
Re: [Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup
2011/7/22 Stefan Hajnoczi stefa...@gmail.com: On Fri, Jul 22, 2011 at 11:10 AM, Kevin Wolf kw...@redhat.com wrote: Am 22.07.2011 11:26, schrieb Frediano Ziglio: - you can use scripts/checkpatch.pl to check your patches before send I updated the SubmitAPatch wiki earlier this week. Stefan Good, now wiki is working (it seems somebody is attacking Qemu sites... yesterday the ML). http://git.qemu.org/git/qemu.git/ is not working so all links give 404. I added some notes, yes checkpatch was already in the page. About git commands to send multiple patches, I use git format-patch --cover-letter -s -M origin/original_branch_name --subject-prefix='PATCH vXX' -o outgoing/ edit manually cover letter and git send-email --to='maintainer@domain' --cc='qemu-devel@nongnu.org' outgoing/* are these command correct or there is a better way? Frediano
Re: [Qemu-devel] [RFC] qcow2: 2 way to improve performance updating refcount
2011/7/22 Stefan Hajnoczi stefa...@gmail.com: On Fri, Jul 22, 2011 at 10:13 AM, Frediano Ziglio fredd...@gmail.com wrote: 2011/7/22 Kevin Wolf kw...@redhat.com: Am 21.07.2011 18:17, schrieb Frediano Ziglio: Hi, after a snapshot is taken currently many write operations are quite slow due to - refcount updates (decrement old and increment new ) - cluster allocation and file expansion - read-modify-write on partial clusters I found 2 way to improve refcount performance Method 1 - Lazy count Mainly do not take into account count for current snapshot, that is current snapshot counts as 0. This would require to add a current_snapshot in header and update refcount when current is changed. So for these operation - creating snapshot, performance are the same, just increment for old snapshot instead of the new one - normal write operations. As current snaphot counts as 0 there is not operations here so do not write any data - changing current snapshot, this is the worst case, you have to increment for the current snapshot and decrement for the new so it will take twice - deleting snapshot, if is the current just set current_snapshot to a dummy not existing value, if is not the current just decrement counters, no performance changes How would you do cluster allocation if you don't have refcounts any more that can tell you if a cluster is used or not? You have refcount, is only that current snapshot counts as 0. An example may help, start with A snapshot A counts as zero so all refcounts are 0, now we create a snapshot B and make it current so refcounts are 1 A --- B If you change a cluster in snapshot B counts are still 1. If you go back to A counters are increment (cause you leave B) and then decrement (cause you enter in A). Perhaps the problem is how to distinguish 0 from allocated in current and not allocated. Yes, with which I suppose above it's a problem, but we can easily use -1 as not allocated. 
If current and refcount 0 mark as -1, if not current we would have to increment counters of current, mark current as -1 than decrement for deleting, yes in this case you have twice the time. I'm not sure I follow your last sentence but just having a different refcount value for not allocated vs allocated means allocating write requests will need to update refcounts. Now you have 0 for not allocated and 0 for allocated. If you assume current snapshot counting as 0 a 0 in refcount could mean an allocated cluster in current snapshot not shared with other snapshots and if you don't use -1 could be also a not allocated cluster. Thinking in another way is not that you don't update refcounts but you update refcounts with 0 addend (that's practically not changing refcounts). Question was: is possible to use this trick? But are non-append allocations common enough that we should bother with them in the allocating write path? Can we append to the end of the image file for allocating writes and handle defragmentation elsewhere (i.e. get rid of unallocated clusters in the middle of the file)? Stefan I think so but is better to have a way to know if a cluster is allocated without having to scan all l2 tables. Frediano
Re: [Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup
Il giorno ven, 22/07/2011 alle 12.10 +0200, Kevin Wolf ha scritto: Am 22.07.2011 11:26, schrieb Frediano Ziglio: 2011/7/22 Kevin Wolf kw...@redhat.com: Am 20.07.2011 15:56, schrieb Frediano Ziglio: These patches mostly cleanup some AIO code using coroutines. These patches apply to Kevin's repository, branch coroutine-block. Mostly they use stack instead of allocated AIO structure. Frediano Ziglio (5): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization avoid dandling pointers qcow: small optimization initializing QCowAIOCB block/qcow.c | 210 + block/qcow2.c | 38 +++--- 2 files changed, 102 insertions(+), 146 deletions(-) Most of it looks good now. Did you include the RFC in the subject just because the coroutine work is in RFC state, too, or did you intend to tell me that I shouldn't merge yet? Kevin As these patches are first quite big patches I send (typo or small fixes do not counts) I just want to mark that I could write something really wrong. Just a way to avoid somebody having to send more patches and get more attention. Some projects are quite prone to merge even not that fine ones. I prefer to have some (a bit) pedantic comments and a real fix/improve. Now I removed the RFC from last update. The main reason is that I found your qemu-iotests repository which, I think should be merged to main repository, but it's just my opinion. Oh... qcow fails 004 test (even origin/coroutines-block) with a I/O error. Yup, you're right, I must have messed it up. Care to fix it or should I look into it? Care but I don't know if I'll have time before Thursday. However I found the problem, really strange. bdrv_read returns 0 for errors 0 for success and... bytes read on partial read! Now a qcow image of 128m is 560 bytes so when you read sector 1 you get 48 which is not a problem for qcow code. 
But if you replace bdrv_read with a bdrv_co_readv (your latest patch on coroutine-block) bdrv_co_readv returns -EINVAL on partial read. Frediano
[Qemu-devel] [RFC] qcow2: 2 way to improve performance updating refcount
Hi, after a snapshot is taken currently many write operations are quite slow due to - refcount updates (decrement old and increment new ) - cluster allocation and file expansion - read-modify-write on partial clusters I found 2 way to improve refcount performance Method 1 - Lazy count Mainly do not take into account count for current snapshot, that is current snapshot counts as 0. This would require to add a current_snapshot in header and update refcount when current is changed. So for these operation - creating snapshot, performance are the same, just increment for old snapshot instead of the new one - normal write operations. As current snaphot counts as 0 there is not operations here so do not write any data - changing current snapshot, this is the worst case, you have to increment for the current snapshot and decrement for the new so it will take twice - deleting snapshot, if is the current just set current_snapshot to a dummy not existing value, if is not the current just decrement counters, no performance changes Method 2 - Read-only parent Here parents are readonly, instead of storing a refcount store a numeric id of the owner. If the owner is not current copy the cluster and change it. Considering this situation A --- B --- C B cannot be changed so in order to change B you have to create a new snapshot A --- B --- C \--- D and change D. It can take more space cause you have in this case an additional snapshot. Operations: - creating snapshot, really fast as you don't have to change any ownership - normal write operations. If owner is not the same allocate a new cluster and just store a new owner for new cluster. Also ownership for past-to-end cluster could be set all to current owner in order to collapse allocations - changing current snapshot, no changes required for owners - deleting snapshot. Only possible if you have no child or a single child. Will require to scan all l2 tables and merge and update owner. Same performance Regards Frediano Ziglio
[Qemu-devel] [RFC PATCH 2/5] use more stack
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 133 +++--- 1 files changed, 62 insertions(+), 71 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index d19ef04..cd1f9e3 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -487,20 +487,12 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, #endif typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; uint8_t *buf; void *orig_buf; int nb_sectors; -int n; -uint64_t cluster_offset; -uint8_t *cluster_data; -struct iovec hd_iov; -bool is_write; -QEMUBH *bh; -QEMUIOVector hd_qiov; -BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, @@ -508,11 +500,9 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int is_write, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; -acb-hd_aiocb = NULL; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; if (qiov-niov 1) { acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); @@ -522,37 +512,36 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -acb-n = 0; -acb-cluster_offset = 0; return acb; } static int qcow_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret; - -acb-hd_aiocb = NULL; +int ret, n = 0; +uint64_t cluster_offset = 0; +struct iovec hd_iov; +QEMUIOVector hd_qiov; redo: /* post process the read buffer */ -if (!acb-cluster_offset) { +if (!cluster_offset) { /* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* nothing to do */ } else { if (s-crypt_method) { encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -acb-n, 0, +n, 0, s-aes_decrypt_key); } } -acb-nb_sectors -= acb-n; -acb-sector_num += acb-n; -acb-buf += acb-n * 
512; +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; if (acb-nb_sectors == 0) { /* request completed */ @@ -560,57 +549,57 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } /* prepare next AIO request */ -acb-cluster_offset = get_cluster_offset(bs, acb-sector_num 9, +cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -acb-n = s-cluster_sectors - index_in_cluster; -if (acb-n acb-nb_sectors) -acb-n = acb-nb_sectors; +n = s-cluster_sectors - index_in_cluster; +if (n acb-nb_sectors) +n = acb-nb_sectors; -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n * 512; -qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1); +hd_iov.iov_base = (void *)acb-buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -acb-n, acb-hd_qiov); +n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return -EIO; } } else { /* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * acb-n); +memset(acb-buf, 0, 512 * n); goto redo; } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, acb-cluster_offset) 0) { +if (decompress_cluster(bs, cluster_offset) 0) { return -EIO; } memcpy(acb-buf, - s-cluster_cache + index_in_cluster * 512, 512 * acb-n); + s-cluster_cache + index_in_cluster * 512, 512 * n); goto redo; } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n * 512; -qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1); +hd_iov.iov_base = (void
[Qemu-devel] [RFC PATCH 5/5] qemu_aio_get used to clear all allocated buffer
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 007fb57..8fd1ee5 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -499,7 +499,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int is_write, QCowAIOCB *acb) { -memset(acb, 0, sizeof(*acb)); acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; @@ -509,6 +508,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, if (is_write) qemu_iovec_to_buffer(qiov, acb-buf); } else { +acb-orig_buf = NULL; acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -- 1.7.1
[Qemu-devel] [RFC PATCH 3/5] more stack work
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 53 ++--- 1 files changed, 26 insertions(+), 27 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index cd1f9e3..8ccd7d7 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n = 0; -uint64_t cluster_offset = 0; +int ret, n; +uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -/* post process the read buffer */ -if (!cluster_offset) { -/* nothing to do */ -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -n, 0, -s-aes_decrypt_key); -} -} - -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ +/* prepare next request */ cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); @@ -572,7 +555,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } else { /* Note: in this case, no need to wait */ memset(acb-buf, 0, 512 * n); -goto redo; } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ @@ -581,7 +563,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } memcpy(acb-buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -goto redo; } else { if ((cluster_offset 511) != 0) { return -EIO; @@ -599,6 +580,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } } +/* post process the read buffer */ +if (!cluster_offset) { +/* nothing to do */ +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* nothing to do */ +} else { +if (s-crypt_method) { +encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, +n, 0, +s-aes_decrypt_key); +} +} + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } @@ -630,16 +628,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; -int ret, n = 0; +int ret, n; uint8_t *cluster_data = NULL; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; @@ -681,6 +675,11 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) if (ret 0) { return ret; } + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } -- 1.7.1
[Qemu-devel] [RFC PATCH 0/5] Coroutines cleanup
These patches mostly clean up some AIO code using coroutines. These patches apply to Kevin's repository, branch coroutine-block. Mostly they use stack instead of allocated AIO structure. Frediano Ziglio (5): allocate AIO on stack use more stack more stack work avoid dangling pointers qemu_aio_get used to clear all allocated buffer block/qcow.c | 210 + block/qcow2.c | 38 +++--- 2 files changed, 102 insertions(+), 146 deletions(-)
[Qemu-devel] [RFC PATCH 1/5] allocate AIO on stack
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 52 block/qcow2.c | 38 +++--- 2 files changed, 27 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 6447c2a..d19ef04 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -503,28 +503,12 @@ typedef struct QCowAIOCB { BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; -static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -if (acb-hd_aiocb) -bdrv_aio_cancel(acb-hd_aiocb); -qemu_aio_release(acb); -} - -static AIOPool qcow_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow_aio_cancel, -}; - static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write) +int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-hd_aiocb = NULL; acb-sector_num = sector_num; acb-qiov = qiov; @@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static int qcow_aio_read_cb(void *opaque) +static int qcow_aio_read_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_read_cb(acb); +ret = qcow_aio_read_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size); 
+qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } -static int qcow_aio_write_cb(void *opaque) +static int qcow_aio_write_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; s-cluster_cache_offset = -1; /* disable compressed cache */ -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_write_cb(acb); +ret = qcow_aio_write_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index f07d550..edc068e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -388,17 +388,6 @@ typedef struct QCowAIOCB { QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; -static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -qemu_aio_release(acb); -} - -static AIOPool qcow2_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow2_aio_cancel, -}; - /* * Returns 0 when the request is completed successfully, 1 when there is still * a part left to do and -errno in error cases. 
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write) + void *opaque, int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; acb-is_write = is_write; @@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov
[Qemu-devel] [RFC PATCH 4/5] avoid dangling pointers
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 8ccd7d7..007fb57 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -616,6 +616,7 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, if (acb.qiov-niov 1) { qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size); qemu_vfree(acb.orig_buf); +acb.orig_buf = NULL; } return ret; @@ -700,6 +701,7 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, if (acb.qiov-niov 1) { qemu_vfree(acb.orig_buf); +acb.orig_buf = NULL; } return ret; -- 1.7.1
[Qemu-devel] [PATCH] removed unused function
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block.c | 13 - block.h |2 -- 2 files changed, 0 insertions(+), 15 deletions(-) diff --git a/block.c b/block.c index 24a25d5..b879d2f 100644 --- a/block.c +++ b/block.c @@ -1108,19 +1108,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return 0; } -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num, -const uint8_t *buf, int nb_sectors) -{ -return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num, -buf, BDRV_SECTOR_SIZE * nb_sectors); -} - /** * Truncate file to 'offset' bytes (needed only for file protocols) */ diff --git a/block.h b/block.h index 859d1d9..6f3e5d0 100644 --- a/block.h +++ b/block.h @@ -85,8 +85,6 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset, const void *buf, int count); int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, const void *buf, int count); -int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num, -const uint8_t *buf, int nb_sectors); int bdrv_truncate(BlockDriverState *bs, int64_t offset); int64_t bdrv_getlength(BlockDriverState *bs); void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); -- 1.7.1
Re: [Qemu-devel] [RFC PATCH 5/5] qemu_aio_get used to clear all allocated buffer
2011/7/20 Kevin Wolf kw...@redhat.com: Am 20.07.2011 09:57, schrieb Frediano Ziglio: Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 007fb57..8fd1ee5 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -499,7 +499,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int is_write, QCowAIOCB *acb) { - memset(acb, 0, sizeof(*acb)); acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; @@ -509,6 +508,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, if (is_write) qemu_iovec_to_buffer(qiov, acb-buf); } else { + acb-orig_buf = NULL; acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; What does this fix? Removing the memset looks like changing code for no obvious reason. Is there any state in acb that must survive qcow_aio_setup? Kevin No, this is not a fix, just an optimization, memset was used to clear entire structure but all fields are set by qcow_aio_setup, only orig_buf left. The comment, I must admit is terrible! And it's mine! I'll rewrite all comments, perhaps merge some commits and send all patches again. Frediano
[Qemu-devel] [RFC PATCH 5/5] qcow: small optimization initializing QCowAIOCB
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 007fb57..8fd1ee5 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -499,7 +499,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int is_write, QCowAIOCB *acb) { -memset(acb, 0, sizeof(*acb)); acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; @@ -509,6 +508,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, if (is_write) qemu_iovec_to_buffer(qiov, acb-buf); } else { +acb-orig_buf = NULL; acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -- 1.7.1
[Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup
These patches mostly clean up some AIO code using coroutines. These patches apply to Kevin's repository, branch coroutine-block. Mostly they use stack instead of allocated AIO structure. Frediano Ziglio (5): qcow: allocate QCowAIOCB structure using stack qcow: QCowAIOCB field cleanup qcow: move some blocks of code to avoid useless variable initialization avoid dangling pointers qcow: small optimization initializing QCowAIOCB block/qcow.c | 210 + block/qcow2.c | 38 +++--- 2 files changed, 102 insertions(+), 146 deletions(-)
[Qemu-devel] [RFC PATCH 1/5] qcow: allocate QCowAIOCB structure using stack
instead of calling qemi_aio_get use stack Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 52 block/qcow2.c | 38 +++--- 2 files changed, 27 insertions(+), 63 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 6447c2a..d19ef04 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -503,28 +503,12 @@ typedef struct QCowAIOCB { BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; -static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -if (acb-hd_aiocb) -bdrv_aio_cancel(acb-hd_aiocb); -qemu_aio_release(acb); -} - -static AIOPool qcow_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow_aio_cancel, -}; - static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, -int is_write) +int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-hd_aiocb = NULL; acb-sector_num = sector_num; acb-qiov = qiov; @@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, return acb; } -static int qcow_aio_read_cb(void *opaque) +static int qcow_aio_read_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_read_cb(acb); +ret = qcow_aio_read_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_iovec_from_buffer(acb.qiov, 
acb.orig_buf, acb.qiov-size); +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } -static int qcow_aio_write_cb(void *opaque) +static int qcow_aio_write_cb(QCowAIOCB *acb) { -QCowAIOCB *acb = opaque; BlockDriverState *bs = acb-common.bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; @@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BDRVQcowState *s = bs-opaque; -QCowAIOCB *acb; +QCowAIOCB acb; int ret; s-cluster_cache_offset = -1; /* disable compressed cache */ -acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1); +qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb); qemu_co_mutex_lock(s-lock); do { -ret = qcow_aio_write_cb(acb); +ret = qcow_aio_write_cb(acb); } while (ret 0); qemu_co_mutex_unlock(s-lock); -if (acb-qiov-niov 1) { -qemu_vfree(acb-orig_buf); +if (acb.qiov-niov 1) { +qemu_vfree(acb.orig_buf); } -qemu_aio_release(acb); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index f07d550..edc068e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -388,17 +388,6 @@ typedef struct QCowAIOCB { QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; -static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb) -{ -QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); -qemu_aio_release(acb); -} - -static AIOPool qcow2_aio_pool = { -.aiocb_size = sizeof(QCowAIOCB), -.cancel = qcow2_aio_cancel, -}; - /* * Returns 0 when the request is completed successfully, 1 when there is still * a part left to do and -errno in error cases. 
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb) static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque, int is_write) + void *opaque, int is_write, QCowAIOCB *acb) { -QCowAIOCB *acb; - -acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque); -if (!acb) -return NULL; +memset(acb, 0, sizeof(*acb)); +acb-common.bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; acb-is_write = is_write; @@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int
[Qemu-devel] [RFC PATCH 4/5] avoid dangling pointers
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 8ccd7d7..007fb57 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -616,6 +616,7 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, if (acb.qiov-niov 1) { qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size); qemu_vfree(acb.orig_buf); +acb.orig_buf = NULL; } return ret; @@ -700,6 +701,7 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, if (acb.qiov-niov 1) { qemu_vfree(acb.orig_buf); +acb.orig_buf = NULL; } return ret; -- 1.7.1
[Qemu-devel] [RFC PATCH 2/5] qcow: QCowAIOCB field cleanup
remove unused field from this structure and put some of them in qcow_aio_read_cb and qcow_aio_write_cb Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 133 +++--- 1 files changed, 62 insertions(+), 71 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index d19ef04..cd1f9e3 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -487,20 +487,12 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, #endif typedef struct QCowAIOCB { -BlockDriverAIOCB common; +BlockDriverState *bs; int64_t sector_num; QEMUIOVector *qiov; uint8_t *buf; void *orig_buf; int nb_sectors; -int n; -uint64_t cluster_offset; -uint8_t *cluster_data; -struct iovec hd_iov; -bool is_write; -QEMUBH *bh; -QEMUIOVector hd_qiov; -BlockDriverAIOCB *hd_aiocb; } QCowAIOCB; static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, @@ -508,11 +500,9 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, int is_write, QCowAIOCB *acb) { memset(acb, 0, sizeof(*acb)); -acb-common.bs = bs; -acb-hd_aiocb = NULL; +acb-bs = bs; acb-sector_num = sector_num; acb-qiov = qiov; -acb-is_write = is_write; if (qiov-niov 1) { acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size); @@ -522,37 +512,36 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, acb-buf = (uint8_t *)qiov-iov-iov_base; } acb-nb_sectors = nb_sectors; -acb-n = 0; -acb-cluster_offset = 0; return acb; } static int qcow_aio_read_cb(QCowAIOCB *acb) { -BlockDriverState *bs = acb-common.bs; +BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret; - -acb-hd_aiocb = NULL; +int ret, n = 0; +uint64_t cluster_offset = 0; +struct iovec hd_iov; +QEMUIOVector hd_qiov; redo: /* post process the read buffer */ -if (!acb-cluster_offset) { +if (!cluster_offset) { /* nothing to do */ -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* nothing to do */ } else { if (s-crypt_method) { encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -acb-n, 0, 
+n, 0, s-aes_decrypt_key); } } -acb-nb_sectors -= acb-n; -acb-sector_num += acb-n; -acb-buf += acb-n * 512; +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; if (acb-nb_sectors == 0) { /* request completed */ @@ -560,57 +549,57 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } /* prepare next AIO request */ -acb-cluster_offset = get_cluster_offset(bs, acb-sector_num 9, +cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); -acb-n = s-cluster_sectors - index_in_cluster; -if (acb-n acb-nb_sectors) -acb-n = acb-nb_sectors; +n = s-cluster_sectors - index_in_cluster; +if (n acb-nb_sectors) +n = acb-nb_sectors; -if (!acb-cluster_offset) { +if (!cluster_offset) { if (bs-backing_hd) { /* read from the base image */ -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n * 512; -qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1); +hd_iov.iov_base = (void *)acb-buf; +hd_iov.iov_len = n * 512; +qemu_iovec_init_external(hd_qiov, hd_iov, 1); qemu_co_mutex_unlock(s-lock); ret = bdrv_co_readv(bs-backing_hd, acb-sector_num, -acb-n, acb-hd_qiov); +n, hd_qiov); qemu_co_mutex_lock(s-lock); if (ret 0) { return -EIO; } } else { /* Note: in this case, no need to wait */ -memset(acb-buf, 0, 512 * acb-n); +memset(acb-buf, 0, 512 * n); goto redo; } -} else if (acb-cluster_offset QCOW_OFLAG_COMPRESSED) { +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ -if (decompress_cluster(bs, acb-cluster_offset) 0) { +if (decompress_cluster(bs, cluster_offset) 0) { return -EIO; } memcpy(acb-buf, - s-cluster_cache + index_in_cluster * 512, 512 * acb-n); + s-cluster_cache + index_in_cluster * 512, 512 * n); goto redo; } else { -if ((acb-cluster_offset 511) != 0) { +if ((cluster_offset 511) != 0) { return -EIO; } -acb-hd_iov.iov_base = (void *)acb-buf; -acb-hd_iov.iov_len = acb-n
[Qemu-devel] [RFC PATCH 3/5] qcow: move some blocks of code to avoid useless variable initialization
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c | 53 ++--- 1 files changed, 26 insertions(+), 27 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index cd1f9e3..8ccd7d7 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) BlockDriverState *bs = acb-bs; BDRVQcowState *s = bs-opaque; int index_in_cluster; -int ret, n = 0; -uint64_t cluster_offset = 0; +int ret, n; +uint64_t cluster_offset; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -/* post process the read buffer */ -if (!cluster_offset) { -/* nothing to do */ -} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { -/* nothing to do */ -} else { -if (s-crypt_method) { -encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, -n, 0, -s-aes_decrypt_key); -} -} - -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; } -/* prepare next AIO request */ +/* prepare next request */ cluster_offset = get_cluster_offset(bs, acb-sector_num 9, 0, 0, 0, 0); index_in_cluster = acb-sector_num (s-cluster_sectors - 1); @@ -572,7 +555,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } else { /* Note: in this case, no need to wait */ memset(acb-buf, 0, 512 * n); -goto redo; } } else if (cluster_offset QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? 
*/ @@ -581,7 +563,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } memcpy(acb-buf, s-cluster_cache + index_in_cluster * 512, 512 * n); -goto redo; } else { if ((cluster_offset 511) != 0) { return -EIO; @@ -599,6 +580,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb) } } +/* post process the read buffer */ +if (!cluster_offset) { +/* nothing to do */ +} else if (cluster_offset QCOW_OFLAG_COMPRESSED) { +/* nothing to do */ +} else { +if (s-crypt_method) { +encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf, +n, 0, +s-aes_decrypt_key); +} +} + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } @@ -630,16 +628,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) int index_in_cluster; uint64_t cluster_offset; const uint8_t *src_buf; -int ret, n = 0; +int ret, n; uint8_t *cluster_data = NULL; struct iovec hd_iov; QEMUIOVector hd_qiov; redo: -acb-nb_sectors -= n; -acb-sector_num += n; -acb-buf += n * 512; - if (acb-nb_sectors == 0) { /* request completed */ return 0; @@ -681,6 +675,11 @@ static int qcow_aio_write_cb(QCowAIOCB *acb) if (ret 0) { return ret; } + +acb-nb_sectors -= n; +acb-sector_num += n; +acb-buf += n * 512; + goto redo; } -- 1.7.1
[Qemu-devel] [PATCH] avoid core reading with bdrv_read (qemu-io)
This patch apply to kevin coroutine-block branch and avoid code. It fix qcow: Use coroutines patch. Test case: $ ./qemu-img create -f qcow aaa.img 1G Formatting 'aaa.img', fmt=qcow size=1073741824 encryption=off $ ./qemu-io aaa.img qemu-io read 1024 1024 Segmentation fault Signed-off-by: Frediano Ziglio fredd...@gmail.com --- block/qcow.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/block/qcow.c b/block/qcow.c index 6f7973c..1386e92 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -573,7 +573,8 @@ static int qcow_aio_read_cb(void *opaque) if (acb-nb_sectors == 0) { /* request completed */ -qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); +if (acb-orig_buf) +qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); return 0; } @@ -648,6 +649,7 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, if (acb-qiov-niov 1) { qemu_vfree(acb-orig_buf); +acb-orig_buf = NULL; } qemu_aio_release(acb); @@ -729,6 +731,7 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, if (acb-qiov-niov 1) { qemu_vfree(acb-orig_buf); +acb-orig_buf = NULL; } qemu_aio_release(acb); -- 1.7.1
[Qemu-devel] coroutines and block I/O considerations
Hi, I'm exercising myself in the block I/O layer and I decided to test the coroutine branch because I find it easier to use than the normal callback approach. Looking at the normal code, there are a lot of lines in the source to save/restore state and declare callbacks, and it is not that easy to understand the normal flow. In the end I would like to create a new image format to get rid of some performance problems I encounter using writethrough and snapshots. I have some questions regarding block I/O and also coroutines. 1- threading model. I don't understand it. I can see that the aio pool routines do not contain locking code, so I think the aio layer is mainly executed in a single thread. I saw the introduction of some locking using coroutines, so I think coroutines are now called from different threads and need locks (the current implementation serializes all device operations). 2- memory considerations on coroutines. Besides coroutines allowing more readable code, I wonder if somebody has considered memory. For every coroutine a different stack has to be allocated. For instance, the ucontext and win32 implementations use 4mb. Assuming 128 concurrent AIO requests, this requires about 512mb of RAM (mostly only committed but not used, and coroutines are reused). About snapshots and block I/O, I think that using external snapshots would help make some things easier. By external snapshot I mean creating a new image with the backing file set to the current image file and using this new image for future operations. This would allow, for instance: - supporting snapshots with every format (even raw) - making snapshot backups using external programs (even from different hosts using a clustered file system and without many locking issues, as the original image is now read-only) - converting images live (just snapshot, qemu-img convert, remove snapshot) Regards Frediano
Re: [Qemu-devel] [PATCH] avoid core reading with bdrv_read (qemu-io)
2011/7/19 Kevin Wolf kw...@redhat.com: Am 19.07.2011 09:33, schrieb Frediano Ziglio: This patch apply to kevin coroutine-block branch and avoid code. It fix qcow: Use coroutines patch. Test case: $ ./qemu-img create -f qcow aaa.img 1G Formatting 'aaa.img', fmt=qcow size=1073741824 encryption=off $ ./qemu-io aaa.img qemu-io read 1024 1024 Segmentation fault Signed-off-by: Frediano Ziglio fredd...@gmail.com Thanks for the report. I'll update the patch, but in a slightly different way that matches the old code better: diff --git a/block/qcow.c b/block/qcow.c index 6f7973c..6447c2a 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -573,7 +573,6 @@ static int qcow_aio_read_cb(void *opaque) if (acb-nb_sectors == 0) { /* request completed */ - qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); return 0; } @@ -647,6 +646,7 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, qemu_co_mutex_unlock(s-lock); if (acb-qiov-niov 1) { + qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size); qemu_vfree(acb-orig_buf); } qemu_aio_release(acb); Yes, my patch also removed some dandling pointer which I don't like but are not a problem with current code. In case of ret 0 (error) your code could copy data that probably are not initialized. I don't know if data is used in case of failure but in case memory is shared with guest (I don't know, perhaps using virtio) this lead to security issues. Also some memory debugger like valgrind could not like that copy. Frediano
Re: [Qemu-devel] External COW format for raw images
2011/7/19 Robert Wang wdon...@linux.vnet.ibm.com: As you known, raw image is very popular,but the raw image format does NOT support Copy-On-Write,a raw image file can NOT be used as a copy destination, then image streaming/Live Block Copy will NOT work. To fix this, we need to add a new block driver raw-cow to QEMU. If finished, we can use qemu-img like this: qemu-img create -f raw-cow -o backing_file=ubuntu.img,raw_file=my_vm.img my_vm.raw-cow 1) ubuntu.img is the backing file, my_vm.img is a raw file, my_vm.raw-cow stores a COW bitmap related to my_vm.img. 2) If the entire COW bitmap is set to dirty flag then we can get all information from my_vm.img and can ignore ubuntu.img and my_vm.raw-cow from now. To implement this, I think I can follow these steps: 1) Add a new member to BlockDriverState struct: char raw_file[1024]; This member will track raw_file parameter related to raw-cow file from command line. 2) * Create a new file block/raw-cow.c. It will be much more like the mixture of block/cow.c and block/raw.c. So I will change some functions in cow.c and raw.c to none-static, then raw-cow.c can re-use them. When read operation occurs, determine whether dirty flag in raw-cow image is set. If true, read directly from the raw file. After write operation, set related dirty flag in raw-cow image. And other functions might also be modified. * Of course, format_name member of BlockDriver struct will be raw-cow. And in order to keep relationship with raw file( like my_vm.img) , raw_cow_header struct should be struct raw_cow_header { uint32_t magic; uint32_t version; char backing_file[1024]; char raw_file[1024];/* added*/ int32_t mtime; uint64_t size; uint32_t sectorsize; }; * Struct raw_cow_create_options should be one member plus based on cow_create_options: { .name = BLOCK_OPT_RAW_FILE, .type = OPT_STRING, .help = Raw file name }, 3) Add bdrv_get_raw_filename in img_info function of qemu-img.c. 
In bdrv_get_raw_filename, if the format of the image file is raw-cow, print the related raw file. Do you think my approach is right? Thank you. I don't understand if you mean just a way to track clusters/sectors changed or a new way to implement snapshotting, something that writing data just store original to cow like like: normal backfile is allocated on image write to image else allocate on image copy from backing to image write to image (patch before previous write) cow-raw (inverse backfile) is allocated on image write to backing else allocate on image copy from backing to image write to backing that is is not allocated on image allocate on image copy from backing to image is normal backing write to image else write to backing Frediano
[Qemu-devel] Possible leak in block/qcow.c
Hi, I noted that there are two cluster_data members in block/qcow.c, one in BDRVQcowState, the other in QCowAIOCB. The latter is used in qcow_aio_write_cb to hold a buffer for encrypting the cluster before writing, but I cannot find any related qemu_free, while I can find many places where BDRVQcowState::cluster_data is freed. It looks like a leak to me, but I don't understand why nobody has reported this problem before (it should happen on every write, so anybody using encrypted qcow should see this problem rapidly). Perhaps there is some sort of garbage collector I'm not aware of? Frediano Ziglio
Re: [Qemu-devel] Default cache mode
2011/6/29 Anthony Liguori anth...@codemonkey.ws: On 06/29/2011 07:16 AM, Kevin Wolf wrote: Am 29.06.2011 14:06, schrieb Anthony Liguori: On 06/29/2011 06:59 AM, Kevin Wolf wrote: Hi, I think we have touched this topic before during some IRC discussions or somewhere deep in a mailing list thread, but I think it hasn't been discussed on the list. Our default cache mode of cache=writethrough is extremely conservative and provides absolute safety at the cost of performance, But for the most part, we track bare metal fairly well in terms of block performance, no? Or are you really referring to qcow2 as a specific example? In the past, we used a different default caching mode for qcow2. I think that could be done again if there was a compelling reason. No, people are also complaining about bad performance with raw. Which isn't really surprising when you do a flush after each single write request. O_SYNC is really much more than is needed in the average case. Which file system on the host? At any rate, I'm a big fan of making wce tunable in the guest and then I think setting wce=1 is quite reasonable to do by default. Regards, Anthony Liguori Kevin Personally I think default lead to poor performance but currently people know that even critical application are handled correctly. Assuming a client connect to a database server on a qemu guest when server reply transaction ok you know that even a host crash lead to a successful transaction. At least the fact that this assumption won't be true has to be stated. Lately I'm doing some extensive testing on image performance and I noted that are not only image dependent but also filesystem (where image is stored) dependent but when data are allocated speed is good. raw disks are not so fast as expected? Try to use fallocate command to allocate the file than do a dd to fill the entire file with zeros and writethrough will perform very well. 
Allocating a raw image with qemu-img leads to a normal sparse file which, when written with the O_DSYNC flag, is quite slow due to extent allocation and fragmentation. Regards Frediano Ziglio
Re: [Qemu-devel] [RFC PATCH v2] Specification for qcow2 version 3
2011/6/27 Kevin Wolf kw...@redhat.com: This is the second draft for what I think could be added when we increase qcow2's version number to 3. This includes points that have been made by several people over the past few months. We're probably not going to implement this next week, but I think it's important to get discussions started early, so here it is. Changes implemented in this RFC: - Added compatible/incompatible/auto-clear feature bits plus an optional feature name table to allow useful error messages even if an older version doesn't know some feature at all. - Added a dirty flag which tells that the refcount may not be accurate (QED mode). This means that we can save writes to the refcount table with cache=writethrough, but isn't really useful otherwise since Qcow2Cache. - Configurable refcount width. If you don't want to use internal snapshots, make refcounts one bit and save cache space and I/O. - Added subclusters. This separate the COW size (one subcluster, I'm thinking of 64k default size here) from the allocation size (one cluster, 2M). Less fragmentation, less metadata, but still reasonable COW granularity. This also allows to preallocate clusters, but none of their subclusters. You can have an image that is like raw + COW metadata, and you can also preallocate metadata for images with backing files. - Zero cluster flags. This allows discard even with a backing file that doesn't contain zeros. It is also useful for copy-on-read/image streaming, as you'll want to keep sparseness without accessing the remote image for an unallocated cluster all the time. - Fixed internal snapshot metadata to use 64 bit VM state size. You can't save a snapshot of a VM with = 4 GB RAM today. Possible future additions: - Add per-L2-table dirty flag to L1? - Add per-refcount-block full flag to refcount table? 
Hi, thinking about image improvement I would add - GUID for image and backing file - relative path for backing file This would help finding images in a distributed environment or if file are moved, ie: gfs/nfs/ocfs mounted in different mount points, backing used a template in a different images directory and move this directory somewhere else. Also with GUID a possible higher level could manage a GUID - file image db. I was also think about a backing file length field to support resizing but probably can be implemented with zero cluster. Assume you have a image of 5gb, create a new image with first image as backing one, now resize second image from 5gb to 3gb then resize it again (after some works) to 10gb, part from 3gb to 5gb should not be read from backing file. Also a bit in l2 offset to say there is no l2 table cause all clusters in l2 are contiguous so we avoid entirely l2. Obviously this require an optimization step to detect or create such condition. For check perhaps it would be helpful to save not only a flag but also a size where data are ok (for instance already allocated and with refcount saved correctly). A possible optimization for refcount would be to initialize refcount to 1 instead of 0. When clusters are allocated at end-of-file this would not require refcount change and would be easy to check file size to see which clusters are marked as allocated but not present. Fields for sectors and heads to support old CHS systems ?? This mail sound quite strange to me, I thought qed would be the future of qcow2 but I must be really wrong. I think a big limit for current qed and qcow2 implementation is the serialization of metadata informations (qcow2 use synchronous operation while qed use a queue). I used bonnie++ program to test speed and performances allocating data is about 15-20% of allocated one. I'm working (in the few spare time I have) improving it. VirtualBox and ESX use large clusters (1mb) to mitigate allocation/metadata problem. 
Perhaps raising the default cluster size would help change the widespread perception of bad qemu I/O performance. Regards Frediano Ziglio
[Qemu-devel] [PATCH 1/1] fix operator precedence
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- cmd.c |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd.c b/cmd.c index db2c9c4..ecca167 100644 --- a/cmd.c +++ b/cmd.c @@ -486,7 +486,7 @@ timestr( snprintf(ts, size, %u:%02u.%02u, (unsigned int) MINUTES(tv-tv_sec), (unsigned int) SECONDS(tv-tv_sec), - (unsigned int) usec * 100); + (unsigned int) (usec * 100)); return; } format |= VERBOSE_FIXED_TIME; /* fallback if hours needed */ @@ -497,9 +497,9 @@ timestr( (unsigned int) HOURS(tv-tv_sec), (unsigned int) MINUTES(tv-tv_sec), (unsigned int) SECONDS(tv-tv_sec), - (unsigned int) usec * 100); + (unsigned int) (usec * 100)); } else { - snprintf(ts, size, 0.%04u sec, (unsigned int) usec * 1); + snprintf(ts, size, 0.%04u sec, (unsigned int) (usec * 1)); } } -- 1.7.1
[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.
Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). This patch is required for Darwin. Patch has been tested under FreeBSD, Darwin and Linux. --- hw/rtl8139.c | 139 +++--- 1 files changed, 84 insertions(+), 55 deletions(-) diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..72e2242 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2010-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -60,9 +64,6 @@ /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 -/* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 - #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h @@ -491,9 +492,12 @@ typedef struct RTL8139State { /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; } RTL8139State; +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); 
rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2755,46 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +int64_t pci_time, next_time; +uint32_t low_pci; + +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2840,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); +} 
break; default: @@ -3000,7 +3059,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t addr) break; case Timer: -ret = s-TCTR; +ret
Re: [Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.
2010/2/20 Igor Kovalenko igor.v.kovale...@gmail.com: On Sat, Feb 20, 2010 at 2:26 AM, Anthony Liguori anth...@codemonkey.ws wrote: On 02/19/2010 04:22 PM, Alexander Graf wrote: On 19.02.2010, at 22:50, Anthony Liguori wrote: On 02/16/2010 02:35 AM, Frediano Ziglio wrote: Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). This patch is required for Darwin. Patch has been tested under FreeBSD, Darwin and Linux. Signed-off-by: Frediano Zigliofredd...@gmail.com Alex, I know you've played with this feature for OS X in the past. Is this something we should be enabling unconditionally? I've played with it and realized that 1) OSX's rtl8139 driver uses this timer instead of tx/rx interrupts to process the queue 2) it was broken I haven't gotten around to test if this patch makes it work, but I'd suppose so. So yes, I think it should be enabled by default. It's a feature real hw has, so we should emulate it. The thing that concerns me is why it was disabled by default. Igor, what was the reasoning for disabling it if you recall? It was very inefficient pci timer implementation which fired callback frequently. Once it was found that (freebsd?) was relying on pci timer in realtek driver to communicate we just added missing piece. Now this patch fixes that inefficiency issue. I'm not that use with this ML way... sorry if somebody will receive a lot of SPAM... I updated my patch with Anthony suggestions (just code style). Previous code (current git) issue a timer every pci tick (about 33 millions per second!) in order to compute correctly TCTR. This patch enable QEMU timers only when strictly needed (for interrupt purposes) computing TCTR when needed (only on driver read). I tested with patch under Linux, FreeBSD and Darwin (Mac OS X) and it works correctly. I also bought a physical card to test the behaviour with a small test program and test program work with emulated code too (previously code does not). 
I sent the test program but I don't know if is possible to integrate it into QEMU code (is a standalone test). Frediano Ziglio
[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.
Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). This patch is required for Darwin. Patch has been tested under FreeBSD, Darwin and Linux. --- hw/rtl8139.c | 139 +++--- 1 files changed, 84 insertions(+), 55 deletions(-) diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..72e2242 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2010-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -60,9 +64,6 @@ /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 -/* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 - #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h @@ -491,9 +492,12 @@ typedef struct RTL8139State { /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; } RTL8139State; +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); 
rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2755,46 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +int64_t pci_time, next_time; +uint32_t low_pci; + +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2840,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); +} 
break; default: @@ -3000,7 +3059,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t addr) break; case Timer: -ret = s-TCTR; +ret
[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.
Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). This patch is required for Darwin. Patch has been tested under FreeBSD, Darwin and Linux. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- hw/rtl8139.c | 139 +++--- 1 files changed, 84 insertions(+), 55 deletions(-) diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..72e2242 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2010-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -60,9 +64,6 @@ /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 -/* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 - #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h @@ -491,9 +492,12 @@ typedef struct RTL8139State { /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; } RTL8139State; +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2755,46 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +int64_t pci_time, next_time; +uint32_t low_pci; + +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2840,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val; 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); +} break; default: @@ -3000,7 +3059,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t addr) break; case Timer
[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.
Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). This patch is required for Darwin. Patch has been tested under FreeBSD, Darwin and Linux. Signed-off-by: Frediano Ziglio fredd...@gmail.com --- hw/rtl8139.c | 136 ++--- 1 files changed, 81 insertions(+), 55 deletions(-) diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..e43c15c 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2010-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -60,9 +64,6 @@ /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 -/* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 - #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h @@ -491,9 +492,12 @@ typedef struct RTL8139State { /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; } RTL8139State; +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2755,43 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +uint32_t low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2837,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val; 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); +} break; default: @@ -3000,7 +3056,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t addr) break; case Timer: -ret = s-TCTR
[Qemu-devel] [PATCH] Rewrite timer implementation for rtl8139. Add a QEMU timer only when needed (timeout status not set, timeout IRQ wanted and timer set).
Signed-off-by: Frediano Ziglio fredd...@gmail.com --- hw/rtl8139.c | 136 ++--- 1 files changed, 81 insertions(+), 55 deletions(-) diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..e43c15c 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2010-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -60,9 +64,6 @@ /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 -/* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 - #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h @@ -491,9 +492,12 @@ typedef struct RTL8139State { /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; } RTL8139State; +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, 
qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2755,43 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +uint32_t low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2837,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); +} break; default: @@ -3000,7 +3056,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t addr) break; case Timer: -ret = s-TCTR; +ret = 
muldiv64(qemu_get_clock(vm_clock) - s-TCTR_base, + PCI_FREQUENCY, get_ticks_per_sec()); DEBUG_PRINT((RTL8139: TCTR Timer read val=0x%08x
[Qemu-devel] [PATCH] rtl8139 timer interrupt rewrite
rewrote timer implementation for rtl8139. Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). Signed-off-by: Frediano Ziglio fredd...@gmail.com --- hw/rtl8139.c | 136 ++--- 1 files changed, 81 insertions(+), 55 deletions(-) diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..3b332a6 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2009-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -60,9 +64,6 @@ /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 -/* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 - #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h @@ -491,9 +492,12 @@ typedef struct RTL8139State { /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; } RTL8139State; +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); 
rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2755,43 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +uint32_t low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2837,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); +} break; default: @@ -3000,7 
+3056,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t addr) break; case Timer: -ret = s-TCTR; +ret = muldiv64(qemu_get_clock(vm_clock) - s-TCTR_base
Re: [Qemu-devel] [PATCH] rtl8139 timer interrupt rewrite
2010/2/8 Igor Kovalenko igor.v.kovale...@gmail.com: On Sun, Feb 7, 2010 at 6:22 PM, Frediano Ziglio fredd...@gmail.com wrote: rewrote timer implementation for rtl8139. Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). Signed-off-by: Frediano Ziglio fredd...@gmail.com -- diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..0d877b7 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2009-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -61,7 +65,7 @@ #define RTL8139_CALCULATE_RXCRC 1 /* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 +#define RTL8139_ONBOARD_TIMER 1 Please remove this macro. Removed (see updated patch) +static void rtl8139_pre_save(void *opaque) +{ +RTL8139State* s = opaque; + +// set IntrStatus correctly +int64_t current_time = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, current_time); +s-TCTR = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, + get_ticks_per_sec()); +} + Seems like TCTR is not used after restoring from saved state. Is it intentional? Yes, wanted, mainly TCTR is computed from TCTR_base and tick counts so is currently quite unused but is computed for compatibility with previous state. Can you please check if freebsd works with this change? Yes, it works correctly. I don't know FreeBSD that much but I downloaded latest 8.0 live version, got a shell, configured with network manually with old ifconfig and tried some data exchange (mainly scp/ssh). Regards Frediano Ziglio
[Qemu-devel] Updated rtl8139 timer interrupt rewrite
Hi, I rewrote timer implementation for this card. I wrote even a small Linux guest test program (attached as main.c). This patch add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). I tested this patch with a Darwin system and with my test program (I bought a compatible physical card and it works too). This patch replace previous (I updated just style). Regards Frediano Ziglio diff --git a/hw/rtl8139.c b/hw/rtl8139.c index f04dd54..0d877b7 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -41,6 +41,10 @@ * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2009-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only + * when strictly needed (required for for + * Darwin) */ #include hw.h @@ -61,7 +65,7 @@ #define RTL8139_CALCULATE_RXCRC 1 /* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 +#define RTL8139_ONBOARD_TIMER 1 #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ @@ -489,11 +493,20 @@ typedef struct RTL8139State { intcplus_txbuffer_len; intcplus_txbuffer_offset; +#ifdef RTL8139_ONBOARD_TIMER /* PCI interrupt timer */ QEMUTimer *timer; +int64_t TimerExpire; +#endif } RTL8139State; +#ifdef RTL8139_ONBOARD_TIMER +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); +#else +#define rtl8139_set_next_tctr_time(s,t) do { ; } while(0) +#endif + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); @@ -2522,7 +2535,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) @@ -2555,12 +2570,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val) rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an 
interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was + * done before testing on previous rtl8139_update_irq lead to IRQ loosing + */ +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); @@ -2739,6 +2764,45 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val) } } +#ifdef RTL8139_ONBOARD_TIMER +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time) +{ +DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n)); + +if (s-TimerExpire current_time = s-TimerExpire) { +s-IntrStatus |= PCSTimeout; +rtl8139_update_irq(s); +} + +/* Set QEMU timer only if needed that is + * - TimerInt 0 (we have a timer) + * - mask = 1 (we want an interrupt timer) + * - irq = 0 (irq is not already active) + * If any of above change we need to compute timer again + * Also we must check if timer is passed without QEMU timer + */ +s-TimerExpire = 0; +if (!s-TimerInt) { +return; +} + +int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY, +get_ticks_per_sec()); +uint32_t low_pci = pci_time 0x; +pci_time = pci_time - low_pci + s-TimerInt; +if (low_pci = s-TimerInt) { +pci_time += 0x1LL; +} +int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(), +PCI_FREQUENCY); +s-TimerExpire = next_time; + +if ((s-IntrMask PCSTimeout) != 0 (s-IntrStatus PCSTimeout) == 0) { +qemu_mod_timer(s-timer, next_time); +} +} +#endif + static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) { RTL8139State *s = opaque; @@ -2784,13 +2848,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val) case Timer: DEBUG_PRINT((RTL8139: TCTR Timer reset on 
write\n)); -s-TCTR = 0; s-TCTR_base = qemu_get_clock(vm_clock); +rtl8139_set_next_tctr_time(s, s-TCTR_base); break; case FlashReg: DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val)); -s-TimerInt = val; +if (s-TimerInt != val) { +s-TimerInt = val
[Qemu-devel] rtl8139 timer interrupt rewrite
Hi, I rewrote timer implementation for this card. I wrote even a small Linux guest test program (attached as main.c). This patch add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set). I tested this patch with a Darwin system and with my test program (I bought a compatible physical card and it works too). Regards Frediano Ziglio --- hw/rtl8139.c.orig 2010-01-14 23:17:59.0 +0100 +++ hw/rtl8139.c 2010-02-05 20:25:31.711462995 +0100 @@ -34,41 +34,44 @@ * Implemented Tally Counters, increased VM load/save version * Implemented IP/TCP/UDP checksum task offloading * * 2006-Jul-04 Igor Kovalenko : Implemented TCP segmentation offloading * Fixed MTU=1500 for produced ethernet frames * * 2006-Jul-09 Igor Kovalenko : Fixed TCP header length calculation while processing * segmentation offloading * Removed slirp.h dependency * Added rx/tx buffer reset when enabling rx/tx operation + * + * 2009-Feb-04 Frediano Ziglio: Rewrote timer support using QEMU timer only when strictly + * needed (required for for Darwin) */ #include hw.h #include pci.h #include qemu-timer.h #include net.h #include loader.h /* debug RTL8139 card */ //#define DEBUG_RTL8139 1 #define PCI_FREQUENCY 3300L /* debug RTL8139 card C+ mode only */ //#define DEBUG_RTL8139CP 1 /* Calculate CRCs properly on Rx packets */ #define RTL8139_CALCULATE_RXCRC 1 /* Uncomment to enable on-board timer interrupts */ -//#define RTL8139_ONBOARD_TIMER 1 +#define RTL8139_ONBOARD_TIMER 1 #if defined(RTL8139_CALCULATE_RXCRC) /* For crc32 */ #include zlib.h #endif #define SET_MASKED(input, mask, curr) \ ( ( (input) ~(mask) ) | ( (curr) (mask) ) ) /* arg % size for size which is a power of 2 */ @@ -482,25 +485,34 @@ typedef struct RTL8139State { int64_tTCTR_base; /* Tally counters */ RTL8139TallyCounters tally_counters; /* Non-persistent data */ uint8_t *cplus_txbuffer; intcplus_txbuffer_len; intcplus_txbuffer_offset; +#ifdef RTL8139_ONBOARD_TIMER /* PCI interrupt timer */ QEMUTimer *timer; 
+int64_t TimerExpire; +#endif } RTL8139State; +#ifdef RTL8139_ONBOARD_TIMER +static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time); +#else +#define rtl8139_set_next_tctr_time(s,t) do { ; } while(0) +#endif + static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command) { DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command)); switch (command Chip9346_op_mask) { case Chip9346_op_read: { eeprom-address = command EEPROM_9346_ADDR_MASK; eeprom-output = eeprom-contents[eeprom-address]; @@ -2510,21 +2522,23 @@ static uint32_t rtl8139_RxBuf_read(RTL81 static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val) { DEBUG_PRINT((RTL8139: IntrMask write(w) val=0x%04x\n, val)); /* mask unwriteable bits */ val = SET_MASKED(val, 0x1e00, s-IntrMask); s-IntrMask = val; +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + } static uint32_t rtl8139_IntrMask_read(RTL8139State *s) { uint32_t ret = s-IntrMask; DEBUG_PRINT((RTL8139: IntrMask read(w) val=0x%04x\n, ret)); return ret; } @@ -2543,26 +2557,36 @@ static void rtl8139_IntrStatus_write(RTL uint16_t newStatus = s-IntrStatus ~val; /* mask unwriteable bits */ newStatus = SET_MASKED(newStatus, 0x1e00, s-IntrStatus); /* writing 1 to interrupt status register bit clears it */ s-IntrStatus = 0; rtl8139_update_irq(s); s-IntrStatus = newStatus; +/* + * Computing if we miss an interrupt here is not that correct but + * considered that we should have had already an interrupt + * and probably emulated is slower is better to assume this resetting was done before + * testing on previous rtl8139_update_irq lead to IRQ loosing + */ +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); rtl8139_update_irq(s); + #endif } static uint32_t rtl8139_IntrStatus_read(RTL8139State *s) { +rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock)); + uint32_t ret = s-IntrStatus; DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret)); #if 0 /* reading ISR clears all interrupts */ 
s-IntrStatus = 0; rtl8139_update_irq(s); @@ -2727,20 +2751,56 @@ static void rtl8139_io_writew(void *opaq default: DEBUG_PRINT((RTL8139: ioport write(w) addr=0x%x val=0x%04x via write(b)\n, addr, val)); rtl8139_io_writeb(opaque, addr, val 0xff); rtl8139_io_writeb(opaque, addr + 1, (val 8) 0xff