[Qemu-devel] [PATCH 3/3] aio: always check paio_init result

2011-07-27 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/raw-posix.c |   11 +--
 1 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 275b41e..3c6bd4b 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -230,13 +230,15 @@ static int raw_open_common(BlockDriverState *bs, const 
char *filename,
 }
 }
 
+/* We're falling back to POSIX AIO in some cases so init always */
+if (paio_init()  0) {
+goto out_free_buf;
+}
+
 #ifdef CONFIG_LINUX_AIO
 if ((bdrv_flags  (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
   (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
 
-/* We're falling back to POSIX AIO in some cases */
-paio_init();
-
 s-aio_ctx = laio_init();
 if (!s-aio_ctx) {
 goto out_free_buf;
@@ -245,9 +247,6 @@ static int raw_open_common(BlockDriverState *bs, const char 
*filename,
 } else
 #endif
 {
-if (paio_init()  0) {
-goto out_free_buf;
-}
 #ifdef CONFIG_LINUX_AIO
 s-use_aio = 0;
 #endif
-- 
1.7.1




[Qemu-devel] [PATCH 0/2] improve Linux AIO support

2011-07-27 Thread Frediano Ziglio
These patches avoid many fallbacks to POSIX AIO and enable Linux AIO even
if nocache is not specified.
Also add flush support with Linux AIO.

Frediano Ziglio (2):
  linux aio: support flush operation
  aio: use Linux AIO even if nocache is not specified

 block/raw-posix.c |   30 +-
 linux-aio.c   |3 +++
 2 files changed, 20 insertions(+), 13 deletions(-)




[Qemu-devel] [PATCH 1/2] linux aio: support flush operation

2011-07-27 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/raw-posix.c |7 +++
 linux-aio.c   |3 +++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 3c6bd4b..27ae81e 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -628,6 +628,13 @@ static BlockDriverAIOCB *raw_aio_flush(BlockDriverState 
*bs,
 if (fd_open(bs)  0)
 return NULL;
 
+#ifdef CONFIG_LINUX_AIO
+if (s-use_aio) {
+return laio_submit(bs, s-aio_ctx, s-fd, 0, NULL,
+   0, cb, opaque, QEMU_AIO_FLUSH);
+}
+#endif
+
 return paio_submit(bs, s-fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
 }
 
diff --git a/linux-aio.c b/linux-aio.c
index 68f4b3d..d07c435 100644
--- a/linux-aio.c
+++ b/linux-aio.c
@@ -215,6 +215,9 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void 
*aio_ctx, int fd,
 case QEMU_AIO_READ:
 io_prep_preadv(iocbs, fd, qiov-iov, qiov-niov, offset);
break;
+case QEMU_AIO_FLUSH:
+io_prep_fdsync(iocbs, fd);
+break;
 default:
 fprintf(stderr, %s: invalid AIO request type 0x%x.\n,
 __func__, type);
-- 
1.7.1




[Qemu-devel] [PATCH 2/2] aio: use Linux AIO even if nocache is not specified

2011-07-27 Thread Frediano Ziglio
Currently Linux AIO are used only if nocache is specified.
Linux AIO works in all cases. The only problem is that currently Linux AIO
does not align data so I add a test that use POSIX AIO in this case.

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/raw-posix.c |   23 ++-
 1 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 27ae81e..078a256 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -236,21 +236,16 @@ static int raw_open_common(BlockDriverState *bs, const 
char *filename,
 }
 
 #ifdef CONFIG_LINUX_AIO
-if ((bdrv_flags  (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-  (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+s-use_aio = 0;
+if ((bdrv_flags  BDRV_O_NATIVE_AIO)) {
 
 s-aio_ctx = laio_init();
 if (!s-aio_ctx) {
 goto out_free_buf;
 }
 s-use_aio = 1;
-} else
-#endif
-{
-#ifdef CONFIG_LINUX_AIO
-s-use_aio = 0;
-#endif
 }
+#endif
 
 #ifdef CONFIG_XFS
 if (platform_test_xfs_fd(s-fd)) {
@@ -592,14 +587,16 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState 
*bs,
 if (s-aligned_buf) {
 if (!qiov_is_aligned(bs, qiov)) {
 type |= QEMU_AIO_MISALIGNED;
-#ifdef CONFIG_LINUX_AIO
-} else if (s-use_aio) {
-return laio_submit(bs, s-aio_ctx, s-fd, sector_num, qiov,
-   nb_sectors, cb, opaque, type);
-#endif
 }
 }
 
+#ifdef CONFIG_LINUX_AIO
+if (s-use_aio  !(type  QEMU_AIO_MISALIGNED)) {
+return laio_submit(bs, s-aio_ctx, s-fd, sector_num, qiov,
+   nb_sectors, cb, opaque, type);
+}
+#endif
+
 return paio_submit(bs, s-fd, sector_num, qiov, nb_sectors,
cb, opaque, type);
 }
-- 
1.7.1




Re: [Qemu-devel] [PATCH 1/2] linux aio: support flush operation

2011-07-27 Thread Frediano Ziglio

Il giorno 27/lug/2011, alle ore 20:31, Christoph Hellwig h...@lst.de ha 
scritto:

 Did you test this at all?
 

Yes! Not at kernel level :-)
Usually I trust documentation and man pages.

 On Wed, Jul 27, 2011 at 08:25:25PM +0200, Frediano Ziglio wrote:
 +case QEMU_AIO_FLUSH:
 +io_prep_fdsync(iocbs, fd);
 +break;
 
 Looks great, but doesn't work as expected.
 
 Hint: grep for aio_fsync in the linux kernel.
 

Thanks. I'll try to port misaligned access to Linux AIO. Also I'll add some 
comments on code to avoid somebody do the same mistache I did.
Mainly however -k qemu-img and aio=native in blockdev options are silently 
ignored if nocache is not enabled. Also I notice that combining XFS, Linux AIO, 
O_DIRECT and O_DSYNC give impressive performance but currently there is no way 
to specify all that flags together cause nocache enable O_DIRECT while O_DSYNC 
is enabled with writethrough.

Frediano


Re: [Qemu-devel] [PATCH 09/10] posix-aio-compat: Allow read after EOF

2011-07-26 Thread Frediano Ziglio
2011/7/26 Kevin Wolf kw...@redhat.com:
 In order to be able to transparently replace bdrv_read calls by bdrv_co_read,
 reading beyond EOF must produce zeros instead of short reads for AIO, too.

 Signed-off-by: Kevin Wolf kw...@redhat.com
 ---
  posix-aio-compat.c |   19 +++
  1 files changed, 19 insertions(+), 0 deletions(-)

 diff --git a/posix-aio-compat.c b/posix-aio-compat.c
 index 788d113..8dc00cb 100644
 --- a/posix-aio-compat.c
 +++ b/posix-aio-compat.c
 @@ -198,6 +198,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb 
 *aiocb)
     return len;
  }

 +/*
 + * Read/writes the data to/from a given linear buffer.
 + *
 + * Returns the number of bytes handles or -errno in case of an error. Short
 + * reads are only returned if the end of the file is reached.
 + */
  static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
  {
     ssize_t offset = 0;
 @@ -334,6 +340,19 @@ static void *aio_thread(void *unused)

         switch (aiocb-aio_type  QEMU_AIO_TYPE_MASK) {
         case QEMU_AIO_READ:
 +            ret = handle_aiocb_rw(aiocb);
 +            if (ret = 0  ret  aiocb-aio_nbytes  
 aiocb-common.bs-growable) {
 +                /* A short read means that we have reached EOF. Pad the 
 buffer
 +                 * with zeros for bytes after EOF. */
 +                QEMUIOVector qiov;
 +
 +                qemu_iovec_init_external(qiov, aiocb-aio_iov,
 +                                         aiocb-aio_niov);
 +                qemu_iovec_memset_skip(qiov, 0, aiocb-aio_nbytes - ret, 
 ret);
 +
 +                ret = aiocb-aio_nbytes;
 +            }
 +            break;
         case QEMU_AIO_WRITE:
             ret = handle_aiocb_rw(aiocb);
             break;
 --
 1.7.6


Still not tested but I think to know what does it solve :)

I think Linux AIO require same attention.

Frediano



[Qemu-devel] [PATCH 0/5] qcow: coroutines cleanup

2011-07-22 Thread Frediano Ziglio
These patches mostly cleanup some AIO code using coroutines.
These patches apply to Kevin's repository, branch coroutine-block.
Mostly they use stack instead of allocated AIO structure.

Frediano Ziglio (5):
  qcow: allocate QCowAIOCB structure using stack
  qcow: QCowAIOCB field cleanup
  qcow: move some blocks of code to avoid useless variable
initialization
  qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb
into qcow_co_writev
  qcow: remove old #undefined code

 block/qcow.c  |  374 ++---
 block/qcow2.c |   38 ++
 2 files changed, 129 insertions(+), 283 deletions(-)




[Qemu-devel] [PATCH 1/5] qcow: allocate QCowAIOCB structure using stack

2011-07-22 Thread Frediano Ziglio
instead of calling qemi_aio_get use stack

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c  |   52 
 block/qcow2.c |   38 +++---
 2 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 6447c2a..d19ef04 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -503,28 +503,12 @@ typedef struct QCowAIOCB {
 BlockDriverAIOCB *hd_aiocb;
 } QCowAIOCB;
 
-static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-if (acb-hd_aiocb)
-bdrv_aio_cancel(acb-hd_aiocb);
-qemu_aio_release(acb);
-}
-
-static AIOPool qcow_aio_pool = {
-.aiocb_size = sizeof(QCowAIOCB),
-.cancel = qcow_aio_cancel,
-};
-
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-int is_write)
+int is_write, QCowAIOCB *acb)
 {
-QCowAIOCB *acb;
-
-acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL);
-if (!acb)
-return NULL;
+memset(acb, 0, sizeof(*acb));
+acb-common.bs = bs;
 acb-hd_aiocb = NULL;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
@@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 return acb;
 }
 
-static int qcow_aio_read_cb(void *opaque)
+static int qcow_aio_read_cb(QCowAIOCB *acb)
 {
-QCowAIOCB *acb = opaque;
 BlockDriverState *bs = acb-common.bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
@@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num,
  int nb_sectors, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs-opaque;
-QCowAIOCB *acb;
+QCowAIOCB acb;
 int ret;
 
-acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
+qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb);
 
 qemu_co_mutex_lock(s-lock);
 do {
-ret = qcow_aio_read_cb(acb);
+ret = qcow_aio_read_cb(acb);
 } while (ret  0);
 qemu_co_mutex_unlock(s-lock);
 
-if (acb-qiov-niov  1) {
-qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
-qemu_vfree(acb-orig_buf);
+if (acb.qiov-niov  1) {
+qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size);
+qemu_vfree(acb.orig_buf);
 }
-qemu_aio_release(acb);
 
 return ret;
 }
 
-static int qcow_aio_write_cb(void *opaque)
+static int qcow_aio_write_cb(QCowAIOCB *acb)
 {
-QCowAIOCB *acb = opaque;
 BlockDriverState *bs = acb-common.bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
@@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t 
sector_num,
   int nb_sectors, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs-opaque;
-QCowAIOCB *acb;
+QCowAIOCB acb;
 int ret;
 
 s-cluster_cache_offset = -1; /* disable compressed cache */
 
-acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
+qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb);
 
 qemu_co_mutex_lock(s-lock);
 do {
-ret = qcow_aio_write_cb(acb);
+ret = qcow_aio_write_cb(acb);
 } while (ret  0);
 qemu_co_mutex_unlock(s-lock);
 
-if (acb-qiov-niov  1) {
-qemu_vfree(acb-orig_buf);
+if (acb.qiov-niov  1) {
+qemu_vfree(acb.orig_buf);
 }
-qemu_aio_release(acb);
 
 return ret;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index f07d550..edc068e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -388,17 +388,6 @@ typedef struct QCowAIOCB {
 QLIST_ENTRY(QCowAIOCB) next_depend;
 } QCowAIOCB;
 
-static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-qemu_aio_release(acb);
-}
-
-static AIOPool qcow2_aio_pool = {
-.aiocb_size = sizeof(QCowAIOCB),
-.cancel = qcow2_aio_cancel,
-};
-
 /*
  * Returns 0 when the request is completed successfully, 1 when there is still
  * a part left to do and -errno in error cases.
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb)
 static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
   QEMUIOVector *qiov, int nb_sectors,
   BlockDriverCompletionFunc *cb,
-  void *opaque, int is_write)
+  void *opaque, int is_write, QCowAIOCB *acb)
 {
-QCowAIOCB *acb;
-
-acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque);
-if (!acb)
-return NULL;
+memset(acb, 0, sizeof(*acb));
+acb-common.bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
 acb-is_write = is_write;
@@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t 
sector_num,
   int

[Qemu-devel] [PATCH 2/5] qcow: QCowAIOCB field cleanup

2011-07-22 Thread Frediano Ziglio
remove unused field from this structure and put some of them in 
qcow_aio_read_cb and qcow_aio_write_cb

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |  135 +++---
 1 files changed, 63 insertions(+), 72 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index d19ef04..e52df91 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -487,72 +487,61 @@ static int qcow_read(BlockDriverState *bs, int64_t 
sector_num,
 #endif
 
 typedef struct QCowAIOCB {
-BlockDriverAIOCB common;
+BlockDriverState *bs;
 int64_t sector_num;
 QEMUIOVector *qiov;
 uint8_t *buf;
 void *orig_buf;
 int nb_sectors;
-int n;
-uint64_t cluster_offset;
-uint8_t *cluster_data;
-struct iovec hd_iov;
-bool is_write;
-QEMUBH *bh;
-QEMUIOVector hd_qiov;
-BlockDriverAIOCB *hd_aiocb;
 } QCowAIOCB;
 
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 int is_write, QCowAIOCB *acb)
 {
-memset(acb, 0, sizeof(*acb));
-acb-common.bs = bs;
-acb-hd_aiocb = NULL;
+acb-bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
-acb-is_write = is_write;
 
 if (qiov-niov  1) {
 acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size);
 if (is_write)
 qemu_iovec_to_buffer(qiov, acb-buf);
 } else {
+acb-orig_buf = NULL;
 acb-buf = (uint8_t *)qiov-iov-iov_base;
 }
 acb-nb_sectors = nb_sectors;
-acb-n = 0;
-acb-cluster_offset = 0;
 return acb;
 }
 
 static int qcow_aio_read_cb(QCowAIOCB *acb)
 {
-BlockDriverState *bs = acb-common.bs;
+BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret;
-
-acb-hd_aiocb = NULL;
+int ret, n = 0;
+uint64_t cluster_offset = 0;
+struct iovec hd_iov;
+QEMUIOVector hd_qiov;
 
  redo:
 /* post process the read buffer */
-if (!acb-cluster_offset) {
+if (!cluster_offset) {
 /* nothing to do */
-} else if (acb-cluster_offset  QCOW_OFLAG_COMPRESSED) {
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* nothing to do */
 } else {
 if (s-crypt_method) {
 encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
-acb-n, 0,
+n, 0,
 s-aes_decrypt_key);
 }
 }
 
-acb-nb_sectors -= acb-n;
-acb-sector_num += acb-n;
-acb-buf += acb-n * 512;
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
 
 if (acb-nb_sectors == 0) {
 /* request completed */
@@ -560,57 +549,57 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 
 /* prepare next AIO request */
-acb-cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
+cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
  0, 0, 0, 0);
 index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
-acb-n = s-cluster_sectors - index_in_cluster;
-if (acb-n  acb-nb_sectors)
-acb-n = acb-nb_sectors;
+n = s-cluster_sectors - index_in_cluster;
+if (n  acb-nb_sectors)
+n = acb-nb_sectors;
 
-if (!acb-cluster_offset) {
+if (!cluster_offset) {
 if (bs-backing_hd) {
 /* read from the base image */
-acb-hd_iov.iov_base = (void *)acb-buf;
-acb-hd_iov.iov_len = acb-n * 512;
-qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1);
+hd_iov.iov_base = (void *)acb-buf;
+hd_iov.iov_len = n * 512;
+qemu_iovec_init_external(hd_qiov, hd_iov, 1);
 qemu_co_mutex_unlock(s-lock);
 ret = bdrv_co_readv(bs-backing_hd, acb-sector_num,
-acb-n, acb-hd_qiov);
+n, hd_qiov);
 qemu_co_mutex_lock(s-lock);
 if (ret  0) {
 return -EIO;
 }
 } else {
 /* Note: in this case, no need to wait */
-memset(acb-buf, 0, 512 * acb-n);
+memset(acb-buf, 0, 512 * n);
 goto redo;
 }
-} else if (acb-cluster_offset  QCOW_OFLAG_COMPRESSED) {
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* add AIO support for compressed blocks ? */
-if (decompress_cluster(bs, acb-cluster_offset)  0) {
+if (decompress_cluster(bs, cluster_offset)  0) {
 return -EIO;
 }
 memcpy(acb-buf,
-   s-cluster_cache + index_in_cluster * 512, 512 * acb-n);
+   s-cluster_cache + index_in_cluster * 512, 512 * n);
 goto redo;
 } else {
-if ((acb-cluster_offset  511) != 0) {
+if ((cluster_offset  511) != 0) {
 return -EIO;
 }
-acb-hd_iov.iov_base = (void *)acb-buf

[Qemu-devel] [PATCH 4/5] qcow: embed qcow_aio_read_cb into qcow_co_readv and qcow_aio_write_cb into qcow_co_writev

2011-07-22 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |  287 -
 1 files changed, 121 insertions(+), 166 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 015a472..2feb7d7 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -486,221 +486,176 @@ static int qcow_read(BlockDriverState *bs, int64_t 
sector_num,
 }
 #endif
 
-typedef struct QCowAIOCB {
-BlockDriverState *bs;
-int64_t sector_num;
-QEMUIOVector *qiov;
-uint8_t *buf;
-void *orig_buf;
-int nb_sectors;
-} QCowAIOCB;
-
-static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
-int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-int is_write, QCowAIOCB *acb)
-{
-acb-bs = bs;
-acb-sector_num = sector_num;
-acb-qiov = qiov;
-
-if (qiov-niov  1) {
-acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size);
-if (is_write)
-qemu_iovec_to_buffer(qiov, acb-buf);
-} else {
-acb-orig_buf = NULL;
-acb-buf = (uint8_t *)qiov-iov-iov_base;
-}
-acb-nb_sectors = nb_sectors;
-return acb;
-}
-
-static int qcow_aio_read_cb(QCowAIOCB *acb)
+static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
 {
-BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret, n;
+int ret = 0, n;
 uint64_t cluster_offset;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
+uint8_t *buf;
+void *orig_buf;
 
- redo:
-if (acb-nb_sectors == 0) {
-/* request completed */
-return 0;
+if (qiov-niov  1) {
+buf = orig_buf = qemu_blockalign(bs, qiov-size);
+} else {
+orig_buf = NULL;
+buf = (uint8_t *)qiov-iov-iov_base;
 }
 
-/* prepare next request */
-cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
- 0, 0, 0, 0);
-index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
-n = s-cluster_sectors - index_in_cluster;
-if (n  acb-nb_sectors)
-n = acb-nb_sectors;
+qemu_co_mutex_lock(s-lock);
+
+while (nb_sectors != 0) {
+/* prepare next request */
+cluster_offset = get_cluster_offset(bs, sector_num  9,
+ 0, 0, 0, 0);
+index_in_cluster = sector_num  (s-cluster_sectors - 1);
+n = s-cluster_sectors - index_in_cluster;
+if (n  nb_sectors)
+n = nb_sectors;
 
-if (!cluster_offset) {
-if (bs-backing_hd) {
-/* read from the base image */
-hd_iov.iov_base = (void *)acb-buf;
+if (!cluster_offset) {
+if (bs-backing_hd) {
+/* read from the base image */
+hd_iov.iov_base = (void *)buf;
+hd_iov.iov_len = n * 512;
+qemu_iovec_init_external(hd_qiov, hd_iov, 1);
+qemu_co_mutex_unlock(s-lock);
+ret = bdrv_co_readv(bs-backing_hd, sector_num,
+n, hd_qiov);
+qemu_co_mutex_lock(s-lock);
+if (ret  0) {
+goto fail;
+}
+} else {
+/* Note: in this case, no need to wait */
+memset(buf, 0, 512 * n);
+}
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
+/* add AIO support for compressed blocks ? */
+if (decompress_cluster(bs, cluster_offset)  0) {
+goto fail;
+}
+memcpy(buf,
+   s-cluster_cache + index_in_cluster * 512, 512 * n);
+} else {
+if ((cluster_offset  511) != 0) {
+goto fail;
+}
+hd_iov.iov_base = (void *)buf;
 hd_iov.iov_len = n * 512;
 qemu_iovec_init_external(hd_qiov, hd_iov, 1);
 qemu_co_mutex_unlock(s-lock);
-ret = bdrv_co_readv(bs-backing_hd, acb-sector_num,
+ret = bdrv_co_readv(bs-file,
+(cluster_offset  9) + index_in_cluster,
 n, hd_qiov);
 qemu_co_mutex_lock(s-lock);
 if (ret  0) {
-return -EIO;
+break;
+}
+if (s-crypt_method) {
+encrypt_sectors(s, sector_num, buf, buf,
+n, 0,
+s-aes_decrypt_key);
 }
-} else {
-/* Note: in this case, no need to wait */
-memset(acb-buf, 0, 512 * n);
-}
-} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
-/* add AIO support for compressed blocks ? */
-if (decompress_cluster(bs, cluster_offset)  0) {
-return -EIO;
-}
-memcpy(acb-buf,
-   s-cluster_cache

[Qemu-devel] [PATCH 5/5] qcow: remove old #undefined code

2011-07-22 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |   63 --
 1 files changed, 0 insertions(+), 63 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 2feb7d7..42331c7 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -190,24 +190,6 @@ static int qcow_set_key(BlockDriverState *bs, const char 
*key)
 return -1;
 if (AES_set_decrypt_key(keybuf, 128, s-aes_decrypt_key) != 0)
 return -1;
-#if 0
-/* test */
-{
-uint8_t in[16];
-uint8_t out[16];
-uint8_t tmp[16];
-for(i=0;i16;i++)
-in[i] = i;
-AES_encrypt(in, tmp, s-aes_encrypt_key);
-AES_decrypt(tmp, out, s-aes_decrypt_key);
-for(i = 0; i  16; i++)
-printf( %02x, tmp[i]);
-printf(\n);
-for(i = 0; i  16; i++)
-printf( %02x, out[i]);
-printf(\n);
-}
-#endif
 return 0;
 }
 
@@ -441,51 +423,6 @@ static int decompress_cluster(BlockDriverState *bs, 
uint64_t cluster_offset)
 return 0;
 }
 
-#if 0
-
-static int qcow_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
-{
-BDRVQcowState *s = bs-opaque;
-int ret, index_in_cluster, n;
-uint64_t cluster_offset;
-
-while (nb_sectors  0) {
-cluster_offset = get_cluster_offset(bs, sector_num  9, 0, 0, 0, 0);
-index_in_cluster = sector_num  (s-cluster_sectors - 1);
-n = s-cluster_sectors - index_in_cluster;
-if (n  nb_sectors)
-n = nb_sectors;
-if (!cluster_offset) {
-if (bs-backing_hd) {
-/* read from the base image */
-ret = bdrv_read(bs-backing_hd, sector_num, buf, n);
-if (ret  0)
-return -1;
-} else {
-memset(buf, 0, 512 * n);
-}
-} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
-if (decompress_cluster(bs, cluster_offset)  0)
-return -1;
-memcpy(buf, s-cluster_cache + index_in_cluster * 512, 512 * n);
-} else {
-ret = bdrv_pread(bs-file, cluster_offset + index_in_cluster * 
512, buf, n * 512);
-if (ret != n * 512)
-return -1;
-if (s-crypt_method) {
-encrypt_sectors(s, sector_num, buf, buf, n, 0,
-s-aes_decrypt_key);
-}
-}
-nb_sectors -= n;
-sector_num += n;
-buf += n * 512;
-}
-return 0;
-}
-#endif
-
 static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
  int nb_sectors, QEMUIOVector *qiov)
 {
-- 
1.7.1




[Qemu-devel] [PATCH 3/5] qcow: move some blocks of code to avoid useless variable initialization

2011-07-22 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |   53 ++---
 1 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index e52df91..015a472 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret, n = 0;
-uint64_t cluster_offset = 0;
+int ret, n;
+uint64_t cluster_offset;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 
  redo:
-/* post process the read buffer */
-if (!cluster_offset) {
-/* nothing to do */
-} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
-/* nothing to do */
-} else {
-if (s-crypt_method) {
-encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
-n, 0,
-s-aes_decrypt_key);
-}
-}
-
-acb-nb_sectors -= n;
-acb-sector_num += n;
-acb-buf += n * 512;
-
 if (acb-nb_sectors == 0) {
 /* request completed */
 return 0;
 }
 
-/* prepare next AIO request */
+/* prepare next request */
 cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
  0, 0, 0, 0);
 index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
@@ -572,7 +555,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 } else {
 /* Note: in this case, no need to wait */
 memset(acb-buf, 0, 512 * n);
-goto redo;
 }
 } else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* add AIO support for compressed blocks ? */
@@ -581,7 +563,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 memcpy(acb-buf,
s-cluster_cache + index_in_cluster * 512, 512 * n);
-goto redo;
 } else {
 if ((cluster_offset  511) != 0) {
 return -EIO;
@@ -599,6 +580,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 }
 
+/* post process the read buffer */
+if (!cluster_offset) {
+/* nothing to do */
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
+/* nothing to do */
+} else {
+if (s-crypt_method) {
+encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
+n, 0,
+s-aes_decrypt_key);
+}
+}
+
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
+
 goto redo;
 }
 
@@ -630,16 +628,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb)
 int index_in_cluster;
 uint64_t cluster_offset;
 const uint8_t *src_buf;
-int ret, n = 0;
+int ret, n;
 uint8_t *cluster_data = NULL;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 
   redo:
-acb-nb_sectors -= n;
-acb-sector_num += n;
-acb-buf += n * 512;
-
 if (acb-nb_sectors == 0) {
 /* request completed */
 return 0;
@@ -681,6 +675,11 @@ static int qcow_aio_write_cb(QCowAIOCB *acb)
 if (ret  0) {
 return ret;
 }
+
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
+
 goto redo;
 }
 
-- 
1.7.1




Re: [Qemu-devel] [RFC] qcow2: 2 way to improve performance updating refcount

2011-07-22 Thread Frediano Ziglio
2011/7/22 Kevin Wolf kw...@redhat.com:
 Am 21.07.2011 18:17, schrieb Frediano Ziglio:
 Hi,
   after a snapshot is taken currently many write operations are quite
 slow due to
 - refcount updates (decrement old and increment new )
 - cluster allocation and file expansion
 - read-modify-write on partial clusters

 I found 2 way to improve refcount performance

 Method 1 - Lazy count
 Mainly do not take into account count for current snapshot, that is
 current snapshot counts as 0. This would require to add a
 current_snapshot in header and update refcount when current is changed.
 So for these operation
 - creating snapshot, performance are the same, just increment for old
 snapshot instead of the new one
 - normal write operations. As current snaphot counts as 0 there is not
 operations here so do not write any data
 - changing current snapshot, this is the worst case, you have to
 increment for the current snapshot and decrement for the new so it will
 take twice
 - deleting snapshot, if is the current just set current_snapshot to a
 dummy not existing value, if is not the current just decrement counters,
 no performance changes

 How would you do cluster allocation if you don't have refcounts any more
 that can tell you if a cluster is used or not?


You have refcount, is only that current snapshot counts as 0. An
example may help, start with A snapshot A counts as zero so all
refcounts are 0, now we create a snapshot B and make it current so
refcounts are 1

A --- B

If you change a cluster in snapshot B counts are still 1. If you go
back to A counters are increment (cause you leave B) and then
decrement (cause you enter in A).

Perhaps the problem is how to distinguish 0 from allocated in
current and not allocated. Yes, with which I suppose above it's a
problem, but we can easily use -1 as not allocated. If current and
refcount 0 mark as -1, if not current we would have to increment
counters of current, mark current as -1 than decrement for deleting,
yes in this case you have twice the time.

 Method 2 - Read-only parent
 Here parents are readonly, instead of storing a refcount store a numeric
 id of the owner. If the owner is not current copy the cluster and change
 it. Considering this situation

 A --- B --- C

 B cannot be changed so in order to change B you have to create a new
 snapshot

 A --- B --- C
          \--- D

 and change D. It can take more space cause you have in this case an
 additional snapshot.

 Operations:
 - creating snapshot, really fast as you don't have to change any
 ownership
 - normal write operations. If owner is not the same allocate a new
 cluster and just store a new owner for new cluster. Also ownership for
 past-to-end cluster could be set all to current owner in order to
 collapse allocations
 - changing current snapshot, no changes required for owners
 - deleting snapshot. Only possible if you have no child or a single
 child. Will require to scan all l2 tables and merge and update owner.

 I think this has similar characteristics as we have with external
 snapshots (i.e. backing files). The advantage is that with applying it
 to internal snapshots is that when deleting a snapshot you don't have to
 copy around all the data.

 Probably this change could even be done transparently for the user, so
 that B still appears to be writeable, but in fact refers to D now.


 Anyway, have you checked how bad the refcount work really is? I think
 that writing the VM state takes a lot longer, so that optimising the
 refcount update may be the wrong approach, especially if it requires a
 format change. My results with qemu-img snapshot suggest that it's not
 worth it:

 kwolf@dhcp-5-188:~/images$ ~/source/qemu/qemu-img info scratch.qcow2
 image: scratch.qcow2
 file format: qcow2
 virtual size: 8.0G (8589934592 bytes)
 disk size: 4.0G
 cluster_size: 65536
 kwolf@dhcp-5-188:~/images$ time ~/source/qemu/qemu-img snapshot -c test
 scratch.qcow2

 real    0m0.116s
 user    0m0.009s
 sys     0m0.040s
 kwolf@dhcp-5-188:~/images$ time ~/source/qemu/qemu-img snapshot -d test
 scratch.qcow2

 real    0m0.084s
 user    0m0.011s
 sys     0m0.044s

 Kevin


I'm not worried about time just taking snapshot more after taking
snapshot during normal use. As you stated taking snapshot you can
disable cache writethrough making it very fast but during normal
operations you can't.

Personally I'm pondering a log too to allow collapsing metadata
updates. Even an external (another file) full log (with data) to try
to reduce even overhead caused by read-modify-write during partial
cluster updates and reduce file fragmentation. But as you can see from
my patches I'm still exercising myself with Qemu code.

Regards
  Frediano



Re: [Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup

2011-07-22 Thread Frediano Ziglio
2011/7/22 Kevin Wolf kw...@redhat.com:
 Am 20.07.2011 15:56, schrieb Frediano Ziglio:
 These patches mostly cleanup some AIO code using coroutines.
 These patches apply to Kevin's repository, branch coroutine-block.
 Mostly they use stack instead of allocated AIO structure.

 Frediano Ziglio (5):
   qcow: allocate QCowAIOCB structure using stack
   qcow: QCowAIOCB field cleanup
   qcow: move some blocks of code to avoid useless variable
     initialization
   avoid dandling pointers
   qcow: small optimization initializing QCowAIOCB

  block/qcow.c  |  210 
 +
  block/qcow2.c |   38 +++---
  2 files changed, 102 insertions(+), 146 deletions(-)

 Most of it looks good now. Did you include the RFC in the subject just
 because the coroutine work is in RFC state, too, or did you intend to
 tell me that I shouldn't merge yet?

 Kevin


As these patches are first quite big patches I send (typo or small
fixes do not counts) I just want to mark that I could write something
really wrong. Just a way to avoid somebody having to send more patches
and get more attention. Some projects are quite prone to merge even
not that fine ones. I prefer to have some (a bit) pedantic comments
and a real fix/improve.

Now I removed the RFC from last update. The main reason is that I
found your qemu-iotests repository which, I think should be merged to
main repository, but it's just my opinion.
Oh... qcow fails 004 test (even origin/coroutines-block) with a I/O error.

I must say there are a lot of small hidden things that a developer
should know about Qemu, for instance
- mailing list follow some LKML rules as CC ML and send to maintainer
to get more attention
- you can use scripts/checkpatch.pl to check your patches before send

I still have also to understand how to use git format-patch/send-email
correctly and fluently :)

Frediano



Re: [Qemu-devel] [RFC PATCH 4/5] avoid dandling pointers

2011-07-22 Thread Frediano Ziglio
2011/7/22 Kevin Wolf kw...@redhat.com:
 Am 20.07.2011 15:56, schrieb Frediano Ziglio:
 Signed-off-by: Frediano Ziglio fredd...@gmail.com
 ---
  block/qcow.c |    2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)

 Subject needs a qcow: ...

 Kevin


Yes, now I removed that patch as with argument on stack it just make
few sense...



Re: [Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup

2011-07-22 Thread Frediano Ziglio
2011/7/22 Stefan Hajnoczi stefa...@gmail.com:
 On Fri, Jul 22, 2011 at 11:10 AM, Kevin Wolf kw...@redhat.com wrote:
 Am 22.07.2011 11:26, schrieb Frediano Ziglio:
 - you can use scripts/checkpatch.pl to check your patches before send

 I updated the SubmitAPatch wiki earlier this week.

 Stefan


Good, now wiki is working (it seems somebody is attacking Qemu
sites... yesterday the ML).

http://git.qemu.org/git/qemu.git/ is not working so all links give 404.

I added some notes, yes checkpatch was already in the page.

About git commands to send multiple patches, I use

  git format-patch --cover-letter -s -M origin/original_branch_name
--subject-prefix='PATCH vXX' -o outgoing/

edit manually cover letter and

  git send-email --to='maintainer@domain' --cc='qemu-devel@nongnu.org'
outgoing/*

are these command correct or there is a better way?

Frediano



Re: [Qemu-devel] [RFC] qcow2: 2 way to improve performance updating refcount

2011-07-22 Thread Frediano Ziglio
2011/7/22 Stefan Hajnoczi stefa...@gmail.com:
 On Fri, Jul 22, 2011 at 10:13 AM, Frediano Ziglio fredd...@gmail.com wrote:
 2011/7/22 Kevin Wolf kw...@redhat.com:
 Am 21.07.2011 18:17, schrieb Frediano Ziglio:
 Hi,
   after a snapshot is taken currently many write operations are quite
 slow due to
 - refcount updates (decrement old and increment new )
 - cluster allocation and file expansion
 - read-modify-write on partial clusters

 I found 2 way to improve refcount performance

 Method 1 - Lazy count
 Mainly do not take into account count for current snapshot, that is
 current snapshot counts as 0. This would require to add a
 current_snapshot in header and update refcount when current is changed.
 So for these operation
 - creating snapshot, performance are the same, just increment for old
 snapshot instead of the new one
 - normal write operations. As current snaphot counts as 0 there is not
 operations here so do not write any data
 - changing current snapshot, this is the worst case, you have to
 increment for the current snapshot and decrement for the new so it will
 take twice
 - deleting snapshot, if is the current just set current_snapshot to a
 dummy not existing value, if is not the current just decrement counters,
 no performance changes

 How would you do cluster allocation if you don't have refcounts any more
 that can tell you if a cluster is used or not?


 You have refcount, is only that current snapshot counts as 0. An
 example may help, start with A snapshot A counts as zero so all
 refcounts are 0, now we create a snapshot B and make it current so
 refcounts are 1

 A --- B

 If you change a cluster in snapshot B counts are still 1. If you go
 back to A counters are increment (cause you leave B) and then
 decrement (cause you enter in A).

 Perhaps the problem is how to distinguish 0 from allocated in
 current and not allocated. Yes, with which I suppose above it's a
 problem, but we can easily use -1 as not allocated. If current and
 refcount 0 mark as -1, if not current we would have to increment
 counters of current, mark current as -1 than decrement for deleting,
 yes in this case you have twice the time.

 I'm not sure I follow your last sentence but just having a different
 refcount value for not allocated vs allocated means allocating
 write requests will need to update refcounts.


Now you have 0 for not allocated and 0 for allocated. If you assume
current snapshot counting as 0 a 0 in refcount could mean an allocated
cluster in current snapshot not shared with other snapshots and if you
don't use -1 could be also a not allocated cluster.
Thinking in another way is not that you don't update refcounts but you
update refcounts with 0 addend (that's practically not changing
refcounts).
Question was: is possible to use this trick?

 But are non-append allocations common enough that we should bother
 with them in the allocating write path?  Can we append to the end of
 the image file for allocating writes and handle defragmentation
 elsewhere (i.e. get rid of unallocated clusters in the middle of the
 file)?

 Stefan


I think so but is better to have a way to know if a cluster is
allocated without having to scan all l2 tables.

Frediano



Re: [Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup

2011-07-22 Thread Frediano Ziglio
Il giorno ven, 22/07/2011 alle 12.10 +0200, Kevin Wolf ha scritto: 
 Am 22.07.2011 11:26, schrieb Frediano Ziglio:
  2011/7/22 Kevin Wolf kw...@redhat.com:
  Am 20.07.2011 15:56, schrieb Frediano Ziglio:
  These patches mostly cleanup some AIO code using coroutines.
  These patches apply to Kevin's repository, branch coroutine-block.
  Mostly they use stack instead of allocated AIO structure.
 
  Frediano Ziglio (5):
qcow: allocate QCowAIOCB structure using stack
qcow: QCowAIOCB field cleanup
qcow: move some blocks of code to avoid useless variable
  initialization
avoid dandling pointers
qcow: small optimization initializing QCowAIOCB
 
   block/qcow.c  |  210 
  +
   block/qcow2.c |   38 +++---
   2 files changed, 102 insertions(+), 146 deletions(-)
 
  Most of it looks good now. Did you include the RFC in the subject just
  because the coroutine work is in RFC state, too, or did you intend to
  tell me that I shouldn't merge yet?
 
  Kevin
 
  
  As these patches are first quite big patches I send (typo or small
  fixes do not counts) I just want to mark that I could write something
  really wrong. Just a way to avoid somebody having to send more patches
  and get more attention. Some projects are quite prone to merge even
  not that fine ones. I prefer to have some (a bit) pedantic comments
  and a real fix/improve.
  
  Now I removed the RFC from last update. The main reason is that I
  found your qemu-iotests repository which, I think should be merged to
  main repository, but it's just my opinion.
  Oh... qcow fails 004 test (even origin/coroutines-block) with a I/O error.
 
 Yup, you're right, I must have messed it up. Care to fix it or should I
 look into it?
 

Care but I don't know if I'll have time before Thursday. However I found
the problem, really strange. bdrv_read returns 0 for errors 0 for
success and... bytes read on partial read! Now a qcow image of 128m is
560 bytes so when you read sector 1 you get 48 which is not a problem
for qcow code. But if you replace bdrv_read with a bdrv_co_readv (your
latest patch on coroutine-block) bdrv_co_readv return -EINVAL on partial
read.

Frediano





[Qemu-devel] [RFC] qcow2: 2 way to improve performance updating refcount

2011-07-21 Thread Frediano Ziglio
Hi,
  after a snapshot is taken currently many write operations are quite
slow due to
- refcount updates (decrement old and increment new )
- cluster allocation and file expansion
- read-modify-write on partial clusters

I found 2 way to improve refcount performance

Method 1 - Lazy count
Mainly do not take into account count for current snapshot, that is
current snapshot counts as 0. This would require to add a
current_snapshot in header and update refcount when current is changed.
So for these operation
- creating snapshot, performance are the same, just increment for old
snapshot instead of the new one
- normal write operations. As current snaphot counts as 0 there is not
operations here so do not write any data
- changing current snapshot, this is the worst case, you have to
increment for the current snapshot and decrement for the new so it will
take twice
- deleting snapshot, if is the current just set current_snapshot to a
dummy not existing value, if is not the current just decrement counters,
no performance changes

Method 2 - Read-only parent
Here parents are readonly, instead of storing a refcount store a numeric
id of the owner. If the owner is not current copy the cluster and change
it. Considering this situation

A --- B --- C

B cannot be changed so in order to change B you have to create a new
snapshot

A --- B --- C
 \--- D

and change D. It can take more space cause you have in this case an
additional snapshot.

Operations:
- creating snapshot, really fast as you don't have to change any
ownership
- normal write operations. If owner is not the same allocate a new
cluster and just store a new owner for new cluster. Also ownership for
past-to-end cluster could be set all to current owner in order to
collapse allocations
- changing current snapshot, no changes required for owners
- deleting snapshot. Only possible if you have no child or a single
child. Will require to scan all l2 tables and merge and update owner.
Same performance

Regards
  Frediano Ziglio





[Qemu-devel] [RFC PATCH 2/5] use more stack

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |  133 +++---
 1 files changed, 62 insertions(+), 71 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index d19ef04..cd1f9e3 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -487,20 +487,12 @@ static int qcow_read(BlockDriverState *bs, int64_t 
sector_num,
 #endif
 
 typedef struct QCowAIOCB {
-BlockDriverAIOCB common;
+BlockDriverState *bs;
 int64_t sector_num;
 QEMUIOVector *qiov;
 uint8_t *buf;
 void *orig_buf;
 int nb_sectors;
-int n;
-uint64_t cluster_offset;
-uint8_t *cluster_data;
-struct iovec hd_iov;
-bool is_write;
-QEMUBH *bh;
-QEMUIOVector hd_qiov;
-BlockDriverAIOCB *hd_aiocb;
 } QCowAIOCB;
 
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
@@ -508,11 +500,9 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int is_write, QCowAIOCB *acb)
 {
 memset(acb, 0, sizeof(*acb));
-acb-common.bs = bs;
-acb-hd_aiocb = NULL;
+acb-bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
-acb-is_write = is_write;
 
 if (qiov-niov  1) {
 acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size);
@@ -522,37 +512,36 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 acb-buf = (uint8_t *)qiov-iov-iov_base;
 }
 acb-nb_sectors = nb_sectors;
-acb-n = 0;
-acb-cluster_offset = 0;
 return acb;
 }
 
 static int qcow_aio_read_cb(QCowAIOCB *acb)
 {
-BlockDriverState *bs = acb-common.bs;
+BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret;
-
-acb-hd_aiocb = NULL;
+int ret, n = 0;
+uint64_t cluster_offset = 0;
+struct iovec hd_iov;
+QEMUIOVector hd_qiov;
 
  redo:
 /* post process the read buffer */
-if (!acb-cluster_offset) {
+if (!cluster_offset) {
 /* nothing to do */
-} else if (acb-cluster_offset  QCOW_OFLAG_COMPRESSED) {
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* nothing to do */
 } else {
 if (s-crypt_method) {
 encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
-acb-n, 0,
+n, 0,
 s-aes_decrypt_key);
 }
 }
 
-acb-nb_sectors -= acb-n;
-acb-sector_num += acb-n;
-acb-buf += acb-n * 512;
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
 
 if (acb-nb_sectors == 0) {
 /* request completed */
@@ -560,57 +549,57 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 
 /* prepare next AIO request */
-acb-cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
+cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
  0, 0, 0, 0);
 index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
-acb-n = s-cluster_sectors - index_in_cluster;
-if (acb-n  acb-nb_sectors)
-acb-n = acb-nb_sectors;
+n = s-cluster_sectors - index_in_cluster;
+if (n  acb-nb_sectors)
+n = acb-nb_sectors;
 
-if (!acb-cluster_offset) {
+if (!cluster_offset) {
 if (bs-backing_hd) {
 /* read from the base image */
-acb-hd_iov.iov_base = (void *)acb-buf;
-acb-hd_iov.iov_len = acb-n * 512;
-qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1);
+hd_iov.iov_base = (void *)acb-buf;
+hd_iov.iov_len = n * 512;
+qemu_iovec_init_external(hd_qiov, hd_iov, 1);
 qemu_co_mutex_unlock(s-lock);
 ret = bdrv_co_readv(bs-backing_hd, acb-sector_num,
-acb-n, acb-hd_qiov);
+n, hd_qiov);
 qemu_co_mutex_lock(s-lock);
 if (ret  0) {
 return -EIO;
 }
 } else {
 /* Note: in this case, no need to wait */
-memset(acb-buf, 0, 512 * acb-n);
+memset(acb-buf, 0, 512 * n);
 goto redo;
 }
-} else if (acb-cluster_offset  QCOW_OFLAG_COMPRESSED) {
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* add AIO support for compressed blocks ? */
-if (decompress_cluster(bs, acb-cluster_offset)  0) {
+if (decompress_cluster(bs, cluster_offset)  0) {
 return -EIO;
 }
 memcpy(acb-buf,
-   s-cluster_cache + index_in_cluster * 512, 512 * acb-n);
+   s-cluster_cache + index_in_cluster * 512, 512 * n);
 goto redo;
 } else {
-if ((acb-cluster_offset  511) != 0) {
+if ((cluster_offset  511) != 0) {
 return -EIO;
 }
-acb-hd_iov.iov_base = (void *)acb-buf;
-acb-hd_iov.iov_len = acb-n * 512;
-qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1);
+hd_iov.iov_base = (void

[Qemu-devel] [RFC PATCH 5/5] qemu_aio_get used to clear all allocated buffer

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 007fb57..8fd1ee5 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -499,7 +499,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 int is_write, QCowAIOCB *acb)
 {
-memset(acb, 0, sizeof(*acb));
 acb-bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
@@ -509,6 +508,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 if (is_write)
 qemu_iovec_to_buffer(qiov, acb-buf);
 } else {
+acb-orig_buf = NULL;
 acb-buf = (uint8_t *)qiov-iov-iov_base;
 }
 acb-nb_sectors = nb_sectors;
-- 
1.7.1




[Qemu-devel] [RFC PATCH 3/5] more stack work

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |   53 ++---
 1 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index cd1f9e3..8ccd7d7 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret, n = 0;
-uint64_t cluster_offset = 0;
+int ret, n;
+uint64_t cluster_offset;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 
  redo:
-/* post process the read buffer */
-if (!cluster_offset) {
-/* nothing to do */
-} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
-/* nothing to do */
-} else {
-if (s-crypt_method) {
-encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
-n, 0,
-s-aes_decrypt_key);
-}
-}
-
-acb-nb_sectors -= n;
-acb-sector_num += n;
-acb-buf += n * 512;
-
 if (acb-nb_sectors == 0) {
 /* request completed */
 return 0;
 }
 
-/* prepare next AIO request */
+/* prepare next request */
 cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
  0, 0, 0, 0);
 index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
@@ -572,7 +555,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 } else {
 /* Note: in this case, no need to wait */
 memset(acb-buf, 0, 512 * n);
-goto redo;
 }
 } else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* add AIO support for compressed blocks ? */
@@ -581,7 +563,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 memcpy(acb-buf,
s-cluster_cache + index_in_cluster * 512, 512 * n);
-goto redo;
 } else {
 if ((cluster_offset  511) != 0) {
 return -EIO;
@@ -599,6 +580,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 }
 
+/* post process the read buffer */
+if (!cluster_offset) {
+/* nothing to do */
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
+/* nothing to do */
+} else {
+if (s-crypt_method) {
+encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
+n, 0,
+s-aes_decrypt_key);
+}
+}
+
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
+
 goto redo;
 }
 
@@ -630,16 +628,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb)
 int index_in_cluster;
 uint64_t cluster_offset;
 const uint8_t *src_buf;
-int ret, n = 0;
+int ret, n;
 uint8_t *cluster_data = NULL;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 
   redo:
-acb-nb_sectors -= n;
-acb-sector_num += n;
-acb-buf += n * 512;
-
 if (acb-nb_sectors == 0) {
 /* request completed */
 return 0;
@@ -681,6 +675,11 @@ static int qcow_aio_write_cb(QCowAIOCB *acb)
 if (ret  0) {
 return ret;
 }
+
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
+
 goto redo;
 }
 
-- 
1.7.1




[Qemu-devel] [RFC PATCH 0/5] Coroutines cleanup

2011-07-20 Thread Frediano Ziglio
These patches mostly cleanup some AIO code using coroutines.
These patches apply to Kevin's repository, branch coroutine-block.
Mostly they use stack instead of allocated AIO structure.

Frediano Ziglio (5):
  allocate AIO on stack
  use more stack
  more stack work
  avoid dandling pointers
  qemu_aio_get used to clear all allocated buffer

 block/qcow.c  |  210 +
 block/qcow2.c |   38 +++---
 2 files changed, 102 insertions(+), 146 deletions(-)




[Qemu-devel] [RFC PATCH 1/5] allocate AIO on stack

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c  |   52 
 block/qcow2.c |   38 +++---
 2 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 6447c2a..d19ef04 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -503,28 +503,12 @@ typedef struct QCowAIOCB {
 BlockDriverAIOCB *hd_aiocb;
 } QCowAIOCB;
 
-static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-if (acb-hd_aiocb)
-bdrv_aio_cancel(acb-hd_aiocb);
-qemu_aio_release(acb);
-}
-
-static AIOPool qcow_aio_pool = {
-.aiocb_size = sizeof(QCowAIOCB),
-.cancel = qcow_aio_cancel,
-};
-
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-int is_write)
+int is_write, QCowAIOCB *acb)
 {
-QCowAIOCB *acb;
-
-acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL);
-if (!acb)
-return NULL;
+memset(acb, 0, sizeof(*acb));
+acb-common.bs = bs;
 acb-hd_aiocb = NULL;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
@@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 return acb;
 }
 
-static int qcow_aio_read_cb(void *opaque)
+static int qcow_aio_read_cb(QCowAIOCB *acb)
 {
-QCowAIOCB *acb = opaque;
 BlockDriverState *bs = acb-common.bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
@@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num,
  int nb_sectors, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs-opaque;
-QCowAIOCB *acb;
+QCowAIOCB acb;
 int ret;
 
-acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
+qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb);
 
 qemu_co_mutex_lock(s-lock);
 do {
-ret = qcow_aio_read_cb(acb);
+ret = qcow_aio_read_cb(acb);
 } while (ret  0);
 qemu_co_mutex_unlock(s-lock);
 
-if (acb-qiov-niov  1) {
-qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
-qemu_vfree(acb-orig_buf);
+if (acb.qiov-niov  1) {
+qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size);
+qemu_vfree(acb.orig_buf);
 }
-qemu_aio_release(acb);
 
 return ret;
 }
 
-static int qcow_aio_write_cb(void *opaque)
+static int qcow_aio_write_cb(QCowAIOCB *acb)
 {
-QCowAIOCB *acb = opaque;
 BlockDriverState *bs = acb-common.bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
@@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t 
sector_num,
   int nb_sectors, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs-opaque;
-QCowAIOCB *acb;
+QCowAIOCB acb;
 int ret;
 
 s-cluster_cache_offset = -1; /* disable compressed cache */
 
-acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
+qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb);
 
 qemu_co_mutex_lock(s-lock);
 do {
-ret = qcow_aio_write_cb(acb);
+ret = qcow_aio_write_cb(acb);
 } while (ret  0);
 qemu_co_mutex_unlock(s-lock);
 
-if (acb-qiov-niov  1) {
-qemu_vfree(acb-orig_buf);
+if (acb.qiov-niov  1) {
+qemu_vfree(acb.orig_buf);
 }
-qemu_aio_release(acb);
 
 return ret;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index f07d550..edc068e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -388,17 +388,6 @@ typedef struct QCowAIOCB {
 QLIST_ENTRY(QCowAIOCB) next_depend;
 } QCowAIOCB;
 
-static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-qemu_aio_release(acb);
-}
-
-static AIOPool qcow2_aio_pool = {
-.aiocb_size = sizeof(QCowAIOCB),
-.cancel = qcow2_aio_cancel,
-};
-
 /*
  * Returns 0 when the request is completed successfully, 1 when there is still
  * a part left to do and -errno in error cases.
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb)
 static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
   QEMUIOVector *qiov, int nb_sectors,
   BlockDriverCompletionFunc *cb,
-  void *opaque, int is_write)
+  void *opaque, int is_write, QCowAIOCB *acb)
 {
-QCowAIOCB *acb;
-
-acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque);
-if (!acb)
-return NULL;
+memset(acb, 0, sizeof(*acb));
+acb-common.bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
 acb-is_write = is_write;
@@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t 
sector_num,
   int nb_sectors, QEMUIOVector *qiov

[Qemu-devel] [RFC PATCH 4/5] avoid dandling pointers

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 8ccd7d7..007fb57 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -616,6 +616,7 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num,
 if (acb.qiov-niov  1) {
 qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size);
 qemu_vfree(acb.orig_buf);
+acb.orig_buf = NULL;
 }
 
 return ret;
@@ -700,6 +701,7 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t 
sector_num,
 
 if (acb.qiov-niov  1) {
 qemu_vfree(acb.orig_buf);
+acb.orig_buf = NULL;
 }
 
 return ret;
-- 
1.7.1




[Qemu-devel] [PATCH] removed unused function

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block.c |   13 -
 block.h |2 --
 2 files changed, 0 insertions(+), 15 deletions(-)

diff --git a/block.c b/block.c
index 24a25d5..b879d2f 100644
--- a/block.c
+++ b/block.c
@@ -1108,19 +1108,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t 
offset,
 return 0;
 }
 
-/*
- * Writes to the file and ensures that no writes are reordered across this
- * request (acts as a barrier)
- *
- * Returns 0 on success, -errno in error cases.
- */
-int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
-const uint8_t *buf, int nb_sectors)
-{
-return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
-buf, BDRV_SECTOR_SIZE * nb_sectors);
-}
-
 /**
  * Truncate file to 'offset' bytes (needed only for file protocols)
  */
diff --git a/block.h b/block.h
index 859d1d9..6f3e5d0 100644
--- a/block.h
+++ b/block.h
@@ -85,8 +85,6 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
 const void *buf, int count);
 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
 const void *buf, int count);
-int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
-const uint8_t *buf, int nb_sectors);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
-- 
1.7.1




Re: [Qemu-devel] [RFC PATCH 5/5] qemu_aio_get used to clear all allocated buffer

2011-07-20 Thread Frediano Ziglio
2011/7/20 Kevin Wolf kw...@redhat.com:
 Am 20.07.2011 09:57, schrieb Frediano Ziglio:
 Signed-off-by: Frediano Ziglio fredd...@gmail.com
 ---
  block/qcow.c |    2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

 diff --git a/block/qcow.c b/block/qcow.c
 index 007fb57..8fd1ee5 100644
 --- a/block/qcow.c
 +++ b/block/qcow.c
 @@ -499,7 +499,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
          int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
          int is_write, QCowAIOCB *acb)
  {
 -    memset(acb, 0, sizeof(*acb));
      acb-bs = bs;
      acb-sector_num = sector_num;
      acb-qiov = qiov;
 @@ -509,6 +508,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
          if (is_write)
              qemu_iovec_to_buffer(qiov, acb-buf);
      } else {
 +        acb-orig_buf = NULL;
          acb-buf = (uint8_t *)qiov-iov-iov_base;
      }
      acb-nb_sectors = nb_sectors;

 What does this fix? Removing the memset looks like changing code for no
 obvious reason. Is there any state in acb that must survive qcow_aio_setup?

 Kevin


No, this is not a fix, just an optimization, memset was used to clear
entire structure but all fields are set by qcow_aio_setup, only
orig_buf left.
The comment, I must admit is terrible! And it's mine! I'll rewrite all
comments, perhaps merge some commits and send all patches again.

Frediano



[Qemu-devel] [RFC PATCH 5/5] qcow: small optimization initializing QCowAIOCB

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 007fb57..8fd1ee5 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -499,7 +499,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 int is_write, QCowAIOCB *acb)
 {
-memset(acb, 0, sizeof(*acb));
 acb-bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
@@ -509,6 +508,7 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 if (is_write)
 qemu_iovec_to_buffer(qiov, acb-buf);
 } else {
+acb-orig_buf = NULL;
 acb-buf = (uint8_t *)qiov-iov-iov_base;
 }
 acb-nb_sectors = nb_sectors;
-- 
1.7.1




[Qemu-devel] [RFC PATCH 0/5] qcow: coroutines cleanup

2011-07-20 Thread Frediano Ziglio
These patches mostly cleanup some AIO code using coroutines.
These patches apply to Kevin's repository, branch coroutine-block.
Mostly they use stack instead of allocated AIO structure.

Frediano Ziglio (5):
  qcow: allocate QCowAIOCB structure using stack
  qcow: QCowAIOCB field cleanup
  qcow: move some blocks of code to avoid useless variable
initialization
  avoid dandling pointers
  qcow: small optimization initializing QCowAIOCB

 block/qcow.c  |  210 +
 block/qcow2.c |   38 +++---
 2 files changed, 102 insertions(+), 146 deletions(-)




[Qemu-devel] [RFC PATCH 1/5] qcow: allocate QCowAIOCB structure using stack

2011-07-20 Thread Frediano Ziglio
instead of calling qemi_aio_get use stack

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c  |   52 
 block/qcow2.c |   38 +++---
 2 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 6447c2a..d19ef04 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -503,28 +503,12 @@ typedef struct QCowAIOCB {
 BlockDriverAIOCB *hd_aiocb;
 } QCowAIOCB;
 
-static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-if (acb-hd_aiocb)
-bdrv_aio_cancel(acb-hd_aiocb);
-qemu_aio_release(acb);
-}
-
-static AIOPool qcow_aio_pool = {
-.aiocb_size = sizeof(QCowAIOCB),
-.cancel = qcow_aio_cancel,
-};
-
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-int is_write)
+int is_write, QCowAIOCB *acb)
 {
-QCowAIOCB *acb;
-
-acb = qemu_aio_get(qcow_aio_pool, bs, NULL, NULL);
-if (!acb)
-return NULL;
+memset(acb, 0, sizeof(*acb));
+acb-common.bs = bs;
 acb-hd_aiocb = NULL;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
@@ -543,9 +527,8 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 return acb;
 }
 
-static int qcow_aio_read_cb(void *opaque)
+static int qcow_aio_read_cb(QCowAIOCB *acb)
 {
-QCowAIOCB *acb = opaque;
 BlockDriverState *bs = acb-common.bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
@@ -634,29 +617,27 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num,
  int nb_sectors, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs-opaque;
-QCowAIOCB *acb;
+QCowAIOCB acb;
 int ret;
 
-acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
+qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0, acb);
 
 qemu_co_mutex_lock(s-lock);
 do {
-ret = qcow_aio_read_cb(acb);
+ret = qcow_aio_read_cb(acb);
 } while (ret  0);
 qemu_co_mutex_unlock(s-lock);
 
-if (acb-qiov-niov  1) {
-qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
-qemu_vfree(acb-orig_buf);
+if (acb.qiov-niov  1) {
+qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size);
+qemu_vfree(acb.orig_buf);
 }
-qemu_aio_release(acb);
 
 return ret;
 }
 
-static int qcow_aio_write_cb(void *opaque)
+static int qcow_aio_write_cb(QCowAIOCB *acb)
 {
-QCowAIOCB *acb = opaque;
 BlockDriverState *bs = acb-common.bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
@@ -714,23 +695,22 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t 
sector_num,
   int nb_sectors, QEMUIOVector *qiov)
 {
 BDRVQcowState *s = bs-opaque;
-QCowAIOCB *acb;
+QCowAIOCB acb;
 int ret;
 
 s-cluster_cache_offset = -1; /* disable compressed cache */
 
-acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
+qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1, acb);
 
 qemu_co_mutex_lock(s-lock);
 do {
-ret = qcow_aio_write_cb(acb);
+ret = qcow_aio_write_cb(acb);
 } while (ret  0);
 qemu_co_mutex_unlock(s-lock);
 
-if (acb-qiov-niov  1) {
-qemu_vfree(acb-orig_buf);
+if (acb.qiov-niov  1) {
+qemu_vfree(acb.orig_buf);
 }
-qemu_aio_release(acb);
 
 return ret;
 }
diff --git a/block/qcow2.c b/block/qcow2.c
index f07d550..edc068e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -388,17 +388,6 @@ typedef struct QCowAIOCB {
 QLIST_ENTRY(QCowAIOCB) next_depend;
 } QCowAIOCB;
 
-static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-qemu_aio_release(acb);
-}
-
-static AIOPool qcow2_aio_pool = {
-.aiocb_size = sizeof(QCowAIOCB),
-.cancel = qcow2_aio_cancel,
-};
-
 /*
  * Returns 0 when the request is completed successfully, 1 when there is still
  * a part left to do and -errno in error cases.
@@ -528,13 +517,10 @@ static int qcow2_aio_read_cb(QCowAIOCB *acb)
 static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
   QEMUIOVector *qiov, int nb_sectors,
   BlockDriverCompletionFunc *cb,
-  void *opaque, int is_write)
+  void *opaque, int is_write, QCowAIOCB *acb)
 {
-QCowAIOCB *acb;
-
-acb = qemu_aio_get(qcow2_aio_pool, bs, cb, opaque);
-if (!acb)
-return NULL;
+memset(acb, 0, sizeof(*acb));
+acb-common.bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
 acb-is_write = is_write;
@@ -554,19 +540,18 @@ static int qcow2_co_readv(BlockDriverState *bs, int64_t 
sector_num,
   int

[Qemu-devel] [RFC PATCH 4/5] avoid dandling pointers

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 8ccd7d7..007fb57 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -616,6 +616,7 @@ static int qcow_co_readv(BlockDriverState *bs, int64_t 
sector_num,
 if (acb.qiov-niov  1) {
 qemu_iovec_from_buffer(acb.qiov, acb.orig_buf, acb.qiov-size);
 qemu_vfree(acb.orig_buf);
+acb.orig_buf = NULL;
 }
 
 return ret;
@@ -700,6 +701,7 @@ static int qcow_co_writev(BlockDriverState *bs, int64_t 
sector_num,
 
 if (acb.qiov-niov  1) {
 qemu_vfree(acb.orig_buf);
+acb.orig_buf = NULL;
 }
 
 return ret;
-- 
1.7.1




[Qemu-devel] [RFC PATCH 2/5] qcow: QCowAIOCB field cleanup

2011-07-20 Thread Frediano Ziglio
remove unused field from this structure and put some of them in 
qcow_aio_read_cb and qcow_aio_write_cb

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |  133 +++---
 1 files changed, 62 insertions(+), 71 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index d19ef04..cd1f9e3 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -487,20 +487,12 @@ static int qcow_read(BlockDriverState *bs, int64_t 
sector_num,
 #endif
 
 typedef struct QCowAIOCB {
-BlockDriverAIOCB common;
+BlockDriverState *bs;
 int64_t sector_num;
 QEMUIOVector *qiov;
 uint8_t *buf;
 void *orig_buf;
 int nb_sectors;
-int n;
-uint64_t cluster_offset;
-uint8_t *cluster_data;
-struct iovec hd_iov;
-bool is_write;
-QEMUBH *bh;
-QEMUIOVector hd_qiov;
-BlockDriverAIOCB *hd_aiocb;
 } QCowAIOCB;
 
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
@@ -508,11 +500,9 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 int is_write, QCowAIOCB *acb)
 {
 memset(acb, 0, sizeof(*acb));
-acb-common.bs = bs;
-acb-hd_aiocb = NULL;
+acb-bs = bs;
 acb-sector_num = sector_num;
 acb-qiov = qiov;
-acb-is_write = is_write;
 
 if (qiov-niov  1) {
 acb-buf = acb-orig_buf = qemu_blockalign(bs, qiov-size);
@@ -522,37 +512,36 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
 acb-buf = (uint8_t *)qiov-iov-iov_base;
 }
 acb-nb_sectors = nb_sectors;
-acb-n = 0;
-acb-cluster_offset = 0;
 return acb;
 }
 
 static int qcow_aio_read_cb(QCowAIOCB *acb)
 {
-BlockDriverState *bs = acb-common.bs;
+BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret;
-
-acb-hd_aiocb = NULL;
+int ret, n = 0;
+uint64_t cluster_offset = 0;
+struct iovec hd_iov;
+QEMUIOVector hd_qiov;
 
  redo:
 /* post process the read buffer */
-if (!acb-cluster_offset) {
+if (!cluster_offset) {
 /* nothing to do */
-} else if (acb-cluster_offset  QCOW_OFLAG_COMPRESSED) {
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* nothing to do */
 } else {
 if (s-crypt_method) {
 encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
-acb-n, 0,
+n, 0,
 s-aes_decrypt_key);
 }
 }
 
-acb-nb_sectors -= acb-n;
-acb-sector_num += acb-n;
-acb-buf += acb-n * 512;
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
 
 if (acb-nb_sectors == 0) {
 /* request completed */
@@ -560,57 +549,57 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 
 /* prepare next AIO request */
-acb-cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
+cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
  0, 0, 0, 0);
 index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
-acb-n = s-cluster_sectors - index_in_cluster;
-if (acb-n  acb-nb_sectors)
-acb-n = acb-nb_sectors;
+n = s-cluster_sectors - index_in_cluster;
+if (n  acb-nb_sectors)
+n = acb-nb_sectors;
 
-if (!acb-cluster_offset) {
+if (!cluster_offset) {
 if (bs-backing_hd) {
 /* read from the base image */
-acb-hd_iov.iov_base = (void *)acb-buf;
-acb-hd_iov.iov_len = acb-n * 512;
-qemu_iovec_init_external(acb-hd_qiov, acb-hd_iov, 1);
+hd_iov.iov_base = (void *)acb-buf;
+hd_iov.iov_len = n * 512;
+qemu_iovec_init_external(hd_qiov, hd_iov, 1);
 qemu_co_mutex_unlock(s-lock);
 ret = bdrv_co_readv(bs-backing_hd, acb-sector_num,
-acb-n, acb-hd_qiov);
+n, hd_qiov);
 qemu_co_mutex_lock(s-lock);
 if (ret  0) {
 return -EIO;
 }
 } else {
 /* Note: in this case, no need to wait */
-memset(acb-buf, 0, 512 * acb-n);
+memset(acb-buf, 0, 512 * n);
 goto redo;
 }
-} else if (acb-cluster_offset  QCOW_OFLAG_COMPRESSED) {
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* add AIO support for compressed blocks ? */
-if (decompress_cluster(bs, acb-cluster_offset)  0) {
+if (decompress_cluster(bs, cluster_offset)  0) {
 return -EIO;
 }
 memcpy(acb-buf,
-   s-cluster_cache + index_in_cluster * 512, 512 * acb-n);
+   s-cluster_cache + index_in_cluster * 512, 512 * n);
 goto redo;
 } else {
-if ((acb-cluster_offset  511) != 0) {
+if ((cluster_offset  511) != 0) {
 return -EIO;
 }
-acb-hd_iov.iov_base = (void *)acb-buf;
-acb-hd_iov.iov_len = acb-n

[Qemu-devel] [RFC PATCH 3/5] qcow: move some blocks of code to avoid useless variable initialization

2011-07-20 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |   53 ++---
 1 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index cd1f9e3..8ccd7d7 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -520,35 +520,18 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 BlockDriverState *bs = acb-bs;
 BDRVQcowState *s = bs-opaque;
 int index_in_cluster;
-int ret, n = 0;
-uint64_t cluster_offset = 0;
+int ret, n;
+uint64_t cluster_offset;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 
  redo:
-/* post process the read buffer */
-if (!cluster_offset) {
-/* nothing to do */
-} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
-/* nothing to do */
-} else {
-if (s-crypt_method) {
-encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
-n, 0,
-s-aes_decrypt_key);
-}
-}
-
-acb-nb_sectors -= n;
-acb-sector_num += n;
-acb-buf += n * 512;
-
 if (acb-nb_sectors == 0) {
 /* request completed */
 return 0;
 }
 
-/* prepare next AIO request */
+/* prepare next request */
 cluster_offset = get_cluster_offset(bs, acb-sector_num  9,
  0, 0, 0, 0);
 index_in_cluster = acb-sector_num  (s-cluster_sectors - 1);
@@ -572,7 +555,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 } else {
 /* Note: in this case, no need to wait */
 memset(acb-buf, 0, 512 * n);
-goto redo;
 }
 } else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
 /* add AIO support for compressed blocks ? */
@@ -581,7 +563,6 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 memcpy(acb-buf,
s-cluster_cache + index_in_cluster * 512, 512 * n);
-goto redo;
 } else {
 if ((cluster_offset  511) != 0) {
 return -EIO;
@@ -599,6 +580,23 @@ static int qcow_aio_read_cb(QCowAIOCB *acb)
 }
 }
 
+/* post process the read buffer */
+if (!cluster_offset) {
+/* nothing to do */
+} else if (cluster_offset  QCOW_OFLAG_COMPRESSED) {
+/* nothing to do */
+} else {
+if (s-crypt_method) {
+encrypt_sectors(s, acb-sector_num, acb-buf, acb-buf,
+n, 0,
+s-aes_decrypt_key);
+}
+}
+
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
+
 goto redo;
 }
 
@@ -630,16 +628,12 @@ static int qcow_aio_write_cb(QCowAIOCB *acb)
 int index_in_cluster;
 uint64_t cluster_offset;
 const uint8_t *src_buf;
-int ret, n = 0;
+int ret, n;
 uint8_t *cluster_data = NULL;
 struct iovec hd_iov;
 QEMUIOVector hd_qiov;
 
   redo:
-acb-nb_sectors -= n;
-acb-sector_num += n;
-acb-buf += n * 512;
-
 if (acb-nb_sectors == 0) {
 /* request completed */
 return 0;
@@ -681,6 +675,11 @@ static int qcow_aio_write_cb(QCowAIOCB *acb)
 if (ret  0) {
 return ret;
 }
+
+acb-nb_sectors -= n;
+acb-sector_num += n;
+acb-buf += n * 512;
+
 goto redo;
 }
 
-- 
1.7.1




[Qemu-devel] [PATCH] avoid core reading with bdrv_read (qemu-io)

2011-07-19 Thread Frediano Ziglio
This patch apply to kevin coroutine-block branch and avoid code. It
fix qcow: Use coroutines patch. Test case:

$ ./qemu-img create -f qcow aaa.img 1G
Formatting 'aaa.img', fmt=qcow size=1073741824 encryption=off
$ ./qemu-io aaa.img
qemu-io read 1024 1024
Segmentation fault

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 block/qcow.c |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 6f7973c..1386e92 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -573,7 +573,8 @@ static int qcow_aio_read_cb(void *opaque)

 if (acb-nb_sectors == 0) {
 /* request completed */
-qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
+if (acb-orig_buf)
+qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
 return 0;
 }

@@ -648,6 +649,7 @@ static int qcow_co_readv(BlockDriverState *bs,
int64_t sector_num,

 if (acb-qiov-niov  1) {
 qemu_vfree(acb-orig_buf);
+acb-orig_buf = NULL;
 }
 qemu_aio_release(acb);

@@ -729,6 +731,7 @@ static int qcow_co_writev(BlockDriverState *bs,
int64_t sector_num,

 if (acb-qiov-niov  1) {
 qemu_vfree(acb-orig_buf);
+acb-orig_buf = NULL;
 }
 qemu_aio_release(acb);

-- 
1.7.1



[Qemu-devel] coroutines and block I/O considerations

2011-07-19 Thread Frediano Ziglio
Hi,
  I'm exercise myself in block I/O layer and I decided to test
coroutine branch cause I find it easier to use instead of normal
callback. Looking at normal code there are a lot of rows in source to
save/restore state and declare callbacks and is not that easier to
understand the normal flow. At the end I would like to create a new
image format to get rid of some performance problem I encounter using
writethrough and snapshots. I have some questions regard block I/O and
also coroutines

1- threading model. I don't understand it. I can see that aio pool
routines does not contain locking code so I think aio layer is mainly
executed in a single thread. I saw introduction of some locking using
coroutines so I think coroutines are now called from different threads
and needs lock (current implementation serialize all device
operations)

2- memory considerations on coroutines. Beside coroutines allow more
readable code I wonder if somebody considered memory. For every
coroutines a different stack has to be allocated. For instance
ucontext and win32 implementation use 4mb. Assuming 128 concurrent AIO
this require about 512mb of ram (mostly only committed but not used
and coroutines are reused).

About snapshot and block i/o I think that using external snapshot
would help making some stuff easier. By external snapshot I mean
creating a new image with backing file as current image file and using
this new image for future operations. This would allow for instance
- support snapshot with every format (even raw)
- making snapshot backup using external programs (even from different
hosts using clustered file system and without many locking issues as
original image is now read-only)
- convert images live (just snapshot, qemu-img convert, remove snapshot)

Regards
  Frediano



Re: [Qemu-devel] [PATCH] avoid core reading with bdrv_read (qemu-io)

2011-07-19 Thread Frediano Ziglio
2011/7/19 Kevin Wolf kw...@redhat.com:
 Am 19.07.2011 09:33, schrieb Frediano Ziglio:
 This patch apply to kevin coroutine-block branch and avoid code. It
 fix qcow: Use coroutines patch. Test case:

 $ ./qemu-img create -f qcow aaa.img 1G
 Formatting 'aaa.img', fmt=qcow size=1073741824 encryption=off
 $ ./qemu-io aaa.img
 qemu-io read 1024 1024
 Segmentation fault

 Signed-off-by: Frediano Ziglio fredd...@gmail.com

 Thanks for the report. I'll update the patch, but in a slightly
 different way that matches the old code better:

 diff --git a/block/qcow.c b/block/qcow.c
 index 6f7973c..6447c2a 100644
 --- a/block/qcow.c
 +++ b/block/qcow.c
 @@ -573,7 +573,6 @@ static int qcow_aio_read_cb(void *opaque)

     if (acb-nb_sectors == 0) {
         /* request completed */
 -        qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
         return 0;
     }

 @@ -647,6 +646,7 @@ static int qcow_co_readv(BlockDriverState *bs,
 int64_t sector_num,
     qemu_co_mutex_unlock(s-lock);

     if (acb-qiov-niov  1) {
 +        qemu_iovec_from_buffer(acb-qiov, acb-orig_buf, acb-qiov-size);
         qemu_vfree(acb-orig_buf);
     }
     qemu_aio_release(acb);


Yes, my patch also removed some dandling pointer which I don't like
but are not a problem with current code.
In case of ret  0 (error) your code could copy data that probably are
not initialized. I don't know if data is used in case of failure but
in case memory is shared with guest (I don't know, perhaps using
virtio) this lead to security issues. Also some memory debugger like
valgrind could not like that copy.

Frediano



Re: [Qemu-devel] External COW format for raw images

2011-07-19 Thread Frediano Ziglio
2011/7/19 Robert Wang wdon...@linux.vnet.ibm.com:
 As you known, raw image is very popular,but the raw image format does
 NOT support Copy-On-Write,a raw image file can NOT be used as a copy
 destination, then image streaming/Live Block Copy will NOT work.

 To fix this, we need to add a new block driver raw-cow to QEMU. If
 finished, we can use qemu-img like this:
 qemu-img create -f raw-cow -o backing_file=ubuntu.img,raw_file=my_vm.img
 my_vm.raw-cow

 1) ubuntu.img is the backing file, my_vm.img is a raw file,
 my_vm.raw-cow stores a COW bitmap related to my_vm.img.

 2) If the entire COW bitmap is set to dirty flag then we can get all
 information from my_vm.img and can ignore ubuntu.img and my_vm.raw-cow
 from now.

 To implement this, I think I can follow these steps:
 1) Add a new member to BlockDriverState struct:
 char raw_file[1024];
 This member will track raw_file parameter related to raw-cow file from
 command line.

 2)      * Create a new file block/raw-cow.c. It will be much more like the
 mixture of block/cow.c and block/raw.c.

 So I will change some functions in cow.c and raw.c to none-static, then
 raw-cow.c can re-use them. When read operation occurs, determine whether
 dirty flag in raw-cow image is set. If true, read directly from the raw
 file. After write operation, set related dirty flag in raw-cow image.
 And other functions might also be modified.

        * Of course, format_name member of BlockDriver struct will be 
 raw-cow.
 And in order to keep relationship with raw file( like my_vm.img) ,
 raw_cow_header struct should be
 struct raw_cow_header {
 uint32_t magic;
 uint32_t version;
 char backing_file[1024];
 char raw_file[1024];/* added*/
 int32_t mtime;
 uint64_t size;
 uint32_t sectorsize;
 };
        * Struct raw_cow_create_options should be one member plus based on
 cow_create_options:
 {
 .name = BLOCK_OPT_RAW_FILE,
 .type = OPT_STRING,
 .help = Raw file name
 },

 3) Add bdrv_get_raw_filename in img_info function of qemu-img.c. In
 bdrv_get_raw_filename, if the format of the image file is raw-cow,
 print the related raw file.

 Do you think my approach is right?
 Thank you.


I don't understand if you mean just a way to track clusters/sectors
changed or a new way to implement snapshotting, something that writing
data just store original to cow like like:


normal backfile

is allocated on image
  write to image
else
  allocate on image
  copy from backing to image
  write to image (patch before previous write)

cow-raw (inverse backfile)

is allocated on image
  write to backing
else
  allocate on image
  copy from backing to image
  write to backing


that is


is not allocated on image
  allocate on image
  copy from backing to image
is normal backing
  write to image
else
  write to backing


Frediano



[Qemu-devel] Possible leak in block/qcow.c

2011-07-18 Thread Frediano Ziglio
Hi,
  I noted that there are two cluster_data member in block/qcow.c, one
in BDRVQcowState, the other in QCowAIOCB. The last one is used in
qcow_aio_write_cb to hold buffer for encrypt the cluster before write
but I cannot find any related qemu_free while I can find many place
where BDRVQcowState::cluster_data is freed. It seems to me a leak but
I don't understand why nobody reported this problem before (it should
happen at every write so anybody using qcow encrypted should rapidly
see this problem). Perhaps there is a sort of garbage collector I'm
not aware?

Frediano Ziglio



Re: [Qemu-devel] Default cache mode

2011-06-29 Thread Frediano Ziglio
2011/6/29 Anthony Liguori anth...@codemonkey.ws:
 On 06/29/2011 07:16 AM, Kevin Wolf wrote:

 Am 29.06.2011 14:06, schrieb Anthony Liguori:

 On 06/29/2011 06:59 AM, Kevin Wolf wrote:

 Hi,

 I think we have touched this topic before during some IRC discussions or
 somewhere deep in a mailing list thread, but I think it hasn't been
 discussed on the list.

 Our default cache mode of cache=writethrough is extremely conservative
 and provides absolute safety at the cost of performance,

 But for the most part, we track bare metal fairly well in terms of block
 performance, no?

 Or are you really referring to qcow2 as a specific example?  In the
 past, we used a different default caching mode for qcow2.  I think that
 could be done again if there was a compelling reason.

 No, people are also complaining about bad performance with raw. Which
 isn't really surprising when you do a flush after each single write
 request. O_SYNC is really much more than is needed in the average case.

 Which file system on the host?

 At any rate, I'm a big fan of making wce tunable in the guest and then I
 think setting wce=1 is quite reasonable to do by default.

 Regards,

 Anthony Liguori


 Kevin




Personally I think default lead to poor performance but currently
people know that even critical application are handled correctly.
Assuming a client connect to a database server on a qemu guest when
server reply transaction ok you know that even a host crash lead to
a successful transaction. At least the fact that this assumption won't
be true has to be stated.

Lately I'm doing some extensive testing on image performance and I
noted that are not only image dependent but also filesystem (where
image is stored) dependent but when data are allocated speed is good.
raw disks are not so fast as expected? Try to use fallocate command to
allocate the file than do a dd to fill the entire file with zeros and
writethrough will perform very well. Allocating raw with qemu-img lead
to a normal sparse file which written with O_DSYNC flag is quite slow
due to extents allocation and fragmentation.

Regards
 Frediano Ziglio



Re: [Qemu-devel] [RFC PATCH v2] Specification for qcow2 version 3

2011-06-28 Thread Frediano Ziglio
2011/6/27 Kevin Wolf kw...@redhat.com:
 This is the second draft for what I think could be added when we increase 
 qcow2's
 version number to 3. This includes points that have been made by several 
 people
 over the past few months. We're probably not going to implement this next 
 week,
 but I think it's important to get discussions started early, so here it is.

 Changes implemented in this RFC:

 - Added compatible/incompatible/auto-clear feature bits plus an optional
  feature name table to allow useful error messages even if an older version
  doesn't know some feature at all.

 - Added a dirty flag which tells that the refcount may not be accurate (QED
  mode). This means that we can save writes to the refcount table with
  cache=writethrough, but isn't really useful otherwise since Qcow2Cache.

 - Configurable refcount width. If you don't want to use internal snapshots,
  make refcounts one bit and save cache space and I/O.

 - Added subclusters. This separate the COW size (one subcluster, I'm thinking
  of 64k default size here) from the allocation size (one cluster, 2M). Less
  fragmentation, less metadata, but still reasonable COW granularity.

  This also allows to preallocate clusters, but none of their subclusters. You
  can have an image that is like raw + COW metadata, and you can also
  preallocate metadata for images with backing files.

 - Zero cluster flags. This allows discard even with a backing file that 
 doesn't
  contain zeros. It is also useful for copy-on-read/image streaming, as you'll
  want to keep sparseness without accessing the remote image for an unallocated
  cluster all the time.

 - Fixed internal snapshot metadata to use 64 bit VM state size. You can't save
  a snapshot of a VM with = 4 GB RAM today.

 Possible future additions:

 - Add per-L2-table dirty flag to L1?
 - Add per-refcount-block full flag to refcount table?

Hi,
  thinking about image improvement I would add

- GUID for image and backing file
- relative path for backing file

This would help finding images in a distributed environment or if file
are moved, ie: gfs/nfs/ocfs mounted in different mount points, backing
used a template in a different images directory and move this
directory somewhere else. Also with GUID a possible higher level could
manage a GUID - file image db.

I was also think about a backing file length field to support
resizing but probably can be implemented with zero cluster. Assume you
have a image of 5gb, create a new image with first image as backing
one, now resize second image from 5gb to 3gb then resize it again
(after some works) to 10gb, part from 3gb to 5gb should not be read
from backing file.

Also a bit in l2 offset to say there is no l2 table cause all
clusters in l2 are contiguous so we avoid entirely l2. Obviously this
require an optimization step to detect or create such condition.

For check perhaps it would be helpful to save not only a flag but also
a size where data are ok (for instance already allocated and with
refcount saved correctly).

A possible optimization for refcount would be to initialize refcount
to 1 instead of 0. When clusters are allocated at end-of-file this
would not require refcount change and would be easy to check file size
to see which clusters are marked as allocated but not present.

Fields for sectors and heads to support old CHS systems ??

This mail sound quite strange to me, I thought qed would be the future
of qcow2 but I must be really wrong.

I think a big limit for current qed and qcow2 implementation is the
serialization of metadata informations (qcow2 use synchronous
operation while qed use a queue). I used bonnie++ program to test
speed and performances allocating data is about 15-20% of allocated
one. I'm working (in the few spare time I have) improving it.
VirtualBox and ESX use large clusters (1mb) to mitigate
allocation/metadata problem. Perhaps raising default cluster size
would help changing a spread idea of bad qemu i/o performance.

Regards
  Frediano Ziglio



[Qemu-devel] [PATCH 1/1] fix operator precedence

2011-06-20 Thread Frediano Ziglio
Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 cmd.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmd.c b/cmd.c
index db2c9c4..ecca167 100644
--- a/cmd.c
+++ b/cmd.c
@@ -486,7 +486,7 @@ timestr(
snprintf(ts, size, %u:%02u.%02u,
(unsigned int) MINUTES(tv-tv_sec),
(unsigned int) SECONDS(tv-tv_sec),
-   (unsigned int) usec * 100);
+   (unsigned int) (usec * 100));
return;
}
format |= VERBOSE_FIXED_TIME;   /* fallback if hours needed */
@@ -497,9 +497,9 @@ timestr(
(unsigned int) HOURS(tv-tv_sec),
(unsigned int) MINUTES(tv-tv_sec),
(unsigned int) SECONDS(tv-tv_sec),
-   (unsigned int) usec * 100);
+   (unsigned int) (usec * 100));
} else {
-   snprintf(ts, size, 0.%04u sec, (unsigned int) usec * 1);
+   snprintf(ts, size, 0.%04u sec, (unsigned int) (usec * 1));
}
 }

-- 
1.7.1



[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.

2010-02-20 Thread Frediano Ziglio
Add a QEMU timer only when needed (timeout status not set, timeout
irq wanted and timer set).

This patch is required for Darwin. Patch has been tested under
FreeBSD, Darwin and Linux.
---
 hw/rtl8139.c |  139 +++---
 1 files changed, 84 insertions(+), 55 deletions(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..72e2242 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling 
rx/tx operation
+ *
+ *  2010-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */
 
 #include hw.h
@@ -60,9 +64,6 @@
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1
 
-/* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
-
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
@@ -491,9 +492,12 @@ typedef struct RTL8139State {
 
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
 
 } RTL8139State;
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, 
uint32_t val)
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, 
uint32_t val)
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2755,46 @@ static void rtl8139_io_writew(void *opaque, uint8_t 
addr, uint32_t val)
 }
 }
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+int64_t pci_time, next_time;
+uint32_t low_pci;
+
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2840,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t 
addr, uint32_t val)
 
 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;
 
 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, 
val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val;
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+}
 break;
 
 default:
@@ -3000,7 +3059,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t 
addr)
 break;
 
 case Timer:
-ret = s-TCTR;
+ret

Re: [Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.

2010-02-20 Thread Frediano Ziglio
2010/2/20 Igor Kovalenko igor.v.kovale...@gmail.com:
 On Sat, Feb 20, 2010 at 2:26 AM, Anthony Liguori anth...@codemonkey.ws 
 wrote:
 On 02/19/2010 04:22 PM, Alexander Graf wrote:

 On 19.02.2010, at 22:50, Anthony Liguori wrote:



 On 02/16/2010 02:35 AM, Frediano Ziglio wrote:


 Add a QEMU timer only when needed (timeout status not set, timeout
 irq wanted and timer set).

 This patch is required for Darwin. Patch has been tested under
 FreeBSD, Darwin and Linux.

 Signed-off-by: Frediano Zigliofredd...@gmail.com



 Alex, I know you've played with this feature for OS X in the past.

 Is this something we should be enabling unconditionally?


 I've played with it and realized that

 1) OSX's rtl8139 driver uses this timer instead of tx/rx interrupts to
 process the queue
 2) it was broken

 I haven't gotten around to test if this patch makes it work, but I'd
 suppose so.

 So yes, I think it should be enabled by default. It's a feature real hw
 has, so we should emulate it.


 The thing that concerns me is why it was disabled by default.  Igor, what
 was the reasoning for disabling it if you recall?

 It was very inefficient pci timer implementation which fired callback
 frequently.
 Once it was found that (freebsd?) was relying on pci timer in realtek driver
 to communicate we just added missing piece.

 Now this patch fixes that inefficiency issue.


I'm not that use with this ML way... sorry if somebody will receive a
lot of SPAM...

I updated my patch with Anthony suggestions (just code style).

Previous code (current git) issue a timer every pci tick (about 33
millions per second!) in order to compute correctly TCTR. This patch
enable QEMU timers only when strictly needed (for interrupt purposes)
computing TCTR when needed (only on driver read).

I tested with patch under Linux, FreeBSD and Darwin (Mac OS X) and it
works correctly. I also bought a physical card to test the behaviour
with a small test program and test program work with emulated code too
(previously code does not). I sent the test program but I don't know
if is possible to integrate it into QEMU code (is a standalone test).

Frediano Ziglio




[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.

2010-02-20 Thread Frediano Ziglio
Add a QEMU timer only when needed (timeout status not set, timeout
irq wanted and timer set).

This patch is required for Darwin. Patch has been tested under
FreeBSD, Darwin and Linux.
---
 hw/rtl8139.c |  139 +++---
 1 files changed, 84 insertions(+), 55 deletions(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..72e2242 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling 
rx/tx operation
+ *
+ *  2010-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */
 
 #include hw.h
@@ -60,9 +64,6 @@
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1
 
-/* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
-
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
@@ -491,9 +492,12 @@ typedef struct RTL8139State {
 
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
 
 } RTL8139State;
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, 
uint32_t val)
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, 
uint32_t val)
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2755,46 @@ static void rtl8139_io_writew(void *opaque, uint8_t 
addr, uint32_t val)
 }
 }
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+int64_t pci_time, next_time;
+uint32_t low_pci;
+
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2840,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t 
addr, uint32_t val)
 
 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;
 
 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, 
val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val;
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+}
 break;
 
 default:
@@ -3000,7 +3059,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t 
addr)
 break;
 
 case Timer:
-ret = s-TCTR;
+ret

[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.

2010-02-20 Thread Frediano Ziglio
Add a QEMU timer only when needed (timeout status not set, timeout
irq wanted and timer set).

This patch is required for Darwin. Patch has been tested under
FreeBSD, Darwin and Linux.

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 hw/rtl8139.c |  139 +++---
 1 files changed, 84 insertions(+), 55 deletions(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..72e2242 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling 
rx/tx operation
+ *
+ *  2010-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */
 
 #include hw.h
@@ -60,9 +64,6 @@
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1
 
-/* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
-
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
@@ -491,9 +492,12 @@ typedef struct RTL8139State {
 
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
 
 } RTL8139State;
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, 
uint32_t val)
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, 
uint32_t val)
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2755,46 @@ static void rtl8139_io_writew(void *opaque, uint8_t 
addr, uint32_t val)
 }
 }
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+int64_t pci_time, next_time;
+uint32_t low_pci;
+
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2840,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t 
addr, uint32_t val)
 
 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;
 
 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, 
val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val;
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+}
 break;
 
 default:
@@ -3000,7 +3059,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t 
addr)
 break;
 
 case Timer

[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139.

2010-02-16 Thread Frediano Ziglio
Add a QEMU timer only when needed (timeout status not set, timeout
irq wanted and timer set).

This patch is required for Darwin. Patch has been tested under
FreeBSD, Darwin and Linux.

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 hw/rtl8139.c |  136 ++---
 1 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..e43c15c 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling 
rx/tx operation
+ *
+ *  2010-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */
 
 #include hw.h
@@ -60,9 +64,6 @@
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1
 
-/* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
-
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
@@ -491,9 +492,12 @@ typedef struct RTL8139State {
 
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
 
 } RTL8139State;
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, 
uint32_t val)
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, 
uint32_t val)
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2755,43 @@ static void rtl8139_io_writew(void *opaque, uint8_t 
addr, uint32_t val)
 }
 }
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+uint32_t low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2837,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t 
addr, uint32_t val)
 
 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;
 
 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, 
val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val;
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+}
 break;
 
 default:
@@ -3000,7 +3056,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t 
addr)
 break;
 
 case Timer:
-ret = s-TCTR

[Qemu-devel] [PATCH] rewrote timer implementation for rtl8139. Add a QEMU timer only when needed (timeout status not set, timeout irq wanted and timer set).

2010-02-09 Thread Frediano Ziglio

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 hw/rtl8139.c |  136 ++---
 1 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..e43c15c 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling 
rx/tx operation
+ *
+ *  2010-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */
 
 #include hw.h
@@ -60,9 +64,6 @@
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1
 
-/* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
-
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
@@ -491,9 +492,12 @@ typedef struct RTL8139State {
 
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
 
 } RTL8139State;
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, 
uint32_t val)
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2561,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, 
uint32_t val)
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2755,43 @@ static void rtl8139_io_writew(void *opaque, uint8_t 
addr, uint32_t val)
 }
 }
 
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+uint32_t low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2837,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t 
addr, uint32_t val)
 
 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;
 
 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, 
val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val;
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+}
 break;
 
 default:
@@ -3000,7 +3056,8 @@ static uint32_t rtl8139_io_readl(void *opaque, uint8_t 
addr)
 break;
 
 case Timer:
-ret = s-TCTR;
+ret = muldiv64(qemu_get_clock(vm_clock) - s-TCTR_base,
+   PCI_FREQUENCY, get_ticks_per_sec());
 DEBUG_PRINT((RTL8139: TCTR Timer read val=0x%08x

[Qemu-devel] [PATCH] rtl8139 timer interrupt rewrote

2010-02-08 Thread Frediano Ziglio
rewrote timer implementation for rtl8139. Add a QEMU timer only when needed
(timeout status not set, timeout irq wanted and timer set).

Signed-off-by: Frediano Ziglio fredd...@gmail.com
---
 hw/rtl8139.c |  136 ++---
 1 files changed, 81 insertions(+), 55 deletions(-)

diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..3b332a6 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when
enabling rx/tx operation
+ *
+ *  2009-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */

 #include hw.h
@@ -60,9 +64,6 @@
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1

-/* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
-
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
@@ -491,9 +492,12 @@ typedef struct RTL8139State {

 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;

 } RTL8139State;

+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2526,9 @@ static void rtl8139_IntrMask_write(RTL8139State
*s, uint32_t val)

 s-IntrMask = val;

+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }

 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2561,22 @@ static void
rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val)
 rtl8139_update_irq(s);

 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }

 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;

 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2755,43 @@ static void rtl8139_io_writew(void *opaque,
uint8_t addr, uint32_t val)
 }
 }

+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+uint32_t low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2837,16 @@ static void rtl8139_io_writel(void *opaque,
uint8_t addr, uint32_t val)

 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;

 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write
val=0x%08x\n, val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val;
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+}
 break;

 default:
@@ -3000,7 +3056,8 @@ static uint32_t rtl8139_io_readl(void *opaque,
uint8_t addr)
 break;

 case Timer:
-ret = s-TCTR;
+ret = muldiv64(qemu_get_clock(vm_clock) - s-TCTR_base

Re: [Qemu-devel] [PATCH] rtl8139 timer interrupt rewrote

2010-02-08 Thread Frediano Ziglio
2010/2/8 Igor Kovalenko igor.v.kovale...@gmail.com:
 On Sun, Feb 7, 2010 at 6:22 PM, Frediano Ziglio fredd...@gmail.com wrote:
 rewrote timer implementation for rtl8139. Add a QEMU
 timer only when needed (timeout status not set, timeout irq wanted and
 timer set).

 Signed-off-by: Frediano Ziglio fredd...@gmail.com
 --
 diff --git a/hw/rtl8139.c b/hw/rtl8139.c
 index f04dd54..0d877b7 100644
 --- a/hw/rtl8139.c
 +++ b/hw/rtl8139.c
 @@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when
 enabling rx/tx operation
 + *
 + *  2009-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer 
 only
 + *  when strictly needed (required for for
 + *  Darwin)
  */

  #include hw.h
 @@ -61,7 +65,7 @@
  #define RTL8139_CALCULATE_RXCRC 1

  /* Uncomment to enable on-board timer interrupts */
 -//#define RTL8139_ONBOARD_TIMER 1
 +#define RTL8139_ONBOARD_TIMER 1

 Please remove this macro.


Removed (see updated patch)


 +static void rtl8139_pre_save(void *opaque)
 +{
 +RTL8139State* s = opaque;
 +
 +// set IntrStatus correctly
 +int64_t current_time = qemu_get_clock(vm_clock);
 +rtl8139_set_next_tctr_time(s, current_time);
 +s-TCTR = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
 +   get_ticks_per_sec());
 +}
 +

 Seems like TCTR is not used after restoring from saved state. Is it 
 intentional?


Yes, wanted, mainly TCTR is computed from TCTR_base and tick counts so
is currently quite unused but is computed for compatibility with
previous state.

 Can you please check if freebsd works with this change?


Yes, it works correctly. I don't know FreeBSD that much but I
downloaded latest 8.0 live version, got a shell, configured with
network manually with old ifconfig and tried some data exchange
(mainly scp/ssh).

Regards
 Frediano Ziglio




[Qemu-devel] Updated rtl8139 timer interrupt rewrote

2010-02-07 Thread Frediano Ziglio
Hi,
  I rewrote timer implementation for this card. I wrote even a small
Linux guest test program (attached as main.c). This patch add a QEMU
timer only when needed (timeout status not set, timeout irq wanted and
timer set). I tested this patch with a Darwin system and with my test
program (I bought a compatible physical card and it works too).

This patch replace previous (I updated just style).

Regards
  Frediano Ziglio
diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index f04dd54..0d877b7 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -41,6 +41,10 @@
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling rx/tx operation
+ *
+ *  2009-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only
+ *  when strictly needed (required for for
+ *  Darwin)
  */
 
 #include hw.h
@@ -61,7 +65,7 @@
 #define RTL8139_CALCULATE_RXCRC 1
 
 /* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
+#define RTL8139_ONBOARD_TIMER 1
 
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
@@ -489,11 +493,20 @@ typedef struct RTL8139State {
 intcplus_txbuffer_len;
 intcplus_txbuffer_offset;
 
+#ifdef RTL8139_ONBOARD_TIMER
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
+#endif
 
 } RTL8139State;
 
+#ifdef RTL8139_ONBOARD_TIMER
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+#else
+#define rtl8139_set_next_tctr_time(s,t) do { ; } while(0)
+#endif
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
@@ -2522,7 +2535,9 @@ static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val)
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
@@ -2555,12 +2570,22 @@ static void rtl8139_IntrStatus_write(RTL8139State *s, uint32_t val)
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/*
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was
+ * done before testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
@@ -2739,6 +2764,45 @@ static void rtl8139_io_writew(void *opaque, uint8_t addr, uint32_t val)
 }
 }
 
+#ifdef RTL8139_ONBOARD_TIMER
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time)
+{
+DEBUG_PRINT((RTL8139: entered rtl8139_set_next_tctr_time\n));
+
+if (s-TimerExpire  current_time = s-TimerExpire) {
+s-IntrStatus |= PCSTimeout;
+rtl8139_update_irq(s);
+}
+
+/* Set QEMU timer only if needed that is
+ * - TimerInt  0 (we have a timer)
+ * - mask = 1 (we want an interrupt timer)
+ * - irq = 0  (irq is not already active)
+ * If any of above change we need to compute timer again
+ * Also we must check if timer is passed without QEMU timer
+ */
+s-TimerExpire = 0;
+if (!s-TimerInt) {
+return;
+}
+
+int64_t pci_time = muldiv64(current_time - s-TCTR_base, PCI_FREQUENCY,
+get_ticks_per_sec());
+uint32_t low_pci = pci_time  0x;
+pci_time = pci_time - low_pci + s-TimerInt;
+if (low_pci = s-TimerInt) {
+pci_time += 0x1LL;
+}
+int64_t next_time = s-TCTR_base + muldiv64(pci_time, get_ticks_per_sec(),
+PCI_FREQUENCY);
+s-TimerExpire = next_time;
+
+if ((s-IntrMask  PCSTimeout) != 0  (s-IntrStatus  PCSTimeout) == 0) {
+qemu_mod_timer(s-timer, next_time);
+}
+}
+#endif
+
 static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 {
 RTL8139State *s = opaque;
@@ -2784,13 +2848,16 @@ static void rtl8139_io_writel(void *opaque, uint8_t addr, uint32_t val)
 
 case Timer:
 DEBUG_PRINT((RTL8139: TCTR Timer reset on write\n));
-s-TCTR = 0;
 s-TCTR_base = qemu_get_clock(vm_clock);
+rtl8139_set_next_tctr_time(s, s-TCTR_base);
 break;
 
 case FlashReg:
 DEBUG_PRINT((RTL8139: FlashReg TimerInt write val=0x%08x\n, val));
-s-TimerInt = val;
+if (s-TimerInt != val) {
+s-TimerInt = val

[Qemu-devel] rtl8139 timer interrupt rewrote

2010-02-05 Thread Frediano Ziglio
Hi,
  I rewrote timer implementation for this card. I wrote even a small
Linux guest test program (attached as main.c). This patch add a QEMU
timer only when needed (timeout status not set, timeout irq wanted and
timer set). I tested this patch with a Darwin system and with my test
program (I bought a compatible physical card and it works too).

Regards
  Frediano Ziglio

--- hw/rtl8139.c.orig	2010-01-14 23:17:59.0 +0100
+++ hw/rtl8139.c	2010-02-05 20:25:31.711462995 +0100
@@ -34,41 +34,44 @@
  *  Implemented Tally Counters, increased VM load/save version
  *  Implemented IP/TCP/UDP checksum task offloading
  *
  *  2006-Jul-04  Igor Kovalenko :   Implemented TCP segmentation offloading
  *  Fixed MTU=1500 for produced ethernet frames
  *
  *  2006-Jul-09  Igor Kovalenko :   Fixed TCP header length calculation while processing
  *  segmentation offloading
  *  Removed slirp.h dependency
  *  Added rx/tx buffer reset when enabling rx/tx operation
+ *
+ *  2009-Feb-04  Frediano Ziglio:   Rewrote timer support using QEMU timer only when strictly
+ *  needed (required for for Darwin)
  */
 
 #include hw.h
 #include pci.h
 #include qemu-timer.h
 #include net.h
 #include loader.h
 
 /* debug RTL8139 card */
 //#define DEBUG_RTL8139 1
 
 #define PCI_FREQUENCY 3300L
 
 /* debug RTL8139 card C+ mode only */
 //#define DEBUG_RTL8139CP 1
 
 /* Calculate CRCs properly on Rx packets */
 #define RTL8139_CALCULATE_RXCRC 1
 
 /* Uncomment to enable on-board timer interrupts */
-//#define RTL8139_ONBOARD_TIMER 1
+#define RTL8139_ONBOARD_TIMER 1
 
 #if defined(RTL8139_CALCULATE_RXCRC)
 /* For crc32 */
 #include zlib.h
 #endif
 
 #define SET_MASKED(input, mask, curr) \
 ( ( (input)  ~(mask) ) | ( (curr)  (mask) ) )
 
 /* arg % size for size which is a power of 2 */
@@ -482,25 +485,34 @@ typedef struct RTL8139State {
 int64_tTCTR_base;
 
 /* Tally counters */
 RTL8139TallyCounters tally_counters;
 
 /* Non-persistent data */
 uint8_t   *cplus_txbuffer;
 intcplus_txbuffer_len;
 intcplus_txbuffer_offset;
 
+#ifdef RTL8139_ONBOARD_TIMER
 /* PCI interrupt timer */
 QEMUTimer *timer;
+int64_t TimerExpire;
+#endif
 
 } RTL8139State;
 
+#ifdef RTL8139_ONBOARD_TIMER
+static void rtl8139_set_next_tctr_time(RTL8139State *s, int64_t current_time);
+#else
+#define rtl8139_set_next_tctr_time(s,t) do { ; } while(0)
+#endif
+
 static void prom9346_decode_command(EEprom9346 *eeprom, uint8_t command)
 {
 DEBUG_PRINT((RTL8139: eeprom command 0x%02x\n, command));
 
 switch (command  Chip9346_op_mask)
 {
 case Chip9346_op_read:
 {
 eeprom-address = command  EEPROM_9346_ADDR_MASK;
 eeprom-output = eeprom-contents[eeprom-address];
@@ -2510,21 +2522,23 @@ static uint32_t rtl8139_RxBuf_read(RTL81
 
 static void rtl8139_IntrMask_write(RTL8139State *s, uint32_t val)
 {
 DEBUG_PRINT((RTL8139: IntrMask write(w) val=0x%04x\n, val));
 
 /* mask unwriteable bits */
 val = SET_MASKED(val, 0x1e00, s-IntrMask);
 
 s-IntrMask = val;
 
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 }
 
 static uint32_t rtl8139_IntrMask_read(RTL8139State *s)
 {
 uint32_t ret = s-IntrMask;
 
 DEBUG_PRINT((RTL8139: IntrMask read(w) val=0x%04x\n, ret));
 
 return ret;
 }
@@ -2543,26 +2557,36 @@ static void rtl8139_IntrStatus_write(RTL
 uint16_t newStatus = s-IntrStatus  ~val;
 
 /* mask unwriteable bits */
 newStatus = SET_MASKED(newStatus, 0x1e00, s-IntrStatus);
 
 /* writing 1 to interrupt status register bit clears it */
 s-IntrStatus = 0;
 rtl8139_update_irq(s);
 
 s-IntrStatus = newStatus;
+/* 
+ * Computing if we miss an interrupt here is not that correct but
+ * considered that we should have had already an interrupt
+ * and probably emulated is slower is better to assume this resetting was done before
+ * testing on previous rtl8139_update_irq lead to IRQ loosing
+ */
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
 rtl8139_update_irq(s);
+
 #endif
 }
 
 static uint32_t rtl8139_IntrStatus_read(RTL8139State *s)
 {
+rtl8139_set_next_tctr_time(s, qemu_get_clock(vm_clock));
+
 uint32_t ret = s-IntrStatus;
 
 DEBUG_PRINT((RTL8139: IntrStatus read(w) val=0x%04x\n, ret));
 
 #if 0
 
 /* reading ISR clears all interrupts */
 s-IntrStatus = 0;
 
 rtl8139_update_irq(s);
@@ -2727,20 +2751,56 @@ static void rtl8139_io_writew(void *opaq
 
 default:
 DEBUG_PRINT((RTL8139: ioport write(w) addr=0x%x val=0x%04x via write(b)\n, addr, val));
 
 rtl8139_io_writeb(opaque, addr, val  0xff);
 rtl8139_io_writeb(opaque, addr + 1, (val  8)  0xff

<    1   2   3