This patch makes it possible to use snapshots with the host cache disabled
(i.e. -drive cache=off,snapshot=on).

To do that, memory allocated for the snapshot is aligned on a 512-byte
boundary, and reads/writes use an offset and a count aligned to this value.
When this is not possible or too complex (for instance, for some metadata),
the host cache is temporarily re-enabled by clearing O_DIRECT on the file
descriptor for the duration of the access.

All comments are welcome,
Laurent
---
 block-qcow2.c     |   40 +++++++++++++++++++++++-----------------
 block-raw-posix.c |   31 +++++++++++++++++++++++++++++++
 block.c           |   35 ++++++++++++++++++++++++-----------
 qemu-img.c        |   16 ++++++++++++++++
 vl.c              |   11 +++++++++--
 5 files changed, 103 insertions(+), 30 deletions(-)

Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c	2008-01-22 10:12:20.000000000 +0100
+++ qemu/block-raw-posix.c	2008-01-22 11:10:41.000000000 +0100
@@ -141,16 +141,39 @@ static int raw_open(BlockDriverState *bs
 #endif
 */
 
+static long raw_save_directio(int fd)
+{
+    long fd_arg;
+
+    fd_arg = fcntl(fd, F_GETFL);
+    if ((fd_arg & O_DIRECT) == 0)
+        return 0;
+    fcntl(fd, F_SETFL, fd_arg & ~O_DIRECT);
+    return fd_arg;
+}
+
+static void raw_restore_directio(int fd, long fd_arg)
+{
+    if (fd_arg) {
+        fdatasync(fd);
+        fcntl(fd, F_SETFL, fd_arg);
+    }
+}
+
 static int raw_pread(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
+    long fd_arg = 0;
     int ret;
 
     ret = fd_open(bs);
     if (ret < 0)
         return ret;
 
+    if ((count & 0x1FF) || (offset & 0x1FF) || ((long)buf & 0x1FF))
+        fd_arg = raw_save_directio(s->fd);
+
     if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
         ++(s->lseek_err_cnt);
         if(s->lseek_err_cnt <= 10) {
@@ -159,6 +182,7 @@ static int raw_pread(BlockDriverState *b
                               s->fd, bs->filename, offset, buf, count,
                               bs->total_sectors, errno, strerror(errno));
         }
+        raw_restore_directio(s->fd, fd_arg);
         return -1;
     }
     s->lseek_err_cnt=0;
@@ -190,6 +214,7 @@ static int raw_pread(BlockDriverState *b
     }
 
 label__raw_read__success:
+    raw_restore_directio(s->fd, fd_arg);
 
     return ret;
 }
@@ -198,12 +223,16 @@ static int raw_pwrite(BlockDriverState *
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
+    long fd_arg = 0;
     int ret;
 
     ret = fd_open(bs);
     if (ret < 0)
         return ret;
 
+    if ((count & 0x1FF) || (offset & 0x1FF) || ((long)buf & 0x1FF))
+        fd_arg = raw_save_directio(s->fd);
+
     if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
         ++(s->lseek_err_cnt);
         if(s->lseek_err_cnt) {
@@ -212,6 +241,7 @@ static int raw_pwrite(BlockDriverState *
                               s->fd, bs->filename, offset, buf, count,
                               bs->total_sectors, errno, strerror(errno));
         }
+        raw_restore_directio(s->fd, fd_arg);
         return -1;
     }
     s->lseek_err_cnt = 0;
@@ -226,6 +256,7 @@ static int raw_pwrite(BlockDriverState *
                       bs->total_sectors, ret, errno, strerror(errno));
 
 label__raw_write__success:
+    raw_restore_directio(s->fd, fd_arg);
 
     return ret;
 }
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2008-01-22 10:12:20.000000000 +0100
+++ qemu/vl.c	2008-01-22 10:12:30.000000000 +0100
@@ -5593,7 +5593,7 @@ struct QEMUFile {
                            when reading */
     int buf_index;
     int buf_size; /* 0 when writing */
-    uint8_t buf[IO_BUF_SIZE];
+    uint8_t *buf;
 };
 
 QEMUFile *qemu_fopen(const char *filename, const char *mode)
@@ -5629,6 +5629,12 @@ static QEMUFile *qemu_fopen_bdrv(BlockDr
     f = qemu_mallocz(sizeof(QEMUFile));
     if (!f)
         return NULL;
+    f->buf = qemu_memalign(512, IO_BUF_SIZE);
+    if (f->buf == NULL) {
+        qemu_free(f);
+        return NULL;
+    }
+    memset(f->buf, 0, IO_BUF_SIZE);
     f->is_file = 0;
     f->bs = bs;
     f->is_writable = is_writable;
@@ -5682,6 +5688,7 @@ void qemu_fclose(QEMUFile *f)
     if (f->is_file) {
         fclose(f->outfile);
     }
+    qemu_free(f->buf);
     qemu_free(f);
 }
 
@@ -7545,7 +7552,7 @@ static void help(int exitcode)
            "-hdc/-hdd file  use 'file' as IDE hard disk 2/3 image\n"
            "-cdrom file     use 'file' as IDE cdrom image (cdrom is ide1 master)\n"
 	   "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][index=i]\n"
-           "       [,cyls=c,heads=h,secs=s[,trans=t]][snapshot=on|off]"
+           "       [,cyls=c,heads=h,secs=s[,trans=t]][snapshot=on|off]\n"
            "       [,cache=on|off]\n"
 	   "                use 'file' as a drive image\n"
            "-mtdblock file  use 'file' as on-board Flash memory image\n"
Index: qemu/block-qcow2.c
===================================================================
--- qemu.orig/block-qcow2.c	2008-01-22 10:12:20.000000000 +0100
+++ qemu/block-qcow2.c	2008-01-22 10:12:30.000000000 +0100
@@ -26,6 +26,7 @@
 #include <zlib.h>
 #include "aes.h"
 #include <assert.h>
+#include "osdep.h"
 
 /*
   Differences with QCOW:
@@ -246,7 +247,7 @@ static int qcow_open(BlockDriverState *b
     if (s->l1_size < s->l1_vm_state_index)
         goto fail;
     s->l1_table_offset = header.l1_table_offset;
-    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    s->l1_table = qemu_memalign(512, s->l1_size * sizeof(uint64_t));
     if (!s->l1_table)
         goto fail;
     if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
@@ -256,14 +257,14 @@ static int qcow_open(BlockDriverState *b
         be64_to_cpus(&s->l1_table[i]);
     }
     /* alloc L2 cache */
-    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    s->l2_cache = qemu_memalign(512, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
     if (!s->l2_cache)
         goto fail;
     s->cluster_cache = qemu_malloc(s->cluster_size);
     if (!s->cluster_cache)
         goto fail;
     /* one more sector for decompressed data alignment */
-    s->cluster_data = qemu_malloc(s->cluster_size + 512);
+    s->cluster_data = qemu_memalign(512, s->cluster_size + 512);
     if (!s->cluster_data)
         goto fail;
     s->cluster_cache_offset = -1;
@@ -444,9 +445,10 @@ static int grow_l1_table(BlockDriverStat
 #endif
 
     new_l1_size2 = sizeof(uint64_t) * new_l1_size;
-    new_l1_table = qemu_mallocz(new_l1_size2);
+    new_l1_table = qemu_memalign(512, new_l1_size2);
     if (!new_l1_table)
         return -ENOMEM;
+    memset(new_l1_table, 0, new_l1_size2);
     memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
 
     /* write new table (align to cluster) */
@@ -893,7 +895,6 @@ static QCowAIOCB *qcow_aio_setup(BlockDr
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     QCowAIOCB *acb;
-
     acb = qemu_aio_get(bs, cb, opaque);
     if (!acb)
         return NULL;
@@ -962,11 +963,12 @@ static void qcow_aio_write_cb(void *opaq
     }
     if (s->crypt_method) {
         if (!acb->cluster_data) {
-            acb->cluster_data = qemu_mallocz(s->cluster_size);
+            acb->cluster_data = qemu_memalign(512, s->cluster_size);
             if (!acb->cluster_data) {
                 ret = -ENOMEM;
                 goto fail;
             }
+            memset(acb->cluster_data, 0, s->cluster_size);
         }
         encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
                         acb->n, 1, &s->aes_encrypt_key);
@@ -1090,12 +1092,14 @@ static int qcow_create(const char *filen
     header.l1_size = cpu_to_be32(l1_size);
     offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
 
-    s->refcount_table = qemu_mallocz(s->cluster_size);
+    s->refcount_table = qemu_memalign(512, s->cluster_size);
     if (!s->refcount_table)
         goto fail;
-    s->refcount_block = qemu_mallocz(s->cluster_size);
+    memset(s->refcount_table, 0, s->cluster_size);
+    s->refcount_block = qemu_memalign(512, s->cluster_size);
     if (!s->refcount_block)
         goto fail;
+    memset(s->refcount_block, 0, s->cluster_size);
 
     s->refcount_table_offset = offset;
     header.refcount_table_offset = cpu_to_be64(offset);
@@ -1182,7 +1186,8 @@ static int qcow_write_compressed(BlockDr
     if (nb_sectors != s->cluster_sectors)
         return -EINVAL;
 
-    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+    out_buf = qemu_memalign(512,
+                            s->cluster_size + (s->cluster_size / 1000) + 128);
     if (!out_buf)
         return -ENOMEM;
 
@@ -1264,7 +1269,7 @@ static int update_snapshot_refcount(Bloc
     l1_size2 = l1_size * sizeof(uint64_t);
     l1_allocated = 0;
     if (l1_table_offset != s->l1_table_offset) {
-        l1_table = qemu_malloc(l1_size2);
+        l1_table = qemu_memalign(512, l1_size2);
         if (!l1_table)
             goto fail;
         l1_allocated = 1;
@@ -1280,7 +1285,7 @@ static int update_snapshot_refcount(Bloc
     }
 
     l2_size = s->l2_size * sizeof(uint64_t);
-    l2_table = qemu_malloc(l2_size);
+    l2_table = qemu_memalign(512, l2_size);
     if (!l2_table)
         goto fail;
     l1_modified = 0;
@@ -1583,7 +1588,7 @@ static int qcow_snapshot_create(BlockDri
     sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
     sn->l1_size = s->l1_size;
 
-    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    l1_table = qemu_memalign(512, s->l1_size * sizeof(uint64_t));
     if (!l1_table)
         goto fail;
     for(i = 0; i < s->l1_size; i++) {
@@ -1732,11 +1737,11 @@ static int refcount_init(BlockDriverStat
     BDRVQcowState *s = bs->opaque;
     int ret, refcount_table_size2, i;
 
-    s->refcount_block_cache = qemu_malloc(s->cluster_size);
+    s->refcount_block_cache = qemu_memalign(512, s->cluster_size);
     if (!s->refcount_block_cache)
         goto fail;
     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
-    s->refcount_table = qemu_malloc(refcount_table_size2);
+    s->refcount_table = qemu_memalign(512, refcount_table_size2);
     if (!s->refcount_table)
         goto fail;
     if (s->refcount_table_size > 0) {
@@ -1909,9 +1914,10 @@ static int grow_refcount_table(BlockDriv
            new_table_size);
 #endif
     new_table_size2 = new_table_size * sizeof(uint64_t);
-    new_table = qemu_mallocz(new_table_size2);
+    new_table = qemu_memalign(512, new_table_size2);
     if (!new_table)
         return -ENOMEM;
+    memset(new_table, 0, new_table_size2);
     memcpy(new_table, s->refcount_table,
            s->refcount_table_size * sizeof(uint64_t));
     for(i = 0; i < s->refcount_table_size; i++)
@@ -2078,7 +2084,7 @@ static int check_refcounts_l1(BlockDrive
     inc_refcounts(bs, refcount_table, refcount_table_size,
                   l1_table_offset, l1_size2);
 
-    l1_table = qemu_malloc(l1_size2);
+    l1_table = qemu_memalign(512, l1_size2);
     if (!l1_table)
         goto fail;
     if (bdrv_pread(s->hd, l1_table_offset,
@@ -2088,7 +2094,7 @@ static int check_refcounts_l1(BlockDrive
         be64_to_cpus(&l1_table[i]);
 
     l2_size = s->l2_size * sizeof(uint64_t);
-    l2_table = qemu_malloc(l2_size);
+    l2_table = qemu_memalign(512, l2_size);
     if (!l2_table)
         goto fail;
     for(i = 0; i < l1_size; i++) {
Index: qemu/qemu-img.c
===================================================================
--- qemu.orig/qemu-img.c	2008-01-22 10:12:20.000000000 +0100
+++ qemu/qemu-img.c	2008-01-22 10:12:30.000000000 +0100
@@ -55,6 +55,22 @@ void *qemu_mallocz(size_t size)
     return ptr;
 }
 
+void *qemu_memalign(size_t alignment, size_t size)
+{
+#if defined(_POSIX_C_SOURCE)
+    int ret;
+    void *ptr;
+    ret = posix_memalign(&ptr, alignment, size);
+    if (ret != 0)
+        return NULL;
+    return ptr;
+#elif defined(_BSD)
+    return valloc(size);
+#else
+    return memalign(alignment, size);
+#endif
+}
+
 char *qemu_strdup(const char *str)
 {
     char *ptr;
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2008-01-22 10:12:20.000000000 +0100
+++ qemu/block.c	2008-01-22 11:12:18.000000000 +0100
@@ -459,7 +459,10 @@ int bdrv_commit(BlockDriverState *bs)
     BlockDriver *drv = bs->drv;
     int64_t i, total_sectors;
     int n, j;
-    unsigned char sector[512];
+    unsigned char tmp_sector[SECTOR_SIZE + 0x1FF];
+    /* align on 512 bytes boundary for O_DIRECT */
+    unsigned char *sector = (uint8_t*)
+                            (((unsigned long)tmp_sector + 0x1FF) & ~0x1FF);
 
     if (!drv)
         return -ENOMEDIUM;
@@ -569,7 +572,9 @@ int bdrv_write(BlockDriverState *bs, int
 static int bdrv_pread_em(BlockDriverState *bs, int64_t offset,
                          uint8_t *buf, int count1)
 {
-    uint8_t tmp_buf[SECTOR_SIZE];
+    uint8_t align_buf[SECTOR_SIZE + 0x1FF];
+    /* align on 512 bytes boundary for O_DIRECT */
+    uint8_t *tmp_buf = (uint8_t*)(((unsigned long)align_buf + 0x1FF) & ~0x1FF);
     int len, nb_sectors, count;
     int64_t sector_num;
 
@@ -592,11 +597,14 @@ static int bdrv_pread_em(BlockDriverStat
 
     /* read the sectors "in place" */
     nb_sectors = count >> SECTOR_BITS;
-    if (nb_sectors > 0) {
-        if (bdrv_read(bs, sector_num, buf, nb_sectors) < 0)
+    while (nb_sectors > 0) {
+        if (bdrv_read(bs, sector_num, tmp_buf, 1) < 0)
             return -EIO;
-        sector_num += nb_sectors;
-        len = nb_sectors << SECTOR_BITS;
+        /* alignment needed by O_DIRECT */
+	memcpy(buf, tmp_buf, SECTOR_SIZE);
+        nb_sectors--;
+        sector_num += 1;
+        len = 1 << SECTOR_BITS;
         buf += len;
         count -= len;
     }
@@ -613,7 +621,9 @@ static int bdrv_pread_em(BlockDriverStat
 static int bdrv_pwrite_em(BlockDriverState *bs, int64_t offset,
                           const uint8_t *buf, int count1)
 {
-    uint8_t tmp_buf[SECTOR_SIZE];
+    uint8_t align_buf[SECTOR_SIZE + 0x1FF];
+    /* align on 512 bytes boundary for O_DIRECT */
+    uint8_t *tmp_buf = (uint8_t*)(((unsigned long)align_buf + 0x1FF) & ~0x1FF);
     int len, nb_sectors, count;
     int64_t sector_num;
 
@@ -638,11 +648,14 @@ static int bdrv_pwrite_em(BlockDriverSta
 
     /* write the sectors "in place" */
     nb_sectors = count >> SECTOR_BITS;
-    if (nb_sectors > 0) {
-        if (bdrv_write(bs, sector_num, buf, nb_sectors) < 0)
+    while (nb_sectors > 0) {
+        /* alignment needed by O_DIRECT */
+        memcpy(tmp_buf, buf, SECTOR_SIZE);
+        if (bdrv_write(bs, sector_num, tmp_buf, 1) < 0)
             return -EIO;
-        sector_num += nb_sectors;
-        len = nb_sectors << SECTOR_BITS;
+        nb_sectors--;
+        sector_num += 1;
+        len = 1 << SECTOR_BITS;
         buf += len;
         count -= len;
     }

Reply via email to