block: gluster as block backend From: Bharata B Rao <bhar...@linux.vnet.ibm.com>
This patch adds gluster as the new block backend in QEMU. This gives QEMU
the ability to boot VM images from gluster volumes.

Signed-off-by: Bharata B Rao <bhar...@linux.vnet.ibm.com>
---

 Makefile.objs   |    2 
 block/gluster.c |  520 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 521 insertions(+), 1 deletions(-)
 create mode 100644 block/gluster.c

diff --git a/Makefile.objs b/Makefile.objs
index 25190ba..859b88a 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -59,7 +59,7 @@ block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_LIBISCSI) += iscsi.o
 block-nested-$(CONFIG_CURL) += curl.o
 block-nested-$(CONFIG_RBD) += rbd.o
-block-nested-$(CONFIG_GLUSTERFS) += gluster-helpers.o
+block-nested-$(CONFIG_GLUSTERFS) += gluster-helpers.o gluster.o
 
 block-obj-y += $(addprefix block/, $(block-nested-y))
 
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 0000000..1566cb7
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,520 @@
+/*
+ * GlusterFS backend for QEMU
+ *
+ * (AIO implementation is derived from block/rbd.c)
+ *
+ * Copyright (C) 2012 Bharata B Rao <bhar...@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+#include "block_int.h"
+#include "gluster-helpers.h"
+
+typedef void *gluster_file_t;
+
+/* Volume file and image path parsed from "protocol:volfile:image" */
+typedef struct glusterConf {
+    char volfile[PATH_MAX];
+    char image[PATH_MAX];
+} glusterConf;
+
+typedef struct BDRVGlusterState {
+    int fds[2];                    /* completion pipe, see GLUSTER_FD_* */
+    int open_flags;
+    gluster_file_t fd;
+    glusterfs_ctx_t *ctx;
+    int qemu_aio_count;            /* requests not yet completed */
+    int event_reader_pos;          /* bytes of event_gaiocb read so far */
+    gluster_aiocb_t *event_gaiocb; /* pointer being read from the pipe */
+} BDRVGlusterState;
+
+typedef struct glusterAIOCB {
+    BlockDriverAIOCB common;
+    QEMUBH *bh;
+    QEMUIOVector *qiov;
+    int ret;
+    int write;
+    char *bounce;
+    BDRVGlusterState *s;
+    int cancelled;
+    int error;
+} glusterAIOCB;
+
+#define GLUSTER_FD_READ 0
+#define GLUSTER_FD_WRITE 1
+
+/*
+ * Copy one ':'-separated filename component into a PATH_MAX sized buffer.
+ *
+ * Returns 0 on success, -EINVAL if the component is missing, or
+ * -ENAMETOOLONG if it does not fit together with its terminating NUL.
+ */
+static int qemu_gluster_copy_token(char *dst, const char *token)
+{
+    size_t len;
+
+    if (!token) {
+        return -EINVAL;
+    }
+    len = strlen(token);
+    if (len >= PATH_MAX) {
+        return -ENAMETOOLONG;
+    }
+    memcpy(dst, token, len + 1);
+    return 0;
+}
+
+/*
+ * Split a filename of the form
+ *
+ *   file=protocol:volfile:image
+ *
+ * into c->volfile and c->image; the protocol prefix is discarded.
+ *
+ * Returns 0 on success, or a negative errno value if a component is
+ * missing or too long.
+ */
+static int qemu_gluster_parsename(glusterConf *c, const char *filename)
+{
+    char *file = g_strdup(filename);
+    char *saveptr;
+    int ret;
+
+    /* Discard the protocol */
+    if (!strtok_r(file, ":", &saveptr)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    /* volfile */
+    ret = qemu_gluster_copy_token(c->volfile, strtok_r(NULL, ":", &saveptr));
+    if (ret < 0) {
+        goto out;
+    }
+
+    /* image */
+    ret = qemu_gluster_copy_token(c->image, strtok_r(NULL, ":", &saveptr));
+out:
+    g_free(file);
+    return ret;
+}
+
+/*
+ * Bottom half that reports a finished request to its completion callback
+ * and releases the request.
+ */
+static void gluster_aio_bh_cb(void *opaque)
+{
+    glusterAIOCB *acb = opaque;
+
+    if (!acb->write) {
+        qemu_iovec_from_buffer(acb->qiov, acb->bounce, acb->qiov->size);
+    }
+    qemu_vfree(acb->bounce);
+    /* Positive byte counts are reported to the callback as 0 */
+    acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+
+    qemu_aio_release(acb);
+}
+
+/*
+ * Convert the result of a finished gluster request into acb->ret and
+ * schedule the bottom half that completes it.  Frees gaiocb.
+ */
+static void qemu_gluster_complete_aio(gluster_aiocb_t *gaiocb)
+{
+    glusterAIOCB *acb = (glusterAIOCB *)gaiocb->opaque;
+    int64_t r;
+
+    if (acb->cancelled) {
+        qemu_vfree(acb->bounce);
+        qemu_aio_release(acb);
+        goto done;
+    }
+
+    r = gaiocb->ret;
+
+    if (acb->write) {
+        if (r < 0) {
+            acb->ret = r;
+            acb->error = 1;
+        } else if (!acb->error) {
+            acb->ret = gaiocb->size;
+        }
+    } else {
+        if (r < 0) {
+            memset(gaiocb->buf, 0, gaiocb->size);
+            acb->ret = r;
+            acb->error = 1;
+        } else if (r < gaiocb->size) {
+            /* Short read: zero the tail and report the full size */
+            memset(gaiocb->buf + r, 0, gaiocb->size - r);
+            if (!acb->error) {
+                acb->ret = gaiocb->size;
+            }
+        } else if (!acb->error) {
+            acb->ret = r;
+        }
+    }
+    acb->bh = qemu_bh_new(gluster_aio_bh_cb, acb);
+    qemu_bh_schedule(acb->bh);
+done:
+    g_free(gaiocb);
+}
+
+/*
+ * Read handler for the completion pipe: reassembles one gluster_aiocb_t
+ * pointer from the pipe and completes the request it refers to.
+ */
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&s->event_gaiocb;
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
+                   sizeof(s->event_gaiocb) - s->event_reader_pos);
+        if (ret > 0) {
+            s->event_reader_pos += ret;
+            if (s->event_reader_pos == sizeof(s->event_gaiocb)) {
+                s->event_reader_pos = 0;
+                qemu_gluster_complete_aio(s->event_gaiocb);
+                s->qemu_aio_count--;
+            }
+        }
+    } while (ret < 0 && errno == EINTR);
+}
+
+/* Flush handler: non-zero while requests are still outstanding */
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+
+    return (s->qemu_aio_count > 0);
+}
+
+/*
+ * Open the image named by filename ("protocol:volfile:image") and set up
+ * the pipe used to deliver request completions.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{
+    BDRVGlusterState *s = bs->opaque;
+    glusterConf *c = g_malloc(sizeof(glusterConf));
+    int ret;
+
+    s->fd = NULL;
+    ret = qemu_gluster_parsename(c, filename);
+    if (ret < 0) {
+        goto out;
+    }
+
+    s->ctx = gluster_init(c->volfile);
+    if (!s->ctx) {
+        ret = -EIO;
+        goto out;
+    }
+
+    /* FIX: Server client handshake takes time */
+    sleep(1);
+
+    s->open_flags |= O_BINARY;
+    s->open_flags &= ~O_ACCMODE;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        s->open_flags |= O_RDWR;
+    } else {
+        s->open_flags |= O_RDONLY;
+    }
+
+    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+     * and O_DIRECT for no caching. */
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        s->open_flags |= O_DIRECT;
+    }
+    if (!(bdrv_flags & BDRV_O_CACHE_WB)) {
+        s->open_flags |= O_DSYNC;
+    }
+
+    s->fd = gluster_open(c->image, s->open_flags, 0);
+    if (!s->fd) {
+        ret = -EIO;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    }
+    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
+    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+out:
+    g_free(c);
+    /* Close the image only if it was actually opened */
+    if (ret < 0 && s->fd) {
+        gluster_close(s->fd);
+        s->fd = NULL;
+    }
+    return ret;
+}
+
+/*
+ * Create the image named by filename and size it to BLOCK_OPT_SIZE,
+ * rounded down to a multiple of BDRV_SECTOR_SIZE.
+ *
+ * Returns 0 on success or a negative errno value on failure.
+ */
+static int qemu_gluster_create(const char *filename,
+    QEMUOptionParameter *options)
+{
+    glusterConf *c = g_malloc(sizeof(glusterConf));
+    int ret;
+    gluster_file_t fd;
+    int64_t total_size = 0;
+
+    ret = qemu_gluster_parsename(c, filename);
+    if (ret) {
+        goto out;
+    }
+
+    if (!gluster_init(c->volfile)) {
+        ret = -EIO;
+        goto out;
+    }
+
+    /* FIX: Server client handshake takes time */
+    sleep(1);
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+        options++;
+    }
+
+    fd = gluster_creat(c->image, 0644);
+    if (!fd) {
+        ret = -errno;
+        goto out;
+    }
+    if (gluster_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+        ret = -errno;
+    }
+    /* Always close, but do not let a close error mask a truncate error */
+    if (gluster_close(fd) != 0 && ret == 0) {
+        ret = -errno;
+    }
+out:
+    g_free(c);
+    return ret;
+}
+
+static AIOPool gluster_aio_pool = {
+    .aiocb_size = sizeof(glusterAIOCB),
+};
+
+/*
+ * Write the gaiocb pointer to the completion pipe, waiting with select()
+ * whenever the non-blocking pipe cannot take it yet (EAGAIN).
+ *
+ * Returns a non-negative value on success and a negative value on failure.
+ */
+static int qemu_gluster_send_pipe(BDRVGlusterState *s, gluster_aiocb_t *gaiocb)
+{
+    int ret = 0;
+    while (1) {
+        fd_set wfd;
+        int fd = s->fds[GLUSTER_FD_WRITE];
+
+        ret = write(fd, (void *)&gaiocb, sizeof(gaiocb));
+        if (ret >= 0) {
+            break;
+        }
+        if (errno == EINTR) {
+            continue;
+        }
+        if (errno != EAGAIN) {
+            break;
+        }
+
+        FD_ZERO(&wfd);
+        FD_SET(fd, &wfd);
+        do {
+            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
+        } while (ret < 0 && errno == EINTR);
+    }
+    return ret;
+}
+
+/*
+ * Completion function installed on every request: forwards the finished
+ * gaiocb to the event reader through the completion pipe.
+ */
+static void gluster_finish_aiocb(void *arg)
+{
+    int ret;
+    gluster_aiocb_t *gaiocb = (gluster_aiocb_t *)arg;
+    BDRVGlusterState *s = ((glusterAIOCB *)gaiocb->opaque)->s;
+
+    ret = qemu_gluster_send_pipe(s, gaiocb);
+    if (ret < 0) {
+        /*
+         * NOTE(review): the request is dropped here without completing its
+         * glusterAIOCB or decrementing qemu_aio_count.
+         */
+        g_free(gaiocb);
+    }
+}
+
+/*
+ * Submit a read (write == 0) or write (write != 0) of nb_sectors sectors
+ * starting at sector_num.  Data is staged in a bounce buffer of qiov->size
+ * bytes.
+ *
+ * Returns the new AIOCB, or NULL if the request could not be submitted.
+ */
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+    BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+    int ret;
+    glusterAIOCB *acb;
+    gluster_aiocb_t *gaiocb;
+    BDRVGlusterState *s = bs->opaque;
+    char *buf;
+    size_t size;
+    off_t offset;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->write = write;
+    acb->qiov = qiov;
+    acb->bounce = qemu_blockalign(bs, qiov->size);
+    acb->ret = 0;
+    acb->bh = NULL;
+    acb->s = s;
+    /* Both flags are tested on completion, so they must start out clear */
+    acb->cancelled = 0;
+    acb->error = 0;
+
+    if (write) {
+        qemu_iovec_to_buffer(acb->qiov, acb->bounce);
+    }
+
+    buf = acb->bounce;
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    s->qemu_aio_count++;
+
+    gaiocb = g_malloc(sizeof(gluster_aiocb_t));
+    gaiocb->opaque = acb;
+    gaiocb->buf = buf;
+    gaiocb->offset = offset;
+    gaiocb->size = size;
+    gaiocb->completion_fn = &gluster_finish_aiocb;
+
+    if (write) {
+        ret = gluster_aio_writev(s->fd, gaiocb);
+    } else {
+        ret = gluster_aio_readv(s->fd, gaiocb);
+    }
+
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    g_free(gaiocb);
+    s->qemu_aio_count--;
+    /* The bounce buffer is owned by acb and must go with it */
+    qemu_vfree(acb->bounce);
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+/* bdrv_aio_readv callback: submit an asynchronous read */
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+    BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+/* bdrv_aio_writev callback: submit an asynchronous write */
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+    BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+/* Return the image size in bytes, or -EIO if gluster_fstat() fails */
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+
+    if (gluster_fstat(s->fd, &st) < 0) {
+        return -EIO;
+    }
+    return st.st_size;
+}
+
+/* Unregister the completion handler, then release the image and the pipe */
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL);
+
+    if (s->fd) {
+        gluster_close(s->fd);
+        s->fd = NULL;
+    }
+
+    close(s->fds[GLUSTER_FD_READ]);
+    close(s->fds[GLUSTER_FD_WRITE]);
+}
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_gluster = {
+    .format_name = "gluster",
+    .protocol_name = "gluster",
+    .instance_size = sizeof(BDRVGlusterState),
+    .bdrv_file_open = qemu_gluster_open,
+    .bdrv_close = qemu_gluster_close,
+    .bdrv_create = qemu_gluster_create,
+    .bdrv_getlength = qemu_gluster_getlength,
+
+    .bdrv_aio_readv = qemu_gluster_aio_readv,
+    .bdrv_aio_writev = qemu_gluster_aio_writev,
+
+    .create_options = qemu_gluster_create_options,
+};
+
+static void bdrv_gluster_init(void)
+{
+    bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);