On 09/08/2012 15:02, Bharata B Rao wrote: > block: Support GlusterFS as a QEMU block backend. > > From: Bharata B Rao <bhar...@linux.vnet.ibm.com> > > This patch adds gluster as the new block backend in QEMU. This gives > QEMU the ability to boot VM images from gluster volumes. It's already > possible to boot from VM images on gluster volumes using FUSE mount, but > this patchset provides the ability to boot VM images from gluster volumes > by bypassing the FUSE layer in gluster. This is made possible by > using libgfapi routines to perform IO on gluster volumes directly. > > A VM image on a gluster volume is specified like this: > > file=gluster://server[:port]/volname/image[?transport=socket] > > 'gluster' is the protocol. > > 'server' specifies the server where the volume file specification for > the given volume resides. This can be either a hostname, an ipv4 address > or an ipv6 address. An ipv6 address needs to be within square brackets [ ]. > > 'port' is the port number on which gluster management daemon (glusterd) is > listening. This is optional and if not specified, QEMU will send 0 which > will make libgfapi use the default port. > > 'volname' is the name of the gluster volume which contains the VM image. > > 'image' is the path to the actual VM image in the gluster volume. > > 'transport' specifies the transport used to connect to glusterd. This is > optional and if not specified, socket transport is used. > > Examples: > > file=gluster://1.2.3.4/testvol/a.img > file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket > file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img > file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket > file=gluster://server.domain.com:5000/testvol/dir/a.img > > Signed-off-by: Bharata B Rao <bhar...@linux.vnet.ibm.com> > Reviewed-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> > --- > > block/Makefile.objs | 1 > block/gluster.c | 623 +++++++++++++++++++++++++++++++++++++++ > 2 files changed, 624 insertions(+), 0 deletions(-) > create mode 100644 block/gluster.c
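The interface looks sensible. For readers who have not used libgfapi before: the URI above essentially packages the arguments of the usual libgfapi bring-up sequence, roughly like this (a sketch only, error handling omitted, values taken from the first example above):

    struct glfs *fs = glfs_new("testvol");                /* volname */
    glfs_set_volfile_server(fs, "socket", "1.2.3.4", 0);  /* transport, server, port;
                                                             0 = let libgfapi pick the
                                                             default glusterd port */
    glfs_init(fs);
    struct glfs_fd *fd = glfs_open(fs, "a.img", O_RDWR);  /* image */
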
> > diff --git a/block/Makefile.objs b/block/Makefile.objs > index b5754d3..a1ae67f 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o > block-obj-$(CONFIG_LIBISCSI) += iscsi.o > block-obj-$(CONFIG_CURL) += curl.o > block-obj-$(CONFIG_RBD) += rbd.o > +block-obj-$(CONFIG_GLUSTERFS) += gluster.o > diff --git a/block/gluster.c b/block/gluster.c > new file mode 100644 > index 0000000..bbbaea8 > --- /dev/null > +++ b/block/gluster.c > @@ -0,0 +1,623 @@ > +/* > + * GlusterFS backend for QEMU > + * > + * (AIO implementation is derived from block/rbd.c) > + * > + * Copyright (C) 2012 Bharata B Rao <bhar...@linux.vnet.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or > + * (at your option) any later version. See the COPYING file in the top-level > + * directory. > + */ > +#include <glusterfs/api/glfs.h> > +#include "block_int.h" > + > +typedef struct GlusterAIOCB { > + BlockDriverAIOCB common; > + bool canceled; > + int64_t size; > + int ret; > +} GlusterAIOCB; > + > +typedef struct BDRVGlusterState { > + struct glfs *glfs; > + int fds[2]; > + struct glfs_fd *fd; > + int qemu_aio_count; > +} BDRVGlusterState; > + > +#define GLUSTER_FD_READ 0 > +#define GLUSTER_FD_WRITE 1 > + > +typedef struct GlusterURI { > + char *server; > + int port; > + char *volname; > + char *image; > + char *transport; > +} GlusterURI; > + > +static void qemu_gluster_uri_free(GlusterURI *uri) > +{ > + g_free(uri->server); > + g_free(uri->volname); > + g_free(uri->image); > + g_free(uri->transport); > + g_free(uri); > +} > + > +/* > + * We don't validate the transport option obtained here but > + * instead depend on gluster to flag an error. > + */ > +static int parse_transport(GlusterURI *uri, char *transport) > +{ > + char *token, *saveptr; > + int ret = -EINVAL; > + > + if (!transport) { > + uri->transport = g_strdup("socket"); > + ret = 0; > + goto out; > + } > + > + token = strtok_r(transport, "=", &saveptr); > + if (!token) { > + goto out; > + } > + if (strcmp(token, "transport")) { > + goto out; > + } > + token = strtok_r(NULL, "=", &saveptr); > + if (!token) { > + goto out; > + } > + uri->transport = g_strdup(token); > + ret = 0; > +out: > + return ret; > +} > + > +static int parse_server(GlusterURI *uri, char *server) > +{ > + int ret = -EINVAL; > + char *token, *saveptr; > + char *p, *q = server; > + > + p = strchr(server, '['); > + if (p) { > + /* [ipv6] */ > + if (p != server) { > + /* [ not in the beginning */ > + goto out; > + } > + q++; > + p = strrchr(p, ']'); > + if (!p) { > + /* No matching ] */ > + goto out; > + } > + *p++ = '\0'; > + uri->server = g_strdup(q); > + > + if (*p) { > + if (*p != ':') { > + /* [ipv6] followed by something other than : */ > + goto out; > + } > + uri->port = strtoul(++p, NULL, 0); > + if (uri->port < 0) { > + goto out; > + } > + } else { > + /* port not specified, use default */ > + uri->port = 0; > + } > + > + } else { > + /* ipv4 or hostname */ > + if (*server == ':') { > + /* port specified w/o a server */ > + goto out; > + } > + token = strtok_r(server, ":", &saveptr); > + if (!token) { > + goto out; > + } > + uri->server = g_strdup(token); > + token = strtok_r(NULL, ":", &saveptr); > + if (token) { > + uri->port = strtoul(token, NULL, 0); > + if (uri->port < 0) { > + goto out; > + } > + } else { > + uri->port = 0; > + } > + } > + ret = 0; > +out: > + return ret; > +}
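One nit in parse_server: strtoul() returns an unsigned long, so the "uri->port < 0" tests can only fire if the conversion happens to overflow int into a negative value, and base 0 silently accepts hex and octal strings. If you want stricter validation, something along these lines would do (just a sketch, for the spot where p points at the port substring):

    char *end;
    unsigned long port = strtoul(p, &end, 10);      /* decimal only */
    if (end == p || *end != '\0' || port > 65535) {
        goto out;   /* empty string, trailing junk, or out-of-range port */
    }
    uri->port = port;
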
> + > +/* > + * file=gluster://server[:port]/volname/image[?transport=socket] > + * > + * 'gluster' is the protocol. > + * > + * 'server' specifies the server where the volume file specification for > + * the given volume resides. This can be either a hostname, an ipv4 address > + * or an ipv6 address. An ipv6 address needs to be within square brackets [ ]. > + * > + * 'port' is the port number on which gluster management daemon (glusterd) is > + * listening. This is optional and if not specified, QEMU will send 0 which > + * will make libgfapi use the default port. > + * > + * 'volname' is the name of the gluster volume which contains the VM image. > + * > + * 'image' is the path to the actual VM image in the gluster volume. > + * > + * 'transport' specifies the transport used to connect to glusterd. This is > + * optional and if not specified, socket transport is used. > + * > + * Examples: > + * > + * file=gluster://1.2.3.4/testvol/a.img > + * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket > + * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img > + * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket > + * file=gluster://server.domain.com:5000/testvol/dir/a.img > + * > + * We just do minimal checking of the gluster options and mostly ensure > + * that all the expected elements of the URI are present. We depend on libgfapi > + * APIs to return appropriate errors in case of invalid arguments. > + */ > +static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename) > +{ > + char *token, *saveptr; > + char *p, *r; > + int ret = -EINVAL; > + > + p = r = g_strdup(filename); > + if (strncmp(p, "gluster://", 10)) { > + goto out; > + } > + > + /* Discard the protocol */ > + p += 10; > + > + /* server */ > + token = strtok_r(p, "/", &saveptr); > + if (!token) { > + goto out; > + } > + > + ret = parse_server(uri, token); > + if (ret < 0) { > + goto out; > + } > + > + /* volname */ > + token = strtok_r(NULL, "/", &saveptr); > + if (!token) { > + ret = -EINVAL; > + goto out; > + } > + uri->volname = g_strdup(token); > + > + /* image */ > + token = strtok_r(NULL, "?", &saveptr); > + if (!token) { > + ret = -EINVAL; > + goto out; > + } > + uri->image = g_strdup(token); > + > + /* transport */ > + token = strtok_r(NULL, "?", &saveptr); > + ret = parse_transport(uri, token); > + if (ret < 0) { > + goto out; > + } > + > + /* Flag error for extra options */ > + token = strtok_r(NULL, "?", &saveptr); > + if (token) { > + ret = -EINVAL; > + goto out; > + } > + ret = 0; > +out: > + g_free(r); > + return ret; > +}
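To make the tokenizing concrete, this is what I expect the parser to produce for the IPv6 example above (a sketch of a unit check against this file, assuming <assert.h> and <string.h>; not code from the patch):

    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
    int ret = qemu_gluster_parseuri(uri,
        "gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket");

    assert(ret == 0);
    assert(strcmp(uri->server, "1:2:3:4:5:6:7:8") == 0);
    assert(uri->port == 5000);
    assert(strcmp(uri->volname, "testvol") == 0);
    assert(strcmp(uri->image, "dir/a.img") == 0);
    assert(strcmp(uri->transport, "socket") == 0);
    qemu_gluster_uri_free(uri);
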
> + > +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename) > +{ > + struct glfs *glfs = NULL; > + int ret; > + > + ret = qemu_gluster_parseuri(uri, filename); > + if (ret < 0) { > + error_report("Usage: file=gluster://server[:port]/volname/image" > + "[?transport=socket]"); > + errno = -ret; > + goto out; > + } > + > + glfs = glfs_new(uri->volname); > + if (!glfs) { > + goto out; > + } > + > + ret = glfs_set_volfile_server(glfs, uri->transport, uri->server, > + uri->port); > + if (ret < 0) { > + goto out; > + } > + > + /* > + * TODO: Use GF_LOG_ERROR instead of the hard-coded value of 4 here when > + * GlusterFS exports it in a header. > + */ > + ret = glfs_set_logging(glfs, "-", 4); > + if (ret < 0) { > + goto out; > + } > + > + ret = glfs_init(glfs); > + if (ret) { > + error_report("Gluster connection failed for server=%s port=%d " > + "volume=%s image=%s transport=%s\n", uri->server, uri->port, > + uri->volname, uri->image, uri->transport); > + goto out; > + } > + return glfs; > + > +out: > + if (glfs) { > + glfs_fini(glfs); > + } > + return NULL; > +} > + > +static void qemu_gluster_complete_aio(GlusterAIOCB *acb) > +{ > + int ret; > + > + if (acb->canceled) { > + qemu_aio_release(acb); > + return; > + } > + > + if (acb->ret == acb->size) { > + ret = 0; /* Success */ > + } else if (acb->ret < 0) { > + ret = acb->ret; /* Read/Write failed */ > + } else { > + ret = -EIO; /* Partial read/write - fail it */ > + } > + acb->common.cb(acb->common.opaque, ret); > + qemu_aio_release(acb); > +} > + > +static void qemu_gluster_aio_event_reader(void *opaque) > +{ > + BDRVGlusterState *s = opaque; > + GlusterAIOCB *event_acb; > + int event_reader_pos = 0; > + ssize_t ret; > + > + do { > + char *p = (char *)&event_acb; > + > + ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos, > + sizeof(event_acb) - event_reader_pos); > + if (ret > 0) { > + event_reader_pos += ret; > + if (event_reader_pos == sizeof(event_acb)) { > + event_reader_pos = 0; > + qemu_gluster_complete_aio(event_acb); > + s->qemu_aio_count--; > + } > + } > + } while (ret < 0 && errno == EINTR); > +} > + > +static int qemu_gluster_aio_flush_cb(void *opaque) > +{ > + BDRVGlusterState *s = opaque; > + > + return (s->qemu_aio_count > 0); > +} > + > +static int qemu_gluster_open(BlockDriverState *bs, const char *filename, > + int bdrv_flags) > +{ > + BDRVGlusterState *s = bs->opaque; > + int open_flags = 0; > + int ret = 0; > + GlusterURI *uri = g_malloc0(sizeof(GlusterURI)); > + > + s->glfs = qemu_gluster_init(uri, filename); > + if (!s->glfs) { > + ret = -errno; > + goto out; > + } > + > + open_flags |= O_BINARY; > + open_flags &= ~O_ACCMODE; > + if (bdrv_flags & BDRV_O_RDWR) { > + open_flags |= O_RDWR; > + } else { > + open_flags |= O_RDONLY; > + } > + > + if ((bdrv_flags & BDRV_O_NOCACHE)) { > + open_flags |= O_DIRECT; > + } > + > + s->fd = glfs_open(s->glfs, uri->image, open_flags); > + if (!s->fd) { > + ret = -errno; > + goto out; > + } > + > + ret = qemu_pipe(s->fds); > + if (ret < 0) { > + goto out; > + } > + fcntl(s->fds[0], F_SETFL, O_NONBLOCK); > + fcntl(s->fds[1], F_SETFL, O_NONBLOCK);
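For readers following the control flow: these two descriptors form the completion channel. gluster's callback thread (see gluster_finish_aiocb further down) writes the GlusterAIOCB pointer into fds[GLUSTER_FD_WRITE], and the handler registered just below reads it back in the iothread and completes the request. The same pattern in miniature, outside QEMU (all names here are mine, not the patch's):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Completion object handed from the worker thread to the main loop. */
    typedef struct Completion {
        int ret;
    } Completion;

    static int fds[2];   /* fds[0]: read end (main loop), fds[1]: write end */

    /* Worker thread: finish the "I/O" and notify the main loop by writing
     * the pointer itself through the pipe, as gluster_finish_aiocb() does. */
    static void *worker(void *arg)
    {
        Completion *c = arg;
        c->ret = 0;                          /* pretend the I/O succeeded */
        ssize_t n = write(fds[1], &c, sizeof(c));
        if (n != sizeof(c)) {
            abort();                         /* see the FIXME in the patch */
        }
        return NULL;
    }

    int main(void)
    {
        Completion c;
        pthread_t t;

        if (pipe(fds) < 0) {
            return 1;
        }
        pthread_create(&t, NULL, worker, &c);

        /* Main-loop side: read one pointer back and complete the request.
         * (QEMU does this from an fd handler instead of blocking here.) */
        Completion *done;
        if (read(fds[0], &done, sizeof(done)) == sizeof(done)) {
            printf("completed with ret=%d\n", done->ret);
        }
        pthread_join(t, NULL);
        return 0;
    }
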
A small thing I noticed while reviewing: since the write end of the pipe is used from the gluster thread, you do not need to make this nonblocking. Also, please use GLUSTER_FD_{READ,WRITE} instead. > + qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], > + qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s); > + > +out: > + qemu_gluster_uri_free(uri); > + if (!ret) { > + return ret; > + } > + if (s->fd) { > + glfs_close(s->fd); > + } > + if (s->glfs) { > + glfs_fini(s->glfs); > + } > + return ret; > +} > + > +static int qemu_gluster_create(const char *filename, > + QEMUOptionParameter *options) > +{ > + struct glfs *glfs; > + struct glfs_fd *fd; > + int ret = 0; > + int64_t total_size = 0; > + GlusterURI *uri = g_malloc0(sizeof(GlusterURI)); > + > + glfs = qemu_gluster_init(uri, filename); > + if (!glfs) { > + ret = -errno; > + goto out; > + } > + > + while (options && options->name) { > + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { > + total_size = options->value.n / BDRV_SECTOR_SIZE; > + } > + options++; > + } > + > + fd = glfs_creat(glfs, uri->image, > + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); > + if (!fd) { > + ret = -errno; > + } else { > + if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { > + ret = -errno; > + } > + if (glfs_close(fd) != 0) { > + ret = -errno; > + } > + } > +out: > + qemu_gluster_uri_free(uri); > + if (glfs) { > + glfs_fini(glfs); > + } > + return ret; > +} > + > +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) > +{ > + GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; > + > + acb->common.cb(acb->common.opaque, -ECANCELED); > + acb->canceled = true; I think this is wrong, because the write could still complete later and undo the effects of a second write that is done by the guest. That is:

    gluster                  QEMU                    guest
    ----------------------------------------------------------
                                    <--- write #1
                <--- write #1
                                    <--- cancel write #1
                write #1 canceled --->
                                    <--- write #2
                <--- write #2
    write #2 completed --->
                write #2 completed -->
    write #1 completed --->

Now, the persistent storage recorded the effect of write #1, but the guest thinks that it recorded the effect of write #2 instead. You can simply do qemu_aio_flush() here.
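Concretely, something like this, keeping the rest of the function as it is (a sketch; qemu_aio_flush() drains all in-flight requests, after which qemu_gluster_complete_aio() has already seen and released the canceled request):

    static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
    {
        GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;

        acb->common.cb(acb->common.opaque, -ECANCELED);
        acb->canceled = true;
        /* Drain in-flight requests, so the canceled write can no longer
         * reach the disk after a later write that the guest saw complete. */
        qemu_aio_flush();
    }
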
> +} > + > +static AIOPool gluster_aio_pool = { > + .aiocb_size = sizeof(GlusterAIOCB), > + .cancel = qemu_gluster_aio_cancel, > +}; > + > +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb) > +{ > + int ret = 0; > + while (1) { > + fd_set wfd; > + int fd = s->fds[GLUSTER_FD_WRITE]; > + > + ret = write(fd, (void *)&acb, sizeof(acb)); > + if (ret >= 0) { > + break; > + } > + if (errno == EINTR) { > + continue; > + } > + if (errno != EAGAIN) { > + break; > + } > + > + FD_ZERO(&wfd); > + FD_SET(fd, &wfd); > + do { > + ret = select(fd + 1, NULL, &wfd, NULL, NULL); > + } while (ret < 0 && errno == EINTR); If you make the fd blocking, as suggested above, you can avoid the select here. > + } > + return ret; > +} > + > +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) > +{ > + GlusterAIOCB *acb = (GlusterAIOCB *)arg; > + BDRVGlusterState *s = acb->common.bs->opaque; > + > + acb->ret = ret; > + if (qemu_gluster_send_pipe(s, acb) < 0) { > + /* > + * Gluster AIO callback thread failed to notify the waiting > + * QEMU thread about IO completion. Nothing much can be done > + * here but to abruptly abort. > + * > + * FIXME: Check if the read side of the fd handler can somehow > + * be notified of this failure paving the way for a graceful exit. > + */ > + error_report("Gluster failed to notify QEMU about IO completion"); > + abort(); We can fix it later with a list of AIOCBs that are ready to process (and a QemuMutex to protect the list). An EventNotifier can trigger the read handler to examine the list.
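Roughly what I have in mind, to record it here (a sketch only; the node field and the other names are made up, not in this patch):

    /* Completions go on a locked list; an EventNotifier kicks the iothread. */
    typedef struct BDRVGlusterState {
        struct glfs *glfs;
        struct glfs_fd *fd;
        QemuMutex aio_lock;
        QLIST_HEAD(, GlusterAIOCB) ready;   /* needs a QLIST_ENTRY(GlusterAIOCB)
                                               node field in GlusterAIOCB */
        EventNotifier notifier;
        int qemu_aio_count;
    } BDRVGlusterState;

    /* gluster thread: queue the completion and kick the iothread; unlike
     * the pipe write, this has no pipe-full failure mode to abort() on. */
    static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
    {
        GlusterAIOCB *acb = arg;
        BDRVGlusterState *s = acb->common.bs->opaque;

        acb->ret = ret;
        qemu_mutex_lock(&s->aio_lock);
        QLIST_INSERT_HEAD(&s->ready, acb, node);
        qemu_mutex_unlock(&s->aio_lock);
        event_notifier_set(&s->notifier);
    }

The read handler would then event_notifier_test_and_clear() and walk the list under the lock.
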
> + } > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, > + BlockDriverCompletionFunc *cb, void *opaque, int write) > +{ > + int ret; > + GlusterAIOCB *acb; > + BDRVGlusterState *s = bs->opaque; > + size_t size; > + off_t offset; > + > + offset = sector_num * BDRV_SECTOR_SIZE; > + size = nb_sectors * BDRV_SECTOR_SIZE; > + s->qemu_aio_count++; > + > + acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque); > + acb->size = size; > + acb->ret = 0; > + acb->canceled = false; > + > + if (write) { > + ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, > + &gluster_finish_aiocb, acb); > + } else { > + ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, > + &gluster_finish_aiocb, acb); > + } > + > + if (ret < 0) { > + goto out; > + } > + return &acb->common; > + > +out: > + s->qemu_aio_count--; > + qemu_aio_release(acb); > + return NULL; > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + int ret; > + GlusterAIOCB *acb; > + BDRVGlusterState *s = bs->opaque; > + > + acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque); > + acb->size = 0; > + acb->ret = 0; > + acb->canceled = false; > + s->qemu_aio_count++; > + > + ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); > + if (ret < 0) { > + goto out; > + } > + return &acb->common; > + > +out: > + s->qemu_aio_count--; > + qemu_aio_release(acb); > + return NULL; > +} > + > +static int64_t qemu_gluster_getlength(BlockDriverState *bs) > +{ > + BDRVGlusterState *s = bs->opaque; > + struct stat st; > + int ret; > + > + ret = glfs_fstat(s->fd, &st); > + if (ret < 0) { > + return -errno; > + } else { > + return st.st_size; > + } > +} > + > +static void qemu_gluster_close(BlockDriverState *bs) > +{ > + BDRVGlusterState *s = bs->opaque; > + > + if (s->fd) { > + glfs_close(s->fd); > + s->fd = NULL; > + } > + glfs_fini(s->glfs); > +} > + > +static QEMUOptionParameter qemu_gluster_create_options[] = { > + { > + .name = BLOCK_OPT_SIZE, > + .type = OPT_SIZE, > + .help = "Virtual disk size" > + }, > + { NULL } > +}; > + > +static BlockDriver bdrv_gluster = { > + .format_name = "gluster", > + .protocol_name = "gluster", > + .instance_size = sizeof(BDRVGlusterState), > + .bdrv_file_open = qemu_gluster_open, > + .bdrv_close = qemu_gluster_close, > + .bdrv_create = qemu_gluster_create, > + .bdrv_getlength = qemu_gluster_getlength, > + > + .bdrv_aio_readv = qemu_gluster_aio_readv, > + .bdrv_aio_writev = qemu_gluster_aio_writev, > + .bdrv_aio_flush = qemu_gluster_aio_flush, > + > + .create_options = qemu_gluster_create_options, > +}; > + > +static void bdrv_gluster_init(void) > +{ > + bdrv_register(&bdrv_gluster); > +} > + > +block_init(bdrv_gluster_init);