Re: [Qemu-devel] [PATCH v6 2/2] block: Support GlusterFS as a QEMU block backend

Paolo Bonzini Thu, 06 Sep 2012 00:35:44 -0700

Il 09/08/2012 15:02, Bharata B Rao ha scritto:
> block: Support GlusterFS as a QEMU block backend.
> 
> From: Bharata B Rao <bhar...@linux.vnet.ibm.com>
> 
> This patch adds gluster as the new block backend in QEMU. This gives
> QEMU the ability to boot VM images from gluster volumes. Its already
> possible to boot from VM images on gluster volumes using FUSE mount, but
> this patchset provides the ability to boot VM images from gluster volumes
> by by-passing the FUSE layer in gluster. This is made possible by
> using libgfapi routines to perform IO on gluster volumes directly.
> 
> VM Image on gluster volume is specified like this:
> 
> file=gluster://server:[port]/volname/image[?transport=socket]
> 
> 'gluster' is the protocol.
> 
> 'server' specifies the server where the volume file specification for
> the given volume resides. This can be either hostname or ipv4 address
> or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> 
> port' is the port number on which gluster management daemon (glusterd) is
> listening. This is optional and if not specified, QEMU will send 0 which
> will make libgfapi to use the default port.
> 
> 'volname' is the name of the gluster volume which contains the VM image.
> 
> 'image' is the path to the actual VM image in the gluster volume.
> 
> 'transport' specifies the transport used to connect to glusterd. This is
> optional and if not specified, socket transport is used.
> 
> Examples:
> 
> file=gluster://1.2.3.4/testvol/a.img
> file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> file=gluster://server.domain.com:5000/testvol/dir/a.img
> 
> Signed-off-by: Bharata B Rao <bhar...@linux.vnet.ibm.com>
> Reviewed-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
> ---
> 
>  block/Makefile.objs |    1 
>  block/gluster.c     |  623 
> +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 624 insertions(+), 0 deletions(-)
>  create mode 100644 block/gluster.c
> 
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index b5754d3..a1ae67f 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o
>  block-obj-$(CONFIG_LIBISCSI) += iscsi.o
>  block-obj-$(CONFIG_CURL) += curl.o
>  block-obj-$(CONFIG_RBD) += rbd.o
> +block-obj-$(CONFIG_GLUSTERFS) += gluster.o
> diff --git a/block/gluster.c b/block/gluster.c
> new file mode 100644
> index 0000000..bbbaea8
> --- /dev/null
> +++ b/block/gluster.c
> @@ -0,0 +1,623 @@
> +/*
> + * GlusterFS backend for QEMU
> + *
> + * (AIO implementation is derived from block/rbd.c)
> + *
> + * Copyright (C) 2012 Bharata B Rao <bhar...@linux.vnet.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> + * (at your option) any later version. See the COPYING file in the top-level
> + * directory.
> + */
> +#include <glusterfs/api/glfs.h>
> +#include "block_int.h"
> +
> +typedef struct GlusterAIOCB {
> +    BlockDriverAIOCB common;
> +    bool canceled;
> +    int64_t size;
> +    int ret;
> +} GlusterAIOCB;
> +
> +typedef struct BDRVGlusterState {
> +    struct glfs *glfs;
> +    int fds[2];
> +    struct glfs_fd *fd;
> +    int qemu_aio_count;
> +} BDRVGlusterState;
> +
> +#define GLUSTER_FD_READ 0
> +#define GLUSTER_FD_WRITE 1
> +
> +typedef struct GlusterURI {
> +    char *server;
> +    int port;
> +    char *volname;
> +    char *image;
> +    char *transport;
> +} GlusterURI;
> +
> +static void qemu_gluster_uri_free(GlusterURI *uri)
> +{
> +    g_free(uri->server);
> +    g_free(uri->volname);
> +    g_free(uri->image);
> +    g_free(uri->transport);
> +    g_free(uri);
> +}
> +
> +/*
> + * We don't validate the transport option obtained here but
> + * instead depend on gluster to flag an error.
> + */
> +static int parse_transport(GlusterURI *uri, char *transport)
> +{
> +    char *token, *saveptr;
> +    int ret = -EINVAL;
> +
> +    if (!transport) {
> +        uri->transport = g_strdup("socket");
> +        ret = 0;
> +        goto out;
> +    }
> +
> +    token = strtok_r(transport, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    if (strcmp(token, "transport")) {
> +        goto out;
> +    }
> +    token = strtok_r(NULL, "=", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +    uri->transport = g_strdup(token);
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +static int parse_server(GlusterURI *uri, char *server)
> +{
> +    int ret = -EINVAL;
> +    char *token, *saveptr;
> +    char *p, *q = server;
> +
> +    p = strchr(server, '[');
> +    if (p) {
> +        /* [ipv6] */
> +        if (p != server) {
> +            /* [ not in the beginning */
> +            goto out;
> +        }
> +        q++;
> +        p = strrchr(p, ']');
> +        if (!p) {
> +            /* No matching ] */
> +            goto out;
> +        }
> +        *p++ = '\0';
> +        uri->server = g_strdup(q);
> +
> +        if (*p) {
> +            if (*p != ':') {
> +                /* [ipv6] followed by something other than : */
> +                goto out;
> +            }
> +            uri->port = strtoul(++p, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            /* port not specified, use default */
> +            uri->port = 0;
> +        }
> +
> +    } else {
> +        /* ipv4 or hostname */
> +        if (*server == ':') {
> +            /* port specified w/o a server */
> +            goto out;
> +        }
> +        token = strtok_r(server, ":", &saveptr);
> +        if (!token) {
> +            goto out;
> +        }
> +        uri->server = g_strdup(token);
> +        token = strtok_r(NULL, ":", &saveptr);
> +        if (token) {
> +            uri->port = strtoul(token, NULL, 0);
> +            if (uri->port < 0) {
> +                goto out;
> +            }
> +        } else {
> +            uri->port = 0;
> +        }
> +    }
> +    ret = 0;
> +out:
> +    return ret;
> +}
> +
> +/*
> + * file=gluster://server:[port]/volname/image[?transport=socket]
> + *
> + * 'gluster' is the protocol.
> + *
> + * 'server' specifies the server where the volume file specification for
> + * the given volume resides. This can be either hostname or ipv4 address
> + * or ipv6 address. ipv6 address needs to be with in square brackets [ ].
> + *
> + * 'port' is the port number on which gluster management daemon (glusterd) is
> + * listening. This is optional and if not specified, QEMU will send 0 which
> + * will make libgfapi to use the default port.
> + *
> + * 'volname' is the name of the gluster volume which contains the VM image.
> + *
> + * 'image' is the path to the actual VM image in the gluster volume.
> + *
> + * 'transport' specifies the transport used to connect to glusterd. This is
> + * optional and if not specified, socket transport is used.
> + *
> + * Examples:
> + *
> + * file=gluster://1.2.3.4/testvol/a.img
> + * file=gluster://1.2.3.4:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
> + * file=gluster://[1:2:3:4:5:6:7:8]:5000/testvol/dir/a.img?transport=socket
> + * file=gluster://server.domain.com:5000/testvol/dir/a.img
> + *
> + * We just do minimal checking of the gluster options and mostly ensure
> + * that all the expected elements of the URI are present. We depend on 
> libgfapi
> + * APIs to return appropriate errors in case of invalid arguments.
> + */
> +static int qemu_gluster_parseuri(GlusterURI *uri, const char *filename)
> +{
> +    char *token, *saveptr;
> +    char *p, *r;
> +    int ret = -EINVAL;
> +
> +    p = r = g_strdup(filename);
> +    if (strncmp(p, "gluster://", 10)) {
> +        goto out;
> +    }
> +
> +    /* Discard the protocol */
> +    p += 10;
> +
> +    /* server */
> +    token = strtok_r(p, "/", &saveptr);
> +    if (!token) {
> +        goto out;
> +    }
> +
> +    ret = parse_server(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /* volname */
> +    token = strtok_r(NULL, "/", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->volname = g_strdup(token);
> +
> +    /* image */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (!token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    uri->image = g_strdup(token);
> +
> +    /* transport */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    ret = parse_transport(uri, token);
> +    if (ret < 0) {
> +        goto out;
> +     }
> +
> +    /* Flag error for extra options */
> +    token = strtok_r(NULL, "?", &saveptr);
> +    if (token) {
> +        ret = -EINVAL;
> +        goto out;
> +    }
> +    ret = 0;
> +out:
> +    g_free(r);
> +    return ret;
> +}
> +
> +static struct glfs *qemu_gluster_init(GlusterURI *uri, const char *filename)
> +{
> +    struct glfs *glfs = NULL;
> +    int ret;
> +
> +    ret = qemu_gluster_parseuri(uri, filename);
> +    if (ret < 0) {
> +        error_report("Usage: file=gluster://server[:port]/volname/image"
> +            "[?transport=socket]");
> +        errno = -ret;
> +        goto out;
> +    }
> +
> +    glfs = glfs_new(uri->volname);
> +    if (!glfs) {
> +        goto out;
> +    }
> +
> +    ret = glfs_set_volfile_server(glfs, uri->transport, uri->server,
> +        uri->port);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    /*
> +     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
> +     * GlusterFS exports it in a header.
> +     */
> +    ret = glfs_set_logging(glfs, "-", 4);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +
> +    ret = glfs_init(glfs);
> +    if (ret) {
> +        error_report("Gluster connection failed for server=%s port=%d "
> +             "volume=%s image=%s transport=%s\n", uri->server, uri->port,
> +             uri->volname, uri->image, uri->transport);
> +        goto out;
> +    }
> +    return glfs;
> +
> +out:
> +    if (glfs) {
> +        glfs_fini(glfs);
> +    }
> +    return NULL;
> +}
> +
> +static void qemu_gluster_complete_aio(GlusterAIOCB *acb)
> +{
> +    int ret;
> +
> +    if (acb->canceled) {
> +        qemu_aio_release(acb);
> +        return;
> +    }
> +
> +    if (acb->ret == acb->size) {
> +        ret = 0; /* Success */
> +    } else if (acb->ret < 0) {
> +        ret = acb->ret; /* Read/Write failed */
> +    } else {
> +        ret = -EIO; /* Partial read/write - fail it */
> +    }
> +    acb->common.cb(acb->common.opaque, ret);
> +    qemu_aio_release(acb);
> +}
> +
> +static void qemu_gluster_aio_event_reader(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +    GlusterAIOCB *event_acb;
> +    int event_reader_pos = 0;
> +    ssize_t ret;
> +
> +    do {
> +        char *p = (char *)&event_acb;
> +
> +        ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos,
> +                   sizeof(event_acb) - event_reader_pos);
> +        if (ret > 0) {
> +            event_reader_pos += ret;
> +            if (event_reader_pos == sizeof(event_acb)) {
> +                event_reader_pos = 0;
> +                qemu_gluster_complete_aio(event_acb);
> +                s->qemu_aio_count--;
> +            }
> +        }
> +    } while (ret < 0 && errno == EINTR);
> +}
> +
> +static int qemu_gluster_aio_flush_cb(void *opaque)
> +{
> +    BDRVGlusterState *s = opaque;
> +
> +    return (s->qemu_aio_count > 0);
> +}
> +
> +static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
> +    int bdrv_flags)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    int open_flags = 0;
> +    int ret = 0;
> +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> +
> +    s->glfs = qemu_gluster_init(uri, filename);
> +    if (!s->glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    open_flags |=  O_BINARY;
> +    open_flags &= ~O_ACCMODE;
> +    if (bdrv_flags & BDRV_O_RDWR) {
> +        open_flags |= O_RDWR;
> +    } else {
> +        open_flags |= O_RDONLY;
> +    }
> +
> +    if ((bdrv_flags & BDRV_O_NOCACHE)) {
> +        open_flags |= O_DIRECT;
> +    }
> +
> +    s->fd = glfs_open(s->glfs, uri->image, open_flags);
> +    if (!s->fd) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    ret = qemu_pipe(s->fds);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    fcntl(s->fds[0], F_SETFL, O_NONBLOCK);
> +    fcntl(s->fds[1], F_SETFL, O_NONBLOCK);


A small thing I noticed while reviewing: since the write end of the pipe
is used from the gluster thread, you do not need to make this nonblocking.

Also, please use GLUSTER_FD_{READ,WRITE} instead.

> +    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
> +        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
> +
> +out:
> +    qemu_gluster_uri_free(uri);
> +    if (!ret) {
> +        return ret;
> +    }
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +    }
> +    if (s->glfs) {
> +        glfs_fini(s->glfs);
> +    }
> +    return ret;
> +}
> +
> +static int qemu_gluster_create(const char *filename,
> +        QEMUOptionParameter *options)
> +{
> +    struct glfs *glfs;
> +    struct glfs_fd *fd;
> +    int ret = 0;
> +    int64_t total_size = 0;
> +    GlusterURI *uri = g_malloc0(sizeof(GlusterURI));
> +
> +    glfs = qemu_gluster_init(uri, filename);
> +    if (!glfs) {
> +        ret = -errno;
> +        goto out;
> +    }
> +
> +    while (options && options->name) {
> +        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
> +            total_size = options->value.n / BDRV_SECTOR_SIZE;
> +        }
> +        options++;
> +    }
> +
> +    fd = glfs_creat(glfs, uri->image,
> +        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
> +    if (!fd) {
> +        ret = -errno;
> +    } else {
> +        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
> +            ret = -errno;
> +        }
> +        if (glfs_close(fd) != 0) {
> +            ret = -errno;
> +        }
> +    }
> +out:
> +    qemu_gluster_uri_free(uri);
> +    if (glfs) {
> +        glfs_fini(glfs);
> +    }
> +    return ret;
> +}
> +
> +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
> +{
> +    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
> +
> +    acb->common.cb(acb->common.opaque, -ECANCELED);
> +    acb->canceled = true;

I think this is wrong, because the write could still complete later and
undo the effects of a second write that is done by the guest.  That is:

   gluster                  QEMU                      guest
----------------------------------------------------------
                                             <---     write #1
                     <---   write #1
                                             <---     cancel write #1
                            write #1 canceled --->
                                             <---     write #2
                     <---   write #2
   write #2 completed --->
                            write #2 completed -->
   write #1 completed --->

Now, the persistent storage recorded the effect of write #1, but the
guest thinks that it recorded the effect of write #2 instead.

You can simply do qemu_aio_flush() here.

> +}
> +
> +static AIOPool gluster_aio_pool = {
> +    .aiocb_size = sizeof(GlusterAIOCB),
> +    .cancel = qemu_gluster_aio_cancel,
> +};
> +
> +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb)
> +{
> +    int ret = 0;
> +    while (1) {
> +        fd_set wfd;
> +        int fd = s->fds[GLUSTER_FD_WRITE];
> +
> +        ret = write(fd, (void *)&acb, sizeof(acb));
> +        if (ret >= 0) {
> +            break;
> +        }
> +        if (errno == EINTR) {
> +            continue;
> +        }
> +        if (errno != EAGAIN) {
> +            break;
> +        }
> +
> +        FD_ZERO(&wfd);
> +        FD_SET(fd, &wfd);
> +        do {
> +            ret = select(fd + 1, NULL, &wfd, NULL, NULL);
> +        } while (ret < 0 && errno == EINTR);

If you make the fd non-blocking, you can avoid the select here.

> +    }
> +    return ret;
> +}
> +
> +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
> +{
> +    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
> +    BDRVGlusterState *s = acb->common.bs->opaque;
> +
> +    acb->ret = ret;
> +    if (qemu_gluster_send_pipe(s, acb) < 0) {
> +        /*
> +         * Gluster AIO callback thread failed to notify the waiting
> +         * QEMU thread about IO completion. Nothing much can be done
> +         * here but to abruptly abort.
> +         *
> +         * FIXME: Check if the read side of the fd handler can somehow
> +         * be notified of this failure paving the way for a graceful exit.
> +         */
> +        error_report("Gluster failed to notify QEMU about IO completion");
> +        abort();

We can fix it later with a list of AIOCBs that are ready to process (and
a QemuMutex to protect the list).  An EventNotifier can trigger the read
handler to examine the list.

> +    }
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque, int write)
> +{
> +    int ret;
> +    GlusterAIOCB *acb;
> +    BDRVGlusterState *s = bs->opaque;
> +    size_t size;
> +    off_t offset;
> +
> +    offset = sector_num * BDRV_SECTOR_SIZE;
> +    size = nb_sectors * BDRV_SECTOR_SIZE;
> +    s->qemu_aio_count++;
> +
> +    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
> +    acb->size = size;
> +    acb->ret = 0;
> +    acb->canceled = false;
> +
> +    if (write) {
> +        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
> +            &gluster_finish_aiocb, acb);
> +    } else {
> +        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
> +            &gluster_finish_aiocb, acb);
> +    }
> +
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return &acb->common;
> +
> +out:
> +    s->qemu_aio_count--;
> +    qemu_aio_release(acb);
> +    return NULL;
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 
> 0);
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
> +        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 
> 1);
> +}
> +
> +static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
> +        BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    int ret;
> +    GlusterAIOCB *acb;
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
> +    acb->size = 0;
> +    acb->ret = 0;
> +    acb->canceled = false;
> +    s->qemu_aio_count++;
> +
> +    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
> +    if (ret < 0) {
> +        goto out;
> +    }
> +    return &acb->common;
> +
> +out:
> +    s->qemu_aio_count--;
> +    qemu_aio_release(acb);
> +    return NULL;
> +}
> +
> +static int64_t qemu_gluster_getlength(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +    struct stat st;
> +    int ret;
> +
> +    ret = glfs_fstat(s->fd, &st);
> +    if (ret < 0) {
> +        return -errno;
> +    } else {
> +        return st.st_size;
> +    }
> +}
> +
> +static void qemu_gluster_close(BlockDriverState *bs)
> +{
> +    BDRVGlusterState *s = bs->opaque;
> +
> +    if (s->fd) {
> +        glfs_close(s->fd);
> +        s->fd = NULL;
> +    }
> +    glfs_fini(s->glfs);
> +}
> +
> +static QEMUOptionParameter qemu_gluster_create_options[] = {
> +    {
> +        .name = BLOCK_OPT_SIZE,
> +        .type = OPT_SIZE,
> +        .help = "Virtual disk size"
> +    },
> +    { NULL }
> +};
> +
> +static BlockDriver bdrv_gluster = {
> +    .format_name = "gluster",
> +    .protocol_name = "gluster",
> +    .instance_size = sizeof(BDRVGlusterState),
> +    .bdrv_file_open = qemu_gluster_open,
> +    .bdrv_close = qemu_gluster_close,
> +    .bdrv_create = qemu_gluster_create,
> +    .bdrv_getlength = qemu_gluster_getlength,
> +
> +    .bdrv_aio_readv = qemu_gluster_aio_readv,
> +    .bdrv_aio_writev = qemu_gluster_aio_writev,
> +    .bdrv_aio_flush = qemu_gluster_aio_flush,
> +
> +    .create_options = qemu_gluster_create_options,
> +};
> +
> +static void bdrv_gluster_init(void)
> +{
> +    bdrv_register(&bdrv_gluster);
> +}
> +
> +block_init(bdrv_gluster_init);
> 
> 
>

Re: [Qemu-devel] [PATCH v6 2/2] block: Support GlusterFS as a QEMU block backend

Reply via email to