On Thu, 15 May 2014 14:33:18 +0000
Serge Hallyn <serge.hal...@ubuntu.com> wrote:

> Backing stores supported by qemu-nbd can be attached to an nbd block
> device using qemu-nbd.  This user-space process (pair) stays around
> for the duration of the device attachment.  Obviously we want it to
> go away when the container shuts down, but not before the filesystems
> have been cleanly unmounted.
> 
> The device attachment is done from the task which will become the
> container monitor before the container setup+init task is spawned.
> That task starts in a new pid namespace to ensure that the qemu-nbd
> process will be killed if need be.  It sets its parent death signal
> to SIGHUP, and, on receiving SIGHUP, attempts to do a clean
> qemu-nbd device detach, then exits.  This should ensure that the
> device is detached if the container monitor crashes or exits.
> 
> It may be worth adding a delay before the qemu-nbd is detached, but
> my brief tests haven't seen any data corruption.
> 
> Only the parts required for running a nbd-backed container are
> implemented here.  Create, destroy, and clone are not.  The first
> use of this that I imagine is for people to use downloaded nbd-backed
> images (like ubuntu cloud images, or anything previously used with
> qemu).  I imagine people will want to create/clone/destroy out of
> band using qemu-img, but if I'm wrong about that we can implement
> the rest later.
> 
> Because attach_block_device() is done before the bdev is initialized,
> and bdev_init needs to know the nbd index so that it can mount the
> filesystem, we now need to pass the lxc_conf.
> 
> file_exists() is moved to utils.c so we can use it from bdev.c
> 
> The nbd attach/detach should lay the groundwork for trivial
> implementation of qed and raw images.
> 
> changelog (may 12): fix idx check at detach
> changelog (may 15): generalize qcow2 to nbd
> 
> Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com>

Acked-by: Dwight Engen <dwight.en...@oracle.com>

> ---
>  src/lxc/bdev.c         | 293 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  src/lxc/bdev.h         |  17 ++-
>  src/lxc/conf.c         |   3 +-
>  src/lxc/conf.h         |   1 +
>  src/lxc/lxccontainer.c |  19 +---
>  src/lxc/start.c        |  11 +-
>  src/lxc/utils.c        |   7 ++
>  src/lxc/utils.h        |   1 +
>  8 files changed, 329 insertions(+), 23 deletions(-)
> 
> diff --git a/src/lxc/bdev.c b/src/lxc/bdev.c
> index 20e9fb3..e22d83d 100644
> --- a/src/lxc/bdev.c
> +++ b/src/lxc/bdev.c
> @@ -41,6 +41,7 @@
>  #include <libgen.h>
>  #include <linux/loop.h>
>  #include <dirent.h>
> +#include <sys/prctl.h>
>  
>  #include "lxc.h"
>  #include "config.h"
> @@ -2410,6 +2411,287 @@ static const struct bdev_ops aufs_ops = {
>       .can_snapshot = true,
>  };
>  
> +//
> +// nbd dev ops
> +//
> +
> +static int nbd_detect(const char *path)
> +{
> +     if (strncmp(path, "nbd:", 4) == 0)
> +             return 1;
> +     return 0;
> +}
> +
> +struct nbd_attach_data {
> +     const char *nbd;
> +     const char *path;
> +};
> +
> +static void nbd_detach(const char *path)
> +{
> +     int ret;
> +     pid_t pid = fork();
> +
> +     if (pid < 0) {
> +             SYSERROR("Error forking to detach nbd");
> +             return;
> +     }
> +     if (pid) {
> +             ret = wait_for_pid(pid);
> +             if (ret < 0)
> +                     ERROR("nbd disconnect returned an error");
> +             return;
> +     }
> +     execlp("qemu-nbd", "qemu-nbd", "-d", path, NULL);
> +     SYSERROR("Error executing qemu-nbd");
> +     exit(1);
> +}
> +
> +static int do_attach_nbd(void *d)
> +{
> +     struct nbd_attach_data *data = d;
> +     const char *nbd, *path;
> +     pid_t pid;
> +     sigset_t mask;
> +     int sfd;
> +     ssize_t s;
> +     struct signalfd_siginfo fdsi;
> +
> +     sigemptyset(&mask);
> +     sigaddset(&mask, SIGHUP);
> +     sigaddset(&mask, SIGCHLD);
> +
> +     nbd = data->nbd;
> +     path = data->path;
> +
> +     if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) {
> +             SYSERROR("Error blocking signals for nbd watcher");
> +             exit(1);
> +     }
> +
> +     sfd = signalfd(-1, &mask, 0);
> +     if (sfd == -1) {
> +             SYSERROR("Error opening signalfd for nbd task");
> +             exit(1);
> +     }
> +
> +     if (prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0) < 0)
> +             SYSERROR("Error setting parent death signal for nbd
> watcher"); +
> +     pid = fork();
> +     if (pid) {
> +             for (;;) {
> +                     s = read(sfd, &fdsi, sizeof(struct
> signalfd_siginfo));
> +                     if (s != sizeof(struct signalfd_siginfo))
> +                             SYSERROR("Error reading from
> signalfd"); +
> +                     if (fdsi.ssi_signo == SIGHUP) {
> +                             /* container has exited */
> +                             nbd_detach(nbd);
> +                             exit(0);
> +                     } else if (fdsi.ssi_signo == SIGCHLD) {
> +                             int status;
> +                             while (waitpid(-1, &status, WNOHANG)
> > 0);
> +                     }
> +             }
> +     }
> +
> +     close(sfd);
> +     if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1)
> +             WARN("Warning: unblocking signals for nbd watcher");
> +
> +     execlp("qemu-nbd", "qemu-nbd", "-c", nbd, path, NULL);
> +     SYSERROR("Error executing qemu-nbd");
> +     exit(1);
> +}
> +
> +static bool clone_attach_nbd(const char *nbd, const char *path)
> +{
> +     pid_t pid;
> +     struct nbd_attach_data data;
> +
> +     data.nbd = nbd;
> +     data.path = path;
> +
> +     pid = lxc_clone(do_attach_nbd, &data, CLONE_NEWPID);
> +     if (pid < 0)
> +             return false;
> +     return true;
> +}
> +
> +static bool nbd_busy(int idx)
> +{
> +     char path[100];
> +     int ret;
> +
> +     ret = snprintf(path, 100, "/sys/block/nbd%d/pid", idx);
> +     if (ret < 0 || ret >= 100)
> +             return true;
> +     return file_exists(path);
> +}
> +
> +static bool attach_nbd(char *src, struct lxc_conf *conf)
> +{
> +     char *orig = alloca(strlen(src)+1), *p, path[50];
> +     int i = 0;
> +
> +     strcpy(orig, src);
> +     /* if path is followed by a partition, drop that for now */
> +     p = strchr(orig, ':');
> +     if (p)
> +             *p = '\0';
> +     while (1) {
> +             sprintf(path, "/dev/nbd%d", i);
> +             if (!file_exists(path))
> +                     return false;
> +             if (nbd_busy(i)) {
> +                     i++;
> +                     continue;
> +             }
> +             if (!clone_attach_nbd(path, orig))
> +                     return false;
> +             conf->nbd_idx = i;
> +             return true;
> +     }
> +}
> +
> +static bool requires_nbd(const char *path)
> +{
> +     if (strncmp(path, "nbd:", 4) == 0)
> +             return true;
> +     return false;
> +}
> +
> +/*
> + * attach_block_device returns true if all went well,
> + * meaning either a block device was attached or was not
> + * needed.  It returns false if something went wrong and
> + * container startup should be stopped.
> + */
> +bool attach_block_device(struct lxc_conf *conf)
> +{
> +     char *path;
> +
> +     if (!conf->rootfs.path)
> +             return true;
> +     path = conf->rootfs.path;
> +     if (!requires_nbd(path))
> +             return true;
> +     path = strchr(path, ':');
> +     if (!path)
> +             return false;
> +     path++;
> +     if (!attach_nbd(path, conf))
> +             return false;
> +     return true;
> +}
> +
> +void detach_nbd_idx(int idx)
> +{
> +     int ret;
> +     char path[50];
> +
> +     ret = snprintf(path, 50, "/dev/nbd%d", idx);
> +     if (ret < 0 || ret >= 50)
> +             return;
> +
> +     nbd_detach(path);
> +}
> +
> +void detach_block_device(struct lxc_conf *conf)
> +{
> +     if (conf->nbd_idx != -1)
> +             detach_nbd_idx(conf->nbd_idx);
> +}
> +
> +/*
> + * Pick the partition # off the end of a nbd:file:p
> + * description.  Return 1-9 for the partition id, or 0
> + * for no partition.
> + */
> +static int nbd_get_partition(const char *src)
> +{
> +     char *p = strchr(src, ':');
> +     if (!p)
> +             return 0;
> +     p = strchr(p+1, ':');
> +     if (!p)
> +             return 0;
> +     p++;
> +     if (*p < '1' && *p > '9')
> +             return 0;
> +     return *p - '0';
> +}
> +
> +static int nbd_mount(struct bdev *bdev)
> +{
> +     int ret = -1, partition;
> +     char path[50];
> +
> +     if (strcmp(bdev->type, "nbd"))
> +             return -22;
> +     if (!bdev->src || !bdev->dest)
> +             return -22;
> +
> +     /* nbd_idx should have been copied by bdev_init from the
> lxc_conf */
> +     if (bdev->nbd_idx < 0)
> +             return -22;
> +     partition = nbd_get_partition(bdev->src);
> +     if (partition)
> +             ret = snprintf(path, 50, "/dev/nbd%dp%d",
> bdev->nbd_idx,
> +                             partition);
> +     else
> +             ret = snprintf(path, 50, "/dev/nbd%d",
> bdev->nbd_idx);
> +     if (ret < 0 || ret >= 50) {
> +             ERROR("Error setting up nbd device path");
> +             return ret;
> +     }
> +     ret = mount_unknown_fs(path, bdev->dest, bdev->mntopts);
> +     if (ret < 0)
> +             ERROR("Error mounting %s", bdev->src);
> +
> +     return ret;
> +}
> +
> +static int nbd_create(struct bdev *bdev, const char *dest, const
> char *n,
> +                     struct bdev_specs *specs)
> +{
> +     return -ENOSYS;
> +}
> +
> +static int nbd_clonepaths(struct bdev *orig, struct bdev *new, const
> char *oldname,
> +             const char *cname, const char *oldpath, const char
> *lxcpath, int snap,
> +             uint64_t newsize, struct lxc_conf *conf)
> +{
> +     return -ENOSYS;
> +}
> +
> +static int nbd_destroy(struct bdev *orig)
> +{
> +     return -ENOSYS;
> +}
> +
> +static int nbd_umount(struct bdev *bdev)
> +{
> +     int ret;
> +
> +     if (strcmp(bdev->type, "nbd"))
> +             return -22;
> +     if (!bdev->src || !bdev->dest)
> +             return -22;
> +     ret = umount(bdev->dest);
> +     return ret;
> +}
> +
> +static const struct bdev_ops nbd_ops = {
> +     .detect = &nbd_detect,
> +     .mount = &nbd_mount,
> +     .umount = &nbd_umount,
> +     .clone_paths = &nbd_clonepaths,
> +     .destroy = &nbd_destroy,
> +     .create = &nbd_create,
> +     .can_snapshot = true,
> +};
>  
>  static const struct bdev_type bdevs[] = {
>       {.name = "zfs", .ops = &zfs_ops,},
> @@ -2419,6 +2701,7 @@ static const struct bdev_type bdevs[] = {
>       {.name = "aufs", .ops = &aufs_ops,},
>       {.name = "overlayfs", .ops = &overlayfs_ops,},
>       {.name = "loop", .ops = &loop_ops,},
> +     {.name = "nbd", .ops = &nbd_ops,},
>  };
>  
>  static const size_t numbdevs = sizeof(bdevs) / sizeof(struct
> bdev_type); @@ -2454,7 +2737,7 @@ struct bdev *bdev_get(const char
> *type) return bdev;
>  }
>  
> -struct bdev *bdev_init(const char *src, const char *dst, const char
> *mntopts) +struct bdev *bdev_init(struct lxc_conf *conf, const char
> *src, const char *dst, const char *mntopts) {
>       int i;
>       struct bdev *bdev;
> @@ -2480,6 +2763,8 @@ struct bdev *bdev_init(const char *src, const
> char *dst, const char *mntopts) bdev->src = strdup(src);
>       if (dst)
>               bdev->dest = strdup(dst);
> +     if (strcmp(bdev->type, "nbd") == 0)
> +             bdev->nbd_idx = conf->nbd_idx;
>  
>       return bdev;
>  }
> @@ -2538,9 +2823,9 @@ static int rsync_rootfs_wrapper(void *data)
>       return rsync_rootfs(arg);
>  }
>  
> -bool bdev_is_dir(const char *path)
> +bool bdev_is_dir(struct lxc_conf *conf, const char *path)
>  {
> -     struct bdev *orig = bdev_init(path, NULL, NULL);
> +     struct bdev *orig = bdev_init(conf, path, NULL, NULL);
>       bool ret = false;
>       if (!orig)
>               return ret;
> @@ -2605,7 +2890,7 @@ struct bdev *bdev_copy(struct lxc_container
> *c0, const char *cname, return NULL;
>       }
>  
> -     orig = bdev_init(src, NULL, NULL);
> +     orig = bdev_init(c0->lxc_conf, src, NULL, NULL);
>       if (!orig) {
>               ERROR("failed to detect blockdev type for %s", src);
>               return NULL;
> diff --git a/src/lxc/bdev.h b/src/lxc/bdev.h
> index cc0bf02..0893c11 100644
> --- a/src/lxc/bdev.h
> +++ b/src/lxc/bdev.h
> @@ -24,8 +24,7 @@
>  #ifndef __LXC_BDEV_H
>  #define __LXC_BDEV_H
>  /* blockdev operations for:
> - * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs
> - * someday: qemu-nbd, qcow2, qed
> + * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs, nbd
> (qcow2, raw, vdi, qed) */
>  
>  #include "config.h"
> @@ -83,11 +82,13 @@ struct bdev {
>       // turn the following into a union if need be
>       // lofd is the open fd for the mounted loopback file
>       int lofd;
> +     // index for the connected nbd device
> +     int nbd_idx;
>  };
>  
>  char *overlay_getlower(char *p);
>  
> -bool bdev_is_dir(const char *path);
> +bool bdev_is_dir(struct lxc_conf *conf, const char *path);
>  
>  /*
>   * Instantiate a bdev object.  The src is used to determine which
> blockdev @@ -100,7 +101,8 @@ bool bdev_is_dir(const char *path);
>   * use /var/lib/lxc/canonical/rootfs as lower dir,
> and /var/lib/lxc/c1/delta
>   * as the upper, writeable layer.
>   */
> -struct bdev *bdev_init(const char *src, const char *dst, const char
> *data); +struct bdev *bdev_init(struct lxc_conf *conf, const char
> *src, const char *dst,
> +                     const char *data);
>  
>  struct bdev *bdev_copy(struct lxc_container *c0, const char *cname,
>                       const char *lxcpath, const char *bdevtype,
> @@ -110,6 +112,13 @@ struct bdev *bdev_create(const char *dest, const
> char *type, const char *cname, struct bdev_specs *specs);
>  void bdev_put(struct bdev *bdev);
>  
> +/*
> + * these are really for qemu-nbd support, as container shutdown
> + * must explicitly request device detach.
> + */
> +bool attach_block_device(struct lxc_conf *conf);
> +void detach_block_device(struct lxc_conf *conf);
> +
>  /* define constants if the kernel/glibc headers don't define them */
>  #ifndef MS_DIRSYNC
>  #define MS_DIRSYNC  128
> diff --git a/src/lxc/conf.c b/src/lxc/conf.c
> index 78d9de2..7427a94 100644
> --- a/src/lxc/conf.c
> +++ b/src/lxc/conf.c
> @@ -1555,7 +1555,7 @@ static int setup_rootfs(struct lxc_conf *conf)
>       }
>  
>       // First try mounting rootfs using a bdev
> -     struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount,
> rootfs->options);
> +     struct bdev *bdev = bdev_init(conf, rootfs->path,
> rootfs->mount, rootfs->options); if (bdev && bdev->ops->mount(bdev)
> == 0) { bdev_put(bdev);
>               DEBUG("mounted '%s' on '%s'", rootfs->path,
> rootfs->mount); @@ -2675,6 +2675,7 @@ struct lxc_conf
> *lxc_conf_init(void) new->console.slave = -1;
>       new->console.name[0] = '\0';
>       new->maincmd_fd = -1;
> +     new->nbd_idx = -1;
>       new->rootfs.mount = strdup(default_rootfs_mount);
>       if (!new->rootfs.mount) {
>               ERROR("lxc_conf_init : %m");
> diff --git a/src/lxc/conf.h b/src/lxc/conf.h
> index 865b87a..3a81d0e 100644
> --- a/src/lxc/conf.h
> +++ b/src/lxc/conf.h
> @@ -334,6 +334,7 @@ struct lxc_conf {
>       int start_delay;
>       int start_order;
>       struct lxc_list groups;
> +     int nbd_idx;
>  };
>  
>  int run_lxc_hooks(const char *name, char *hook, struct lxc_conf
> *conf, diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c
> index 255fde5..fdac433 100644
> --- a/src/lxc/lxccontainer.c
> +++ b/src/lxc/lxccontainer.c
> @@ -82,13 +82,6 @@ return -1;
>  
>  lxc_log_define(lxc_container, lxc);
>  
> -static bool file_exists(const char *f)
> -{
> -     struct stat statbuf;
> -
> -     return stat(f, &statbuf) == 0;
> -}
> -
>  static bool config_file_exists(const char *lxcpath, const char
> *cname) {
>       /* $lxcpath + '/' + $cname + '/config' + \0 */
> @@ -900,7 +893,7 @@ static bool create_run_template(struct
> lxc_container *c, char *tpath, bool quiet if (strncmp(src, "aufs:",
> 5) == 0) src = overlay_getlower(src+5);
>  
> -             bdev = bdev_init(src, c->lxc_conf->rootfs.mount,
> NULL);
> +             bdev = bdev_init(c->lxc_conf, src,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev) {
>                       ERROR("Error opening rootfs");
>                       exit(1);
> @@ -1992,7 +1985,7 @@ static int do_bdev_destroy(struct lxc_conf
> *conf) struct bdev *r;
>       int ret = 0;
>  
> -     r = bdev_init(conf->rootfs.path, conf->rootfs.mount, NULL);
> +     r = bdev_init(conf, conf->rootfs.path, conf->rootfs.mount,
> NULL); if (!r)
>               return -1;
>  
> @@ -2522,7 +2515,7 @@ static int clone_update_rootfs(struct
> clone_update_data *data) 
>       if (unshare(CLONE_NEWNS) < 0)
>               return -1;
> -     bdev = bdev_init(c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL);
> +     bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev)
>               return -1;
>       if (strcmp(bdev->type, "dir") != 0) {
> @@ -2787,7 +2780,7 @@ static bool lxcapi_rename(struct lxc_container
> *c, const char *newname) if (!c || !c->name || !c->config_path
> || !c->lxc_conf) return false;
>  
> -     bdev = bdev_init(c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL);
> +     bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev) {
>               ERROR("Failed to find original backing store type");
>               return false;
> @@ -2880,7 +2873,7 @@ static int lxcapi_snapshot(struct lxc_container
> *c, const char *commentfile) */
>       flags = LXC_CLONE_SNAPSHOT | LXC_CLONE_KEEPMACADDR |
> LXC_CLONE_KEEPNAME | LXC_CLONE_KEEPBDEVTYPE |
> LXC_CLONE_MAYBE_SNAPSHOT;
> -     if (bdev_is_dir(c->lxc_conf->rootfs.path)) {
> +     if (bdev_is_dir(c->lxc_conf, c->lxc_conf->rootfs.path)) {
>               ERROR("Snapshot of directory-backed container
> requested."); ERROR("Making a copy-clone.  If you do want snapshots,
> then"); ERROR("please create an aufs or overlayfs clone first,
> snapshot that"); @@ -3082,7 +3075,7 @@ static bool
> lxcapi_snapshot_restore(struct lxc_container *c, const char *snapnam
> if (!c || !c->name || !c->config_path) return false;
>  
> -     bdev = bdev_init(c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL);
> +     bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path,
> c->lxc_conf->rootfs.mount, NULL); if (!bdev) {
>               ERROR("Failed to find original backing store type");
>               return false;
> diff --git a/src/lxc/start.c b/src/lxc/start.c
> index df1304a..a7fb1d3 100644
> --- a/src/lxc/start.c
> +++ b/src/lxc/start.c
> @@ -69,6 +69,7 @@
>  #include "namespace.h"
>  #include "lxcseccomp.h"
>  #include "caps.h"
> +#include "bdev.h"
>  #include "lsm/lsm.h"
>  
>  lxc_log_define(lxc_start, lxc);
> @@ -1054,10 +1055,15 @@ int __lxc_start(const char *name, struct
> lxc_conf *conf, handler->conf->need_utmp_watch = 0;
>       }
>  
> +     if (!attach_block_device(handler->conf)) {
> +             ERROR("Failure attaching block device");
> +             goto out_fini_nonet;
> +     }
> +
>       err = lxc_spawn(handler);
>       if (err) {
>               ERROR("failed to spawn '%s'", name);
> -             goto out_fini_nonet;
> +             goto out_detach_blockdev;
>       }
>  
>       netnsfd = get_netns_fd(handler->pid);
> @@ -1110,6 +1116,9 @@ int __lxc_start(const char *name, struct
> lxc_conf *conf, out_fini:
>       lxc_delete_network(handler);
>  
> +out_detach_blockdev:
> +     detach_block_device(handler->conf);
> +
>  out_fini_nonet:
>       lxc_fini(name, handler);
>       return err;
> diff --git a/src/lxc/utils.c b/src/lxc/utils.c
> index efec414..b076ce7 100644
> --- a/src/lxc/utils.c
> +++ b/src/lxc/utils.c
> @@ -1306,3 +1306,10 @@ next_loop:
>       free(path);
>       return NULL;
>  }
> +
> +bool file_exists(const char *f)
> +{
> +     struct stat statbuf;
> +
> +     return stat(f, &statbuf) == 0;
> +}
> diff --git a/src/lxc/utils.h b/src/lxc/utils.h
> index b5e054c..9c618b7 100644
> --- a/src/lxc/utils.h
> +++ b/src/lxc/utils.h
> @@ -280,3 +280,4 @@ uint64_t fnv_64a_buf(void *buf, size_t len,
> uint64_t hval); int detect_shared_rootfs(void);
>  int detect_ramfs_rootfs(void);
>  char *on_path(char *cmd);
> +bool file_exists(const char *f);

_______________________________________________
lxc-devel mailing list
lxc-devel@lists.linuxcontainers.org
http://lists.linuxcontainers.org/listinfo/lxc-devel

Reply via email to