On Thu, 15 May 2014 14:33:18 +0000 Serge Hallyn <serge.hal...@ubuntu.com> wrote:
> Backing stores supported by qemu-nbd can be attached to an nbd block > device using qemu-nbd. This user-space process (pair) stays around > for the duration of the device attachment. Obviously we want it to > go away when the container shuts down, but not before the filesystems > have been cleanly unmounted. > > The device attachment is done from the task which will become the > container monitor, before the container setup+init task is spawned. > That task spawns a watcher task in a new pid namespace to ensure that > the qemu-nbd process will be killed if need be. The watcher sets its > parent death signal to SIGHUP and, on receiving SIGHUP, attempts to do > a clean qemu-nbd device detach, then exits. This should ensure that the > device is detached if the container monitor crashes or exits. > > It may be worth adding a delay before the qemu-nbd device is detached, but > my brief tests haven't seen any data corruption. > > Only the parts required for running an nbd-backed container are > implemented here. Create, destroy, and clone are not. The first > use of this that I imagine is for people to use downloaded nbd-backed > images (like Ubuntu cloud images, or anything previously used with > qemu). I imagine people will want to create/clone/destroy out of > band using qemu-img, but if I'm wrong about that we can implement > the rest later. > > Because attach_block_device() is done before the bdev is initialized, > and bdev_init() needs to know the nbd index so that it can mount the > filesystem, we now need to pass the lxc_conf to bdev_init(). > > file_exists() is moved to utils.c so we can use it from bdev.c. > > The nbd attach/detach should lay the groundwork for trivial > implementation of qed and raw images.
> > changelog (may 12): fix idx check at detach > changelog (may 15): generalize qcow2 to nbd > > Signed-off-by: Serge Hallyn <serge.hal...@ubuntu.com> Acked-by: Dwight Engen <dwight.en...@oracle.com> > --- > src/lxc/bdev.c | 293 > ++++++++++++++++++++++++++++++++++++++++++++++++- > src/lxc/bdev.h | 17 ++- src/lxc/conf.c | 3 +- > src/lxc/conf.h | 1 + > src/lxc/lxccontainer.c | 19 +--- > src/lxc/start.c | 11 +- > src/lxc/utils.c | 7 ++ > src/lxc/utils.h | 1 + > 8 files changed, 329 insertions(+), 23 deletions(-) > > diff --git a/src/lxc/bdev.c b/src/lxc/bdev.c > index 20e9fb3..e22d83d 100644 > --- a/src/lxc/bdev.c > +++ b/src/lxc/bdev.c > @@ -41,6 +41,7 @@ > #include <libgen.h> > #include <linux/loop.h> > #include <dirent.h> > +#include <sys/prctl.h> > > #include "lxc.h" > #include "config.h" > @@ -2410,6 +2411,287 @@ static const struct bdev_ops aufs_ops = { > .can_snapshot = true, > }; > > +// > +// nbd dev ops > +// > + > +static int nbd_detect(const char *path) > +{ > + if (strncmp(path, "nbd:", 4) == 0) > + return 1; > + return 0; > +} > + > +struct nbd_attach_data { > + const char *nbd; > + const char *path; > +}; > + > +static void nbd_detach(const char *path) > +{ > + int ret; > + pid_t pid = fork(); > + > + if (pid < 0) { > + SYSERROR("Error forking to detach nbd"); > + return; > + } > + if (pid) { > + ret = wait_for_pid(pid); > + if (ret < 0) > + ERROR("nbd disconnect returned an error"); > + return; > + } > + execlp("qemu-nbd", "qemu-nbd", "-d", path, NULL); > + SYSERROR("Error executing qemu-nbd"); > + exit(1); > +} > + > +static int do_attach_nbd(void *d) > +{ > + struct nbd_attach_data *data = d; > + const char *nbd, *path; > + pid_t pid; > + sigset_t mask; > + int sfd; > + ssize_t s; > + struct signalfd_siginfo fdsi; > + > + sigemptyset(&mask); > + sigaddset(&mask, SIGHUP); > + sigaddset(&mask, SIGCHLD); > + > + nbd = data->nbd; > + path = data->path; > + > + if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1) { > + SYSERROR("Error blocking signals for nbd 
watcher"); > + exit(1); > + } > + > + sfd = signalfd(-1, &mask, 0); > + if (sfd == -1) { > + SYSERROR("Error opening signalfd for nbd task"); > + exit(1); > + } > + > + if (prctl(PR_SET_PDEATHSIG, SIGHUP, 0, 0, 0) < 0) > + SYSERROR("Error setting parent death signal for nbd > watcher"); + > + pid = fork(); > + if (pid) { > + for (;;) { > + s = read(sfd, &fdsi, sizeof(struct > signalfd_siginfo)); > + if (s != sizeof(struct signalfd_siginfo)) > + SYSERROR("Error reading from > signalfd"); + > + if (fdsi.ssi_signo == SIGHUP) { > + /* container has exited */ > + nbd_detach(nbd); > + exit(0); > + } else if (fdsi.ssi_signo == SIGCHLD) { > + int status; > + while (waitpid(-1, &status, WNOHANG) > > 0); > + } > + } > + } > + > + close(sfd); > + if (sigprocmask(SIG_UNBLOCK, &mask, NULL) == -1) > + WARN("Warning: unblocking signals for nbd watcher"); > + > + execlp("qemu-nbd", "qemu-nbd", "-c", nbd, path, NULL); > + SYSERROR("Error executing qemu-nbd"); > + exit(1); > +} > + > +static bool clone_attach_nbd(const char *nbd, const char *path) > +{ > + pid_t pid; > + struct nbd_attach_data data; > + > + data.nbd = nbd; > + data.path = path; > + > + pid = lxc_clone(do_attach_nbd, &data, CLONE_NEWPID); > + if (pid < 0) > + return false; > + return true; > +} > + > +static bool nbd_busy(int idx) > +{ > + char path[100]; > + int ret; > + > + ret = snprintf(path, 100, "/sys/block/nbd%d/pid", idx); > + if (ret < 0 || ret >= 100) > + return true; > + return file_exists(path); > +} > + > +static bool attach_nbd(char *src, struct lxc_conf *conf) > +{ > + char *orig = alloca(strlen(src)+1), *p, path[50]; > + int i = 0; > + > + strcpy(orig, src); > + /* if path is followed by a partition, drop that for now */ > + p = strchr(orig, ':'); > + if (p) > + *p = '\0'; > + while (1) { > + sprintf(path, "/dev/nbd%d", i); > + if (!file_exists(path)) > + return false; > + if (nbd_busy(i)) { > + i++; > + continue; > + } > + if (!clone_attach_nbd(path, orig)) > + return false; > + conf->nbd_idx = i; > 
+ return true; > + } > +} > + > +static bool requires_nbd(const char *path) > +{ > + if (strncmp(path, "nbd:", 4) == 0) > + return true; > + return false; > +} > + > +/* > + * attach_block_device returns true if all went well, > + * meaning either a block device was attached or was not > + * needed. It returns false if something went wrong and > + * container startup shoudl be stopped. > + */ > +bool attach_block_device(struct lxc_conf *conf) > +{ > + char *path; > + > + if (!conf->rootfs.path) > + return true; > + path = conf->rootfs.path; > + if (!requires_nbd(path)) > + return true; > + path = strchr(path, ':'); > + if (!path) > + return false; > + path++; > + if (!attach_nbd(path, conf)) > + return false; > + return true; > +} > + > +void detach_nbd_idx(int idx) > +{ > + int ret; > + char path[50]; > + > + ret = snprintf(path, 50, "/dev/nbd%d", idx); > + if (ret < 0 || ret >= 50) > + return; > + > + nbd_detach(path); > +} > + > +void detach_block_device(struct lxc_conf *conf) > +{ > + if (conf->nbd_idx != -1) > + detach_nbd_idx(conf->nbd_idx); > +} > + > +/* > + * Pick the partition # off the end of a nbd:file:p > + * description. Return 1-9 for the partition id, or 0 > + * for no partition. 
> + */ > +static int nbd_get_partition(const char *src) > +{ > + char *p = strchr(src, ':'); > + if (!p) > + return 0; > + p = strchr(p+1, ':'); > + if (!p) > + return 0; > + p++; > + if (*p < '1' && *p > '9') > + return 0; > + return *p - '0'; > +} > + > +static int nbd_mount(struct bdev *bdev) > +{ > + int ret = -1, partition; > + char path[50]; > + > + if (strcmp(bdev->type, "nbd")) > + return -22; > + if (!bdev->src || !bdev->dest) > + return -22; > + > + /* nbd_idx should have been copied by bdev_init from the > lxc_conf */ > + if (bdev->nbd_idx < 0) > + return -22; > + partition = nbd_get_partition(bdev->src); > + if (partition) > + ret = snprintf(path, 50, "/dev/nbd%dp%d", > bdev->nbd_idx, > + partition); > + else > + ret = snprintf(path, 50, "/dev/nbd%d", > bdev->nbd_idx); > + if (ret < 0 || ret >= 50) { > + ERROR("Error setting up nbd device path"); > + return ret; > + } > + ret = mount_unknown_fs(path, bdev->dest, bdev->mntopts); > + if (ret < 0) > + ERROR("Error mounting %s", bdev->src); > + > + return ret; > +} > + > +static int nbd_create(struct bdev *bdev, const char *dest, const > char *n, > + struct bdev_specs *specs) > +{ > + return -ENOSYS; > +} > + > +static int nbd_clonepaths(struct bdev *orig, struct bdev *new, const > char *oldname, > + const char *cname, const char *oldpath, const char > *lxcpath, int snap, > + uint64_t newsize, struct lxc_conf *conf) > +{ > + return -ENOSYS; > +} > + > +static int nbd_destroy(struct bdev *orig) > +{ > + return -ENOSYS; > +} > + > +static int nbd_umount(struct bdev *bdev) > +{ > + int ret; > + > + if (strcmp(bdev->type, "nbd")) > + return -22; > + if (!bdev->src || !bdev->dest) > + return -22; > + ret = umount(bdev->dest); > + return ret; > +} > + > +static const struct bdev_ops nbd_ops = { > + .detect = &nbd_detect, > + .mount = &nbd_mount, > + .umount = &nbd_umount, > + .clone_paths = &nbd_clonepaths, > + .destroy = &nbd_destroy, > + .create = &nbd_create, > + .can_snapshot = true, > +}; > > static const 
struct bdev_type bdevs[] = { > {.name = "zfs", .ops = &zfs_ops,}, > @@ -2419,6 +2701,7 @@ static const struct bdev_type bdevs[] = { > {.name = "aufs", .ops = &aufs_ops,}, > {.name = "overlayfs", .ops = &overlayfs_ops,}, > {.name = "loop", .ops = &loop_ops,}, > + {.name = "nbd", .ops = &nbd_ops,}, > }; > > static const size_t numbdevs = sizeof(bdevs) / sizeof(struct > bdev_type); @@ -2454,7 +2737,7 @@ struct bdev *bdev_get(const char > *type) return bdev; > } > > -struct bdev *bdev_init(const char *src, const char *dst, const char > *mntopts) +struct bdev *bdev_init(struct lxc_conf *conf, const char > *src, const char *dst, const char *mntopts) { > int i; > struct bdev *bdev; > @@ -2480,6 +2763,8 @@ struct bdev *bdev_init(const char *src, const > char *dst, const char *mntopts) bdev->src = strdup(src); > if (dst) > bdev->dest = strdup(dst); > + if (strcmp(bdev->type, "nbd") == 0) > + bdev->nbd_idx = conf->nbd_idx; > > return bdev; > } > @@ -2538,9 +2823,9 @@ static int rsync_rootfs_wrapper(void *data) > return rsync_rootfs(arg); > } > > -bool bdev_is_dir(const char *path) > +bool bdev_is_dir(struct lxc_conf *conf, const char *path) > { > - struct bdev *orig = bdev_init(path, NULL, NULL); > + struct bdev *orig = bdev_init(conf, path, NULL, NULL); > bool ret = false; > if (!orig) > return ret; > @@ -2605,7 +2890,7 @@ struct bdev *bdev_copy(struct lxc_container > *c0, const char *cname, return NULL; > } > > - orig = bdev_init(src, NULL, NULL); > + orig = bdev_init(c0->lxc_conf, src, NULL, NULL); > if (!orig) { > ERROR("failed to detect blockdev type for %s", src); > return NULL; > diff --git a/src/lxc/bdev.h b/src/lxc/bdev.h > index cc0bf02..0893c11 100644 > --- a/src/lxc/bdev.h > +++ b/src/lxc/bdev.h > @@ -24,8 +24,7 @@ > #ifndef __LXC_BDEV_H > #define __LXC_BDEV_H > /* blockdev operations for: > - * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs > - * someday: qemu-nbd, qcow2, qed > + * aufs, dir, raw, btrfs, overlayfs, aufs, lvm, loop, zfs, nbd > (qcow2, 
raw, vdi, qed) */ > > #include "config.h" > @@ -83,11 +82,13 @@ struct bdev { > // turn the following into a union if need be > // lofd is the open fd for the mounted loopback file > int lofd; > + // index for the connected nbd device > + int nbd_idx; > }; > > char *overlay_getlower(char *p); > > -bool bdev_is_dir(const char *path); > +bool bdev_is_dir(struct lxc_conf *conf, const char *path); > > /* > * Instantiate a bdev object. The src is used to determine which > blockdev @@ -100,7 +101,8 @@ bool bdev_is_dir(const char *path); > * use /var/lib/lxc/canonical/rootfs as lower dir, > and /var/lib/lxc/c1/delta > * as the upper, writeable layer. > */ > -struct bdev *bdev_init(const char *src, const char *dst, const char > *data); +struct bdev *bdev_init(struct lxc_conf *conf, const char > *src, const char *dst, > + const char *data); > > struct bdev *bdev_copy(struct lxc_container *c0, const char *cname, > const char *lxcpath, const char *bdevtype, > @@ -110,6 +112,13 @@ struct bdev *bdev_create(const char *dest, const > char *type, const char *cname, struct bdev_specs *specs); > void bdev_put(struct bdev *bdev); > > +/* > + * these are really for qemu-nbd support, as container shutdown > + * must explicitly request device detach. 
> + */ > +bool attach_block_device(struct lxc_conf *conf); > +void detach_block_device(struct lxc_conf *conf); > + > /* define constants if the kernel/glibc headers don't define them */ > #ifndef MS_DIRSYNC > #define MS_DIRSYNC 128 > diff --git a/src/lxc/conf.c b/src/lxc/conf.c > index 78d9de2..7427a94 100644 > --- a/src/lxc/conf.c > +++ b/src/lxc/conf.c > @@ -1555,7 +1555,7 @@ static int setup_rootfs(struct lxc_conf *conf) > } > > // First try mounting rootfs using a bdev > - struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, > rootfs->options); > + struct bdev *bdev = bdev_init(conf, rootfs->path, > rootfs->mount, rootfs->options); if (bdev && bdev->ops->mount(bdev) > == 0) { bdev_put(bdev); > DEBUG("mounted '%s' on '%s'", rootfs->path, > rootfs->mount); @@ -2675,6 +2675,7 @@ struct lxc_conf > *lxc_conf_init(void) new->console.slave = -1; > new->console.name[0] = '\0'; > new->maincmd_fd = -1; > + new->nbd_idx = -1; > new->rootfs.mount = strdup(default_rootfs_mount); > if (!new->rootfs.mount) { > ERROR("lxc_conf_init : %m"); > diff --git a/src/lxc/conf.h b/src/lxc/conf.h > index 865b87a..3a81d0e 100644 > --- a/src/lxc/conf.h > +++ b/src/lxc/conf.h > @@ -334,6 +334,7 @@ struct lxc_conf { > int start_delay; > int start_order; > struct lxc_list groups; > + int nbd_idx; > }; > > int run_lxc_hooks(const char *name, char *hook, struct lxc_conf > *conf, diff --git a/src/lxc/lxccontainer.c b/src/lxc/lxccontainer.c > index 255fde5..fdac433 100644 > --- a/src/lxc/lxccontainer.c > +++ b/src/lxc/lxccontainer.c > @@ -82,13 +82,6 @@ return -1; > > lxc_log_define(lxc_container, lxc); > > -static bool file_exists(const char *f) > -{ > - struct stat statbuf; > - > - return stat(f, &statbuf) == 0; > -} > - > static bool config_file_exists(const char *lxcpath, const char > *cname) { > /* $lxcpath + '/' + $cname + '/config' + \0 */ > @@ -900,7 +893,7 @@ static bool create_run_template(struct > lxc_container *c, char *tpath, bool quiet if (strncmp(src, "aufs:", > 5) == 0) src 
= overlay_getlower(src+5); > > - bdev = bdev_init(src, c->lxc_conf->rootfs.mount, > NULL); > + bdev = bdev_init(c->lxc_conf, src, > c->lxc_conf->rootfs.mount, NULL); if (!bdev) { > ERROR("Error opening rootfs"); > exit(1); > @@ -1992,7 +1985,7 @@ static int do_bdev_destroy(struct lxc_conf > *conf) struct bdev *r; > int ret = 0; > > - r = bdev_init(conf->rootfs.path, conf->rootfs.mount, NULL); > + r = bdev_init(conf, conf->rootfs.path, conf->rootfs.mount, > NULL); if (!r) > return -1; > > @@ -2522,7 +2515,7 @@ static int clone_update_rootfs(struct > clone_update_data *data) > if (unshare(CLONE_NEWNS) < 0) > return -1; > - bdev = bdev_init(c->lxc_conf->rootfs.path, > c->lxc_conf->rootfs.mount, NULL); > + bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path, > c->lxc_conf->rootfs.mount, NULL); if (!bdev) > return -1; > if (strcmp(bdev->type, "dir") != 0) { > @@ -2787,7 +2780,7 @@ static bool lxcapi_rename(struct lxc_container > *c, const char *newname) if (!c || !c->name || !c->config_path > || !c->lxc_conf) return false; > > - bdev = bdev_init(c->lxc_conf->rootfs.path, > c->lxc_conf->rootfs.mount, NULL); > + bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path, > c->lxc_conf->rootfs.mount, NULL); if (!bdev) { > ERROR("Failed to find original backing store type"); > return false; > @@ -2880,7 +2873,7 @@ static int lxcapi_snapshot(struct lxc_container > *c, const char *commentfile) */ > flags = LXC_CLONE_SNAPSHOT | LXC_CLONE_KEEPMACADDR | > LXC_CLONE_KEEPNAME | LXC_CLONE_KEEPBDEVTYPE | > LXC_CLONE_MAYBE_SNAPSHOT; > - if (bdev_is_dir(c->lxc_conf->rootfs.path)) { > + if (bdev_is_dir(c->lxc_conf, c->lxc_conf->rootfs.path)) { > ERROR("Snapshot of directory-backed container > requested."); ERROR("Making a copy-clone. 
If you do want snapshots, > then"); ERROR("please create an aufs or overlayfs clone first, > snapshot that"); @@ -3082,7 +3075,7 @@ static bool > lxcapi_snapshot_restore(struct lxc_container *c, const char *snapnam > if (!c || !c->name || !c->config_path) return false; > > - bdev = bdev_init(c->lxc_conf->rootfs.path, > c->lxc_conf->rootfs.mount, NULL); > + bdev = bdev_init(c->lxc_conf, c->lxc_conf->rootfs.path, > c->lxc_conf->rootfs.mount, NULL); if (!bdev) { > ERROR("Failed to find original backing store type"); > return false; > diff --git a/src/lxc/start.c b/src/lxc/start.c > index df1304a..a7fb1d3 100644 > --- a/src/lxc/start.c > +++ b/src/lxc/start.c > @@ -69,6 +69,7 @@ > #include "namespace.h" > #include "lxcseccomp.h" > #include "caps.h" > +#include "bdev.h" > #include "lsm/lsm.h" > > lxc_log_define(lxc_start, lxc); > @@ -1054,10 +1055,15 @@ int __lxc_start(const char *name, struct > lxc_conf *conf, handler->conf->need_utmp_watch = 0; > } > > + if (!attach_block_device(handler->conf)) { > + ERROR("Failure attaching block device"); > + goto out_fini_nonet; > + } > + > err = lxc_spawn(handler); > if (err) { > ERROR("failed to spawn '%s'", name); > - goto out_fini_nonet; > + goto out_detach_blockdev; > } > > netnsfd = get_netns_fd(handler->pid); > @@ -1110,6 +1116,9 @@ int __lxc_start(const char *name, struct > lxc_conf *conf, out_fini: > lxc_delete_network(handler); > > +out_detach_blockdev: > + detach_block_device(handler->conf); > + > out_fini_nonet: > lxc_fini(name, handler); > return err; > diff --git a/src/lxc/utils.c b/src/lxc/utils.c > index efec414..b076ce7 100644 > --- a/src/lxc/utils.c > +++ b/src/lxc/utils.c > @@ -1306,3 +1306,10 @@ next_loop: > free(path); > return NULL; > } > + > +bool file_exists(const char *f) > +{ > + struct stat statbuf; > + > + return stat(f, &statbuf) == 0; > +} > diff --git a/src/lxc/utils.h b/src/lxc/utils.h > index b5e054c..9c618b7 100644 > --- a/src/lxc/utils.h > +++ b/src/lxc/utils.h > @@ -280,3 +280,4 @@ uint64_t 
fnv_64a_buf(void *buf, size_t len, > uint64_t hval); int detect_shared_rootfs(void); > int detect_ramfs_rootfs(void); > char *on_path(char *cmd); > +bool file_exists(const char *f); _______________________________________________ lxc-devel mailing list lxc-devel@lists.linuxcontainers.org http://lists.linuxcontainers.org/listinfo/lxc-devel