Currently, the farm driver has a fatal problem in object recovery: it blocks I/O requests for a long time while moving stale objects to the backend store. Here is a test script that demonstrates the problem.
==
sheep -d /store/0 -z 0 -p 7000
sleep 2
collie cluster format -c 1 -b $1
collie vdi create test 100G -P
sheep -d /store/1 -z 1 -p 7001
sleep 2
time collie vdi list
==

The result is as follows:

==
$ ./farm.sh farm
using backend farm store
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
  test         1  100 GB  100 GB  0.0 MB 2012-08-23 12:16   7c2b25

real 18m36.962s
user 0m0.108s
sys 0m0.000s
==

To fix this problem, we need to move stale objects in the worker thread. I've spent several days trying to fix it, but it seems to need a lot of work, and it looks difficult to stabilize the change in a short time.

That brings me to another approach: adding a lightweight storage driver 'simple_store' as a workaround. (Perhaps we should use a different name to distinguish it from the previous simple_store.)

------------------------------------------ >8

This introduces a storage driver 'simple_store' based on the current storage interface.

The design of the new simple_store is similar to that of the farm driver. The main difference is that farm uses the sha1-based backend store for stale objects, but simple_store uses a flat directory for them. With this design, simple_store can move objects from the working directory to the backend store efficiently with rename(2).

Here are the pros and cons of simple_store.

Pros:
 - faster recovery
 - smaller and simpler
 - would be a good example for introducing other storage drivers

Cons:
 - cluster snapshot is not supported
 - stale objects are not deduplicated
 - there is no sha1 verification

Signed-off-by: MORITA Kazutaka <morita.kazut...@lab.ntt.co.jp>
---

With simple_store, the result of the above test becomes as follows:

$ ./farm.sh simple
using backend simple store
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
  test         1  100 GB  100 GB  0.0 MB 2012-08-23 11:29   7c2b25

real 0m1.016s
user 0m0.092s
sys 0m0.000s

For users who don't need the rich features of farm, simple_store would work very well.

sheep/Makefile.am | 3 +- sheep/simple_store.c | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 383 insertions(+), 1 deletions(-) create mode 100644 sheep/simple_store.c diff --git a/sheep/Makefile.am b/sheep/Makefile.am index fe15d94..0ddc2b2 100644 --- a/sheep/Makefile.am +++ b/sheep/Makefile.am @@ -26,7 +26,8 @@ sbin_PROGRAMS = sheep sheep_SOURCES = sheep.c group.c request.c gateway.c store.c vdi.c work.c \ journal.c ops.c recovery.c cluster/local.c \ - object_cache.c object_list_cache.c sockfd_cache.c + object_cache.c object_list_cache.c sockfd_cache.c \ + simple_store.c if BUILD_COROSYNC sheep_SOURCES += cluster/corosync.c diff --git a/sheep/simple_store.c b/sheep/simple_store.c new file mode 100644 index 0000000..eddf305 --- /dev/null +++ b/sheep/simple_store.c @@ -0,0 +1,381 @@ +/* + * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <dirent.h> + +#include "sheep_priv.h" + +static char stale_dir[PATH_MAX]; + +static int def_open_flags = O_DIRECT | O_DSYNC | O_RDWR; + +static int get_obj_path(uint64_t oid, char *path) +{ + return sprintf(path, "%s%016" PRIx64, obj_path, oid); +} + +static int get_tmp_obj_path(uint64_t oid, char *path) +{ + return sprintf(path, "%s%016"PRIx64".tmp", obj_path, oid); +} + +static int get_stale_obj_path(uint64_t oid, char *path) +{ + return sprintf(path, "%s/%016"PRIx64, stale_dir, oid); +} + +static int for_each_objects(int (*func)(uint64_t oid)) +{ + DIR *dir; + struct dirent *d; + uint64_t oid; + int ret = SD_RES_SUCCESS; + + dir = opendir(obj_path); + if (!dir) + return SD_RES_EIO; + + while ((d = readdir(dir))) { + if (!strncmp(d->d_name, ".", 1)) + continue; + + oid = strtoull(d->d_name, NULL, 16); + if (oid == 0 || oid == ULLONG_MAX) + continue; + + ret = func(oid); + if (ret != SD_RES_SUCCESS) + break; + } + closedir(dir); + return ret; +} + +static int simple_store_exist(uint64_t oid) +{ + char path[PATH_MAX]; + + get_obj_path(oid, path); + if (access(path, R_OK | W_OK) < 0) { + if (errno != ENOENT) + eprintf("%m\n"); + return 0; + } + + return 1; +} + +static int err_to_sderr(uint64_t oid, int err) +{ + struct stat s; + + if (err != ENOENT) { + eprintf("%m\n"); + return SD_RES_EIO; + } + + if (stat(obj_path, &s) < 0) { + eprintf("corrupted\n"); + return SD_RES_EIO; + } + + dprintf("object %016" PRIx64 " not found locally\n", oid); + return SD_RES_NO_OBJ; +} + +static int simple_store_write(uint64_t oid, struct siocb *iocb, int create) +{ + int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; + char path[PATH_MAX]; + ssize_t size; + + if (iocb->epoch < sys_epoch()) { + dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch()); + return SD_RES_OLD_NODE_VER; + } + if (!is_data_obj(oid)) + flags &= ~O_DIRECT; + + if (create) + flags |= O_CREAT | O_TRUNC; + + get_obj_path(oid, 
path); + fd = open(path, flags, def_fmode); + if (fd < 0) + return err_to_sderr(oid, errno); + + if (create && !(iocb->flags & SD_FLAG_CMD_COW)) { + ret = prealloc(fd, get_objsize(oid)); + if (ret != SD_RES_SUCCESS) + goto out; + } + size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset); + if (size != iocb->length) { + eprintf("%m\n"); + ret = SD_RES_EIO; + goto out; + } +out: + close(fd); + return ret; +} + +static int simple_store_cleanup(struct siocb *iocb) +{ + rmdir_r(stale_dir); + if (mkdir(stale_dir, 0755) < 0) { + eprintf("%m\n"); + return SD_RES_EIO; + } + + return SD_RES_SUCCESS; +} + +static int init_objlist_and_vdi_bitmap(uint64_t oid) +{ + objlist_cache_insert(oid); + + if (is_vdi_obj(oid)) { + vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid); + + set_bit(oid_to_vid(oid), sys->vdi_inuse); + } + return SD_RES_SUCCESS; +} + +static int simple_store_init(char *p) +{ + dprintf("use simple store driver\n"); + + /* create a stale directory */ + snprintf(stale_dir, sizeof(stale_dir), "%s/.stale", p); + if (mkdir(stale_dir, 0755) < 0) { + if (errno != EEXIST) { + eprintf("%m\n"); + return SD_RES_EIO; + } + } + + return for_each_objects(init_objlist_and_vdi_bitmap); +} + +static int simple_store_read_from_path(uint64_t oid, char *path, + struct siocb *iocb) +{ + int flags = def_open_flags, fd, ret = SD_RES_SUCCESS; + ssize_t size; + + if (!is_data_obj(oid)) + flags &= ~O_DIRECT; + + fd = open(path, flags); + + if (fd < 0) + return err_to_sderr(oid, errno); + + size = xpread(fd, iocb->buf, iocb->length, iocb->offset); + if (size != iocb->length) { + ret = SD_RES_EIO; + goto out; + } +out: + close(fd); + + return ret; +} + +static int simple_store_read(uint64_t oid, struct siocb *iocb) +{ + int ret; + char path[PATH_MAX]; + + get_obj_path(oid, path); + ret = simple_store_read_from_path(oid, path, iocb); + + if (ret == SD_RES_NO_OBJ && iocb->epoch < sys_epoch()) { + /* try to read from the stale directory */ + get_stale_obj_path(oid, path); + ret 
= simple_store_read_from_path(oid, path, iocb); + } + + return ret; +} + +static int simple_store_atomic_put(uint64_t oid, struct siocb *iocb) +{ + char path[PATH_MAX], tmp_path[PATH_MAX]; + int flags = def_open_flags | O_CREAT; + int ret = SD_RES_EIO, fd; + uint32_t len = iocb->length; + + get_obj_path(oid, path); + get_tmp_obj_path(oid, tmp_path); + + if (!is_data_obj(oid)) + flags &= ~O_DIRECT; + fd = open(tmp_path, flags, def_fmode); + if (fd < 0) { + eprintf("failed to open %s: %m\n", tmp_path); + return SD_RES_EIO; + } + + ret = xwrite(fd, iocb->buf, len); + if (ret != len) { + eprintf("failed to write object. %m\n"); + ret = SD_RES_EIO; + goto out; + } + + ret = rename(tmp_path, path); + if (ret < 0) { + eprintf("failed to rename %s to %s: %m\n", tmp_path, path); + ret = SD_RES_EIO; + goto out; + } + dprintf("%"PRIx64"\n", oid); + ret = SD_RES_SUCCESS; +out: + close(fd); + return ret; +} + +static int simple_store_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch) +{ + char path[PATH_MAX], stale_path[PATH_MAX]; + + dprintf("try link %"PRIx64" from snapshot with epoch %d\n", oid, tgt_epoch); + + get_obj_path(oid, path); + get_stale_obj_path(oid, stale_path); + + if (rename(stale_path, path) < 0) { + eprintf("%m\n"); + return SD_RES_EIO; + } + + return SD_RES_SUCCESS; +} + +static bool oid_stale(uint64_t oid) +{ + int i, nr_copies; + struct vnode_info *vinfo; + struct sd_vnode *v; + bool ret = true; + struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; + + vinfo = get_vnode_info(); + nr_copies = get_nr_copies(vinfo); + + oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid, + nr_copies, obj_vnodes); + for (i = 0; i < nr_copies; i++) { + v = obj_vnodes[i]; + if (vnode_is_local(v)) { + ret = false; + break; + } + } + + put_vnode_info(vinfo); + return ret; +} + +static int move_object_to_stale_dir(uint64_t oid) +{ + char path[PATH_MAX], stale_path[PATH_MAX]; + + if (!oid_stale(oid)) + return SD_RES_SUCCESS; + + get_obj_path(oid, path); + get_stale_obj_path(oid, 
stale_path); + + if (rename(path, stale_path) < 0) { + eprintf("%s:%m\n", path); + return SD_RES_EIO; + } + + dprintf("moved object %"PRIx64"\n", oid); + return SD_RES_SUCCESS; +} + +static int simple_store_end_recover(uint32_t old_epoch, + struct vnode_info *old_vnode_info) +{ + if (old_epoch == 0) + return SD_RES_SUCCESS; + + return for_each_objects(move_object_to_stale_dir); +} + +static int simple_store_format(struct siocb *iocb) +{ + unsigned ret; + const char name[] = "simple"; + + dprintf("try get a clean store\n"); + ret = rmdir_r(obj_path); + if (ret && ret != -ENOENT) { + eprintf("failed to remove %s: %s\n", obj_path, strerror(-ret)); + return SD_RES_EIO; + } + if (mkdir(obj_path, def_dmode) < 0) { + eprintf("%m\n"); + return SD_RES_EIO; + } + + if (set_cluster_store(name) < 0) + return SD_RES_EIO; + + return SD_RES_SUCCESS; +} + +static int simple_store_remove_object(uint64_t oid) +{ + char path[PATH_MAX]; + + get_obj_path(oid, path); + + if (unlink(path) < 0) { + if (errno == ENOENT) + return SD_RES_NO_OBJ; + + eprintf("%m\n"); + return SD_RES_EIO; + } + + return SD_RES_SUCCESS; +} + +static int simple_store_purge_obj(void) +{ + return for_each_objects(simple_store_remove_object); +} + +struct store_driver simple_store = { + .name = "simple", + .init = simple_store_init, + .exist = simple_store_exist, + .write = simple_store_write, + .read = simple_store_read, + .link = simple_store_link, + .atomic_put = simple_store_atomic_put, + .end_recover = simple_store_end_recover, + .cleanup = simple_store_cleanup, + .format = simple_store_format, + .remove_object = simple_store_remove_object, + .purge_obj = simple_store_purge_obj, +}; + +add_store_driver(simple_store); -- 1.7.2.5 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog