Currently, the farm driver has a fatal problem in object recovery: it
blocks I/O requests for a long time while moving stale objects to the
backend store.  Here is a test script that demonstrates it.

==
sheep -d /store/0 -z 0 -p 7000
sleep 2
collie cluster format -c 1 -b $1
collie vdi create test 100G -P
sheep -d /store/1 -z 1 -p 7001
sleep 2
time collie vdi list
==

The result is as follows:
==
$ ./farm.sh farm
using backend farm store
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
  test         1  100 GB  100 GB  0.0 MB 2012-08-23 12:16   7c2b25  

real    18m36.962s
user    0m0.108s
sys     0m0.000s
==

To fix this problem, we need to move stale objects in the worker
thread.  I've spent several days trying to fix it, but it seems to
need a lot of work, and it looks difficult to stabilize the change in a
short time.

That brings me to another approach: adding a lightweight storage driver
'simple_store' as a workaround.  (Perhaps we should use a different
name to distinguish it from the previous simple_store.)

------------------------------------------ >8
This introduces a storage driver 'simple_store' based on the current
storage interface.  The design of the new simple_store is similar to
that of the farm driver.  The main difference is that farm uses the
sha1-based backend store for stale objects, but simple_store uses a
flat directory for them.  With this design, simple_store can move
objects from the working directory to the backend store efficiently
with rename(2).

Here are the pros and cons of simple_store.

Pros:
 - faster recovery
 - smaller and simpler
 - would be a good example to introduce other storage drivers

Cons:
 - cluster snapshot is not supported
 - stale objects are not deduplicated
 - there is no sha1 verification

Signed-off-by: MORITA Kazutaka <morita.kazut...@lab.ntt.co.jp>
---

With simple_store, the above test result becomes as follows:

$ ./farm.sh simple
using backend simple store
  Name        Id    Size    Used  Shared    Creation time   VDI id  Tag
  test         1  100 GB  100 GB  0.0 MB 2012-08-23 11:29   7c2b25

real    0m1.016s
user    0m0.092s
sys     0m0.000s

For users who don't need the rich features of farm, simple_store would
work very well.


 sheep/Makefile.am    |    3 +-
 sheep/simple_store.c |  381 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 383 insertions(+), 1 deletions(-)
 create mode 100644 sheep/simple_store.c

diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index fe15d94..0ddc2b2 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -26,7 +26,8 @@ sbin_PROGRAMS         = sheep
 
 sheep_SOURCES          = sheep.c group.c request.c gateway.c store.c vdi.c work.c \
                          journal.c ops.c recovery.c cluster/local.c \
-                         object_cache.c object_list_cache.c sockfd_cache.c
+                         object_cache.c object_list_cache.c sockfd_cache.c \
+                         simple_store.c
 
 if BUILD_COROSYNC
 sheep_SOURCES          += cluster/corosync.c
diff --git a/sheep/simple_store.c b/sheep/simple_store.c
new file mode 100644
index 0000000..eddf305
--- /dev/null
+++ b/sheep/simple_store.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (C) 2012 Nippon Telegraph and Telephone Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#include "sheep_priv.h"
+
+static char stale_dir[PATH_MAX]; /* path of the stale-object directory; filled in by simple_store_init() */
+
+static int def_open_flags = O_DIRECT | O_DSYNC | O_RDWR; /* base open(2) flags; callers clear O_DIRECT for non-data objects */
+
+/*
+ * Build the working-directory path of object 'oid' into 'path'.
+ *
+ * 'path' must be a buffer of at least PATH_MAX bytes (every caller in this
+ * file passes a char[PATH_MAX]).  The output is bounded to PATH_MAX so an
+ * overlong obj_path is truncated instead of overflowing the buffer.
+ * obj_path and the object name are concatenated without a separator.
+ *
+ * Returns the snprintf() result, i.e. the length of the untruncated path.
+ */
+static int get_obj_path(uint64_t oid, char *path)
+{
+       return snprintf(path, PATH_MAX, "%s%016" PRIx64, obj_path, oid);
+}
+
+/*
+ * Build the path of the temporary file ("<oid>.tmp") used while object
+ * 'oid' is being written atomically.
+ *
+ * 'path' must be a buffer of at least PATH_MAX bytes (every caller in this
+ * file passes a char[PATH_MAX]).  The output is bounded to PATH_MAX so an
+ * overlong obj_path is truncated instead of overflowing the buffer.
+ *
+ * Returns the snprintf() result, i.e. the length of the untruncated path.
+ */
+static int get_tmp_obj_path(uint64_t oid, char *path)
+{
+       return snprintf(path, PATH_MAX, "%s%016"PRIx64".tmp", obj_path, oid);
+}
+
+/*
+ * Build the path of object 'oid' inside the stale directory.
+ *
+ * 'path' must be a buffer of at least PATH_MAX bytes (every caller in this
+ * file passes a char[PATH_MAX]).  The output is bounded to PATH_MAX so an
+ * overlong stale_dir is truncated instead of overflowing the buffer.
+ *
+ * Returns the snprintf() result, i.e. the length of the untruncated path.
+ */
+static int get_stale_obj_path(uint64_t oid, char *path)
+{
+       return snprintf(path, PATH_MAX, "%s/%016"PRIx64, stale_dir, oid);
+}
+
+/*
+ * Call 'func' once for every object file found directly in obj_path.
+ *
+ * An entry is treated as an object only when its whole name is a hexadecimal
+ * number other than 0 and ULLONG_MAX.  Names starting with '.' are skipped,
+ * and so are names with trailing characters after the hex digits (for
+ * example a leftover "<oid>.tmp" file from an interrupted atomic put), which
+ * must not be reported as the object itself.
+ *
+ * Iteration stops at the first callback that does not return SD_RES_SUCCESS.
+ *
+ * Returns SD_RES_EIO if obj_path cannot be opened, otherwise the result of
+ * the last callback (SD_RES_SUCCESS when no callback ran).
+ */
+static int for_each_objects(int (*func)(uint64_t oid))
+{
+       DIR *dir;
+       struct dirent *d;
+       char *end;
+       uint64_t oid;
+       int ret = SD_RES_SUCCESS;
+
+       dir = opendir(obj_path);
+       if (!dir) {
+               eprintf("failed to open %s: %m\n", obj_path);
+               return SD_RES_EIO;
+       }
+
+       while ((d = readdir(dir))) {
+               if (d->d_name[0] == '.')
+                       continue;
+
+               oid = strtoull(d->d_name, &end, 16);
+               /* reject names that are not entirely hex digits */
+               if (end == d->d_name || *end != '\0')
+                       continue;
+               if (oid == 0 || oid == ULLONG_MAX)
+                       continue;
+
+               ret = func(oid);
+               if (ret != SD_RES_SUCCESS)
+                       break;
+       }
+       closedir(dir);
+       return ret;
+}
+
+/*
+ * Report whether object 'oid' is present in the working directory and is
+ * both readable and writable.
+ *
+ * Returns 1 when access(2) succeeds and 0 otherwise; any failure other than
+ * ENOENT is logged.
+ */
+static int simple_store_exist(uint64_t oid)
+{
+       char obj[PATH_MAX];
+
+       get_obj_path(oid, obj);
+       if (access(obj, R_OK | W_OK) >= 0)
+               return 1;
+
+       if (errno != ENOENT)
+               eprintf("%m\n");
+       return 0;
+}
+
+/*
+ * Translate the errno value 'err' of a failed open() on object 'oid' into
+ * an SD_RES_* code.
+ *
+ * Returns SD_RES_EIO for any error other than ENOENT, SD_RES_EIO when the
+ * object is missing and obj_path itself cannot be stat()ed, and
+ * SD_RES_NO_OBJ when only the object is missing.
+ */
+static int err_to_sderr(uint64_t oid, int err)
+{
+       struct stat s;
+
+       if (err != ENOENT) {
+               /*
+                * Report the error the caller passed in; errno may have been
+                * overwritten since the failing call.
+                */
+               eprintf("%s\n", strerror(err));
+               return SD_RES_EIO;
+       }
+
+       /* ENOENT: tell a missing object from a missing object directory */
+       if (stat(obj_path, &s) < 0) {
+               eprintf("corrupted\n");
+               return SD_RES_EIO;
+       }
+
+       dprintf("object %016" PRIx64 " not found locally\n", oid);
+       return SD_RES_NO_OBJ;
+}
+
+/*
+ * Write iocb->length bytes from iocb->buf at iocb->offset into object 'oid'.
+ *
+ * Requests whose epoch is older than the system epoch are rejected with
+ * SD_RES_OLD_NODE_VER.  When 'create' is set the file is created/truncated
+ * and, unless SD_FLAG_CMD_COW is set, preallocated to the object size.
+ * O_DIRECT is dropped for non-data objects.
+ *
+ * Returns SD_RES_SUCCESS, the prealloc() error, the code from
+ * err_to_sderr() when open fails, or SD_RES_EIO on a short write.
+ */
+static int simple_store_write(uint64_t oid, struct siocb *iocb, int create)
+{
+       char path[PATH_MAX];
+       int open_flags = def_open_flags;
+       int result = SD_RES_SUCCESS;
+       int fd;
+       ssize_t written;
+
+       if (iocb->epoch < sys_epoch()) {
+               dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
+               return SD_RES_OLD_NODE_VER;
+       }
+
+       if (!is_data_obj(oid))
+               open_flags &= ~O_DIRECT;
+       if (create)
+               open_flags |= O_CREAT | O_TRUNC;
+
+       get_obj_path(oid, path);
+       fd = open(path, open_flags, def_fmode);
+       if (fd < 0)
+               return err_to_sderr(oid, errno);
+
+       if (create && !(iocb->flags & SD_FLAG_CMD_COW))
+               result = prealloc(fd, get_objsize(oid));
+
+       if (result == SD_RES_SUCCESS) {
+               written = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
+               if (written != iocb->length) {
+                       eprintf("%m\n");
+                       result = SD_RES_EIO;
+               }
+       }
+
+       close(fd);
+       return result;
+}
+
+/*
+ * Drop all stale objects: remove the stale directory recursively and
+ * create it again empty.  'iocb' is unused.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO if the directory cannot be
+ * re-created.
+ */
+static int simple_store_cleanup(struct siocb *iocb)
+{
+       int err;
+
+       rmdir_r(stale_dir);
+       err = mkdir(stale_dir, 0755);
+       if (err >= 0)
+               return SD_RES_SUCCESS;
+
+       eprintf("%m\n");
+       return SD_RES_EIO;
+}
+
+/*
+ * for_each_objects() callback used at init time: insert 'oid' into the
+ * object list cache and, for VDI objects, mark the VDI id as in use.
+ *
+ * Always returns SD_RES_SUCCESS.
+ */
+static int init_objlist_and_vdi_bitmap(uint64_t oid)
+{
+       objlist_cache_insert(oid);
+
+       if (!is_vdi_obj(oid))
+               return SD_RES_SUCCESS;
+
+       vprintf(SDOG_DEBUG, "found the VDI object %" PRIx64 "\n", oid);
+       set_bit(oid_to_vid(oid), sys->vdi_inuse);
+
+       return SD_RES_SUCCESS;
+}
+
+/*
+ * Initialize the driver: set stale_dir to "<p>/.stale", create that
+ * directory if it does not exist yet, then scan the existing objects to
+ * populate the object list cache and the VDI bitmap.
+ *
+ * Returns SD_RES_EIO if the stale directory cannot be created, otherwise
+ * the result of the object scan.
+ */
+static int simple_store_init(char *p)
+{
+       dprintf("use simple store driver\n");
+
+       snprintf(stale_dir, sizeof(stale_dir), "%s/.stale", p);
+
+       /* an already existing stale directory is not an error */
+       if (mkdir(stale_dir, 0755) < 0 && errno != EEXIST) {
+               eprintf("%m\n");
+               return SD_RES_EIO;
+       }
+
+       return for_each_objects(init_objlist_and_vdi_bitmap);
+}
+
+/*
+ * Read iocb->length bytes at iocb->offset from the file at 'path' into
+ * iocb->buf.  'oid' selects the open flags (O_DIRECT is dropped for
+ * non-data objects) and is used for error reporting.
+ *
+ * Returns SD_RES_SUCCESS, the code from err_to_sderr() when open fails,
+ * or SD_RES_EIO on a short read.
+ */
+static int simple_store_read_from_path(uint64_t oid, char *path,
+                                      struct siocb *iocb)
+{
+       int open_flags = def_open_flags;
+       int result = SD_RES_SUCCESS;
+       int fd;
+       ssize_t nread;
+
+       if (!is_data_obj(oid))
+               open_flags &= ~O_DIRECT;
+
+       fd = open(path, open_flags);
+       if (fd < 0)
+               return err_to_sderr(oid, errno);
+
+       nread = xpread(fd, iocb->buf, iocb->length, iocb->offset);
+       if (nread != iocb->length)
+               result = SD_RES_EIO;
+
+       close(fd);
+
+       return result;
+}
+
+/*
+ * Read object 'oid' as described by 'iocb'.
+ *
+ * The working directory is tried first.  If the object is not there and
+ * the request epoch is older than the system epoch, the stale directory
+ * is tried as well.
+ *
+ * Returns the result of the last read attempt.
+ */
+static int simple_store_read(uint64_t oid, struct siocb *iocb)
+{
+       char path[PATH_MAX];
+       int ret;
+
+       get_obj_path(oid, path);
+       ret = simple_store_read_from_path(oid, path, iocb);
+       if (ret != SD_RES_NO_OBJ || iocb->epoch >= sys_epoch())
+               return ret;
+
+       /* fall back to the stale directory */
+       get_stale_obj_path(oid, path);
+       return simple_store_read_from_path(oid, path, iocb);
+}
+
+/*
+ * Store the whole content of 'iocb' as object 'oid' atomically: the data
+ * is written to "<oid>.tmp" and then renamed over the final path.
+ *
+ * The temporary file is opened with O_TRUNC so that bytes of a leftover,
+ * longer temporary file cannot end up in the object, and it is removed
+ * again when the write or the rename fails.
+ *
+ * Returns SD_RES_SUCCESS or SD_RES_EIO.
+ */
+static int simple_store_atomic_put(uint64_t oid, struct siocb *iocb)
+{
+       char path[PATH_MAX], tmp_path[PATH_MAX];
+       int flags = def_open_flags | O_CREAT | O_TRUNC;
+       int ret = SD_RES_EIO, fd;
+       uint32_t len = iocb->length;
+       ssize_t size;
+
+       get_obj_path(oid, path);
+       get_tmp_obj_path(oid, tmp_path);
+
+       if (!is_data_obj(oid))
+               flags &= ~O_DIRECT;
+       fd = open(tmp_path, flags, def_fmode);
+       if (fd < 0) {
+               eprintf("failed to open %s: %m\n", tmp_path);
+               return SD_RES_EIO;
+       }
+
+       size = xwrite(fd, iocb->buf, len);
+       if (size < 0 || (uint32_t)size != len) {
+               eprintf("failed to write object. %m\n");
+               goto out;
+       }
+
+       if (rename(tmp_path, path) < 0) {
+               eprintf("failed to rename %s to %s: %m\n", tmp_path, path);
+               goto out;
+       }
+       dprintf("%"PRIx64"\n", oid);
+       ret = SD_RES_SUCCESS;
+out:
+       /* do not leave a partial temporary file behind on failure */
+       if (ret != SD_RES_SUCCESS)
+               unlink(tmp_path);
+       close(fd);
+       return ret;
+}
+
+/*
+ * Bring object 'oid' back from the stale directory into the working
+ * directory with rename(2).  'iocb' is unused and 'tgt_epoch' is only
+ * logged.
+ *
+ * Returns SD_RES_SUCCESS, or SD_RES_EIO if the rename fails.
+ */
+static int simple_store_link(uint64_t oid, struct siocb *iocb,
+                            uint32_t tgt_epoch)
+{
+       char path[PATH_MAX], stale_path[PATH_MAX];
+
+       dprintf("try link %"PRIx64" from snapshot with epoch %"PRIu32"\n",
+               oid, tgt_epoch);
+
+       get_obj_path(oid, path);
+       get_stale_obj_path(oid, stale_path);
+
+       if (rename(stale_path, path) < 0) {
+               eprintf("%m\n");
+               return SD_RES_EIO;
+       }
+
+       return SD_RES_SUCCESS;
+}
+
+/*
+ * Tell whether object 'oid' is stale on this node, i.e. none of the vnodes
+ * selected for it by oid_to_vnodes() under the current vnode info is local.
+ *
+ * Returns true when the object is stale, false otherwise.
+ */
+static bool oid_stale(uint64_t oid)
+{
+       struct sd_vnode *targets[SD_MAX_COPIES];
+       struct vnode_info *info = get_vnode_info();
+       int copies = get_nr_copies(info);
+       bool stale = true;
+       int idx;
+
+       oid_to_vnodes(info->vnodes, info->nr_vnodes, oid, copies, targets);
+
+       /* stop at the first local vnode */
+       for (idx = 0; stale && idx < copies; idx++) {
+               if (vnode_is_local(targets[idx]))
+                       stale = false;
+       }
+
+       put_vnode_info(info);
+       return stale;
+}
+
+/*
+ * for_each_objects() callback: if object 'oid' is stale, rename it from
+ * the working directory into the stale directory.
+ *
+ * Returns SD_RES_SUCCESS when the object is not stale or was moved, and
+ * SD_RES_EIO if the rename fails.
+ */
+static int move_object_to_stale_dir(uint64_t oid)
+{
+       char src[PATH_MAX], dst[PATH_MAX];
+
+       if (oid_stale(oid)) {
+               get_obj_path(oid, src);
+               get_stale_obj_path(oid, dst);
+
+               if (rename(src, dst) < 0) {
+                       eprintf("%s:%m\n", src);
+                       return SD_RES_EIO;
+               }
+
+               dprintf("moved object %"PRIx64"\n", oid);
+       }
+
+       return SD_RES_SUCCESS;
+}
+
+/*
+ * Called when recovery ends: move every stale object out of the working
+ * directory.  Nothing is done when 'old_epoch' is 0.  'old_vnode_info'
+ * is unused.
+ *
+ * Returns SD_RES_SUCCESS or the first error from the object scan.
+ */
+static int simple_store_end_recover(uint32_t old_epoch,
+                                   struct vnode_info *old_vnode_info)
+{
+       if (old_epoch != 0)
+               return for_each_objects(move_object_to_stale_dir);
+
+       return SD_RES_SUCCESS;
+}
+
+/*
+ * Format the store: remove obj_path recursively, create it again empty
+ * and record "simple" as the cluster store.  'iocb' is unused.
+ *
+ * rmdir_r() is treated as returning 0 or a negative errno value, so its
+ * result is kept in a signed int.  A missing obj_path (-ENOENT) is not an
+ * error.
+ *
+ * NOTE(review): if the stale directory lives under obj_path it is removed
+ * here and not re-created -- confirm that init runs again after format.
+ *
+ * Returns SD_RES_SUCCESS or SD_RES_EIO.
+ */
+static int simple_store_format(struct siocb *iocb)
+{
+       int ret;
+       const char name[] = "simple";
+
+       dprintf("try get a clean store\n");
+       ret = rmdir_r(obj_path);
+       if (ret && ret != -ENOENT) {
+               eprintf("failed to remove %s: %s\n", obj_path, strerror(-ret));
+               return SD_RES_EIO;
+       }
+       if (mkdir(obj_path, def_dmode) < 0) {
+               eprintf("%m\n");
+               return SD_RES_EIO;
+       }
+
+       if (set_cluster_store(name) < 0)
+               return SD_RES_EIO;
+
+       return SD_RES_SUCCESS;
+}
+
+/*
+ * Delete object 'oid' from the working directory.
+ *
+ * Returns SD_RES_SUCCESS, SD_RES_NO_OBJ if the file does not exist, or
+ * SD_RES_EIO on any other unlink(2) failure.
+ */
+static int simple_store_remove_object(uint64_t oid)
+{
+       char obj[PATH_MAX];
+
+       get_obj_path(oid, obj);
+       if (unlink(obj) >= 0)
+               return SD_RES_SUCCESS;
+
+       if (errno != ENOENT) {
+               eprintf("%m\n");
+               return SD_RES_EIO;
+       }
+
+       return SD_RES_NO_OBJ;
+}
+
+/*
+ * Remove every object found in the working directory.
+ *
+ * Returns the result of the object scan, which stops at the first removal
+ * that does not return SD_RES_SUCCESS.
+ */
+static int simple_store_purge_obj(void)
+{
+       int ret = for_each_objects(simple_store_remove_object);
+
+       return ret;
+}
+
+struct store_driver simple_store = { /* operations table of the "simple" store driver */
+       .name = "simple", /* same string simple_store_format() passes to set_cluster_store() */
+       .init = simple_store_init,
+       .exist = simple_store_exist,
+       .write = simple_store_write,
+       .read = simple_store_read,
+       .link = simple_store_link,
+       .atomic_put = simple_store_atomic_put,
+       .end_recover = simple_store_end_recover,
+       .cleanup = simple_store_cleanup,
+       .format = simple_store_format,
+       .remove_object = simple_store_remove_object,
+       .purge_obj = simple_store_purge_obj,
+};
+
+add_store_driver(simple_store); /* hands the table to add_store_driver(); presumably registers it -- confirm in sheep_priv.h */
-- 
1.7.2.5

-- 
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to