Signed-off-by: Liu Yuan <namei.u...@gmail.com>
---
 dog/dog.c                |    6 +-
 dog/dog.h                |    4 +-
 dog/vdi.c                |   24 +++---
 include/internal_proto.h |    1 -
 include/sheep.h          |  201 +++++++++++++++-------------------------------
 sheep/gateway.c          |   10 +--
 sheep/group.c            |   12 ++-
 sheep/plain_store.c      |    5 +-
 sheep/recovery.c         |    9 +--
 sheep/request.c          |    8 +-
 10 files changed, 105 insertions(+), 175 deletions(-)

diff --git a/dog/dog.c b/dog/dog.c
index 9ec41c1..0009c92 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -49,8 +49,8 @@ static void usage(const struct command *commands, int status);
 uint32_t sd_epoch;
 
 struct sd_node sd_nodes[SD_MAX_NODES];
-struct sd_vnode sd_vnodes[SD_MAX_VNODES];
-int sd_nodes_nr, sd_vnodes_nr;
+int sd_nodes_nr;
+struct rb_root sd_vroot = RB_ROOT;
 
 int update_node_list(int max_nodes)
 {
@@ -92,7 +92,7 @@ int update_node_list(int max_nodes)
        }
 
        memcpy(sd_nodes, buf, size);
-       sd_vnodes_nr = nodes_to_vnodes(sd_nodes, sd_nodes_nr, sd_vnodes);
+       nodes_to_vnodes(sd_nodes, sd_nodes_nr, &sd_vroot);
        sd_epoch = hdr.epoch;
 out:
        if (buf)
diff --git a/dog/dog.h b/dog/dog.h
index e1c499e..9b17eb7 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -56,8 +56,8 @@ extern bool verbose;
 
 extern uint32_t sd_epoch;
 extern struct sd_node sd_nodes[SD_MAX_NODES];
-extern struct sd_vnode sd_vnodes[SD_MAX_VNODES];
-extern int sd_nodes_nr, sd_vnodes_nr;
+extern struct rb_root sd_vroot;
+extern int sd_nodes_nr;
 
 bool is_current(const struct sd_inode *i);
 char *strnumber(uint64_t _size);
diff --git a/dog/vdi.c b/dog/vdi.c
index 82821ac..b4e8821 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -904,14 +904,12 @@ static int do_track_object(uint64_t oid, uint8_t 
nr_copies)
        int i, j, ret;
        struct sd_req hdr;
        struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-       struct sd_vnode *vnodes;
-       const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
+       struct sd_vnode *vnode_buf[SD_MAX_COPIES];
        struct epoch_log *logs;
-       int vnodes_nr, nr_logs, log_length;
+       int nr_logs, log_length;
 
        log_length = sd_epoch * sizeof(struct epoch_log);
        logs = xmalloc(log_length);
-       vnodes = xmalloc(sizeof(*vnodes) * SD_MAX_VNODES);
 
        sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
        hdr.data_length = log_length;
@@ -927,6 +925,9 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 
        nr_logs = rsp->data_length / sizeof(struct epoch_log);
        for (i = nr_logs - 1; i >= 0; i--) {
+               struct rb_root vroot;
+               struct sd_vnode *v;
+
                printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
                       oid, logs[i].epoch, nr_copies);
                printf("---------------------------------------------------\n");
@@ -943,22 +944,23 @@ static int do_track_object(uint64_t oid, uint8_t 
nr_copies)
                        }
                        continue;
                }
-               vnodes_nr = nodes_to_vnodes(logs[i].nodes,
-                                           logs[i].nr_nodes, vnodes);
-               oid_to_vnodes(vnodes, vnodes_nr, oid, nr_copies, vnode_buf);
+               nodes_to_vnodes(logs[i].nodes, logs[i].nr_nodes, &vroot);
+               oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf);
                for (j = 0; j < nr_copies; j++) {
                        const struct node_id *n = &vnode_buf[j]->node->nid;
 
                        printf("%s\n", addr_to_str(n->addr, n->port));
                }
+               rb_for_each_entry(v, &vroot, rb) {
+                       rb_erase(&v->rb, &vroot);
+                       free(v);
+               }
        }
 
        free(logs);
-       free(vnodes);
        return EXIT_SUCCESS;
 error:
        free(logs);
-       free(vnodes);
        return EXIT_SYSFAIL;
 }
 
@@ -1531,7 +1533,7 @@ static void queue_vdi_check_work(const struct sd_inode 
*inode, uint64_t oid,
                                 uint64_t *done, struct work_queue *wq)
 {
        struct vdi_check_info *info;
-       const struct sd_vnode *tgt_vnodes[SD_MAX_COPIES];
+       struct sd_vnode *tgt_vnodes[SD_MAX_COPIES];
        int nr_copies = inode->nr_copies;
 
        info = xzalloc(sizeof(*info) + sizeof(info->vcw[0]) * nr_copies);
@@ -1541,7 +1543,7 @@ static void queue_vdi_check_work(const struct sd_inode 
*inode, uint64_t oid,
        info->done = done;
        info->wq = wq;
 
-       oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes);
+       oid_to_vnodes(oid, &sd_vroot, nr_copies, tgt_vnodes);
        for (int i = 0; i < nr_copies; i++) {
                info->vcw[i].info = info;
                info->vcw[i].vnode = tgt_vnodes[i];
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 8544c75..7d1df2c 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -29,7 +29,6 @@
 
 #define SD_MAX_NODES 1024
 #define SD_DEFAULT_VNODES 128
-#define SD_MAX_VNODES (SD_MAX_NODES * SD_DEFAULT_VNODES)
 
 /*
  * Operations with opcodes above 0x80 are considered part of the inter-sheep
diff --git a/include/sheep.h b/include/sheep.h
index bbb7721..0abd053 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2012-2013 Taobao Inc.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License version
@@ -17,19 +18,18 @@
 #include "bitops.h"
 #include "list.h"
 #include "net.h"
+#include "rbtree.h"
 
 struct sd_vnode {
+       struct rb_node rb;
        const struct sd_node *node;
-       uint64_t        id;
+       uint64_t hash;
 };
 
 struct vnode_info {
-       struct sd_vnode vnodes[SD_MAX_VNODES];
-       int nr_vnodes;
-
+       struct rb_root vroot;
        struct sd_node nodes[SD_MAX_NODES];
        int nr_nodes;
-
        int nr_zones;
        refcnt_t refcnt;
 };
@@ -56,133 +56,77 @@ static inline void sd_init_req(struct sd_req *req, uint8_t 
opcode)
        req->proto_ver = opcode < 0x80 ? SD_PROTO_VER : SD_SHEEP_PROTO_VER;
 }
 
-static inline int same_zone(const struct sd_vnode *e, int n1, int n2)
+static inline int same_zone(struct sd_vnode *v1, struct sd_vnode *v2)
 {
-       return e[n1].node->zone == e[n2].node->zone;
+       return v1->node->zone == v2->node->zone;
 }
 
-/* Get the first vnode's index which is matching the OID */
-static inline int get_vnode_first_idx(const struct sd_vnode *entries,
-                                     int nr_entries, uint64_t oid)
+static inline int vnode_cmp(const struct sd_vnode *node1,
+                           const struct sd_vnode *node2)
 {
-       uint64_t id = sd_hash_oid(oid);
-       int start, end, pos;
-
-       assert(nr_entries > 0);
-
-       start = 0;
-       end = nr_entries - 1;
-
-       if (id > entries[end].id || id < entries[start].id)
-               return (end + 1) % nr_entries;
-
-       for (;;) {
-               pos = (end - start) / 2 + start;
-               if (entries[pos].id < id) {
-                       if (entries[pos + 1].id >= id)
-                               return (pos + 1) % nr_entries;
-                       start = pos;
-               } else
-                       end = pos;
-       }
+       return intcmp(node1->hash, node2->hash);
 }
 
-/* Get next vnode's index according to the PREV_IDXS */
-static inline int get_vnode_next_idx(const struct sd_vnode *entries,
-                                    int nr_entries, int *prev_idxs,
-                                    int nr_prev_idxs)
+/* If v1_hash < oid_hash <= v2_hash, then oid is resident on v2 */
+static inline struct sd_vnode *
+oid_to_first_vnode(uint64_t oid, struct rb_root *root)
 {
-       int i, idx, first_idx;
-       bool found;
-
-       first_idx = prev_idxs[0];
-       idx = prev_idxs[nr_prev_idxs - 1];
-       for (;;) {
-               idx = (idx + 1) % nr_entries;
-               if (unlikely(idx == first_idx))
-                       panic("can't find next new idx");
-
-               for (found = false, i = 0; i < nr_prev_idxs; i++) {
-                       if (same_zone(entries, idx, prev_idxs[i])) {
-                               found = true;
-                               break;
-                       }
-               }
-
-               if (!found)
-                       return idx;
-       }
+       struct sd_vnode dummy = {
+               .hash = sd_hash_oid(oid),
+       };
+       return rb_nsearch(root, &dummy, rb, vnode_cmp);
 }
 
-/* Get the n'th vnode's index which is matching the OID */
-static inline int get_vnode_nth_idx(const struct sd_vnode *entries,
-                       int nr_entries, uint64_t oid, int nth)
+/* Replica are placed along the ring one by one with different zones */
+static inline void oid_to_vnodes(uint64_t oid, struct rb_root *root,
+                                int nr_copies,
+                                struct sd_vnode **vnodes)
 {
-       int nr_idxs = 0, idxs[SD_MAX_COPIES];
-
-       idxs[nr_idxs++] = get_vnode_first_idx(entries, nr_entries, oid);
-
-       if (!nth)
-               return idxs[nth];
-
-       while (nr_idxs <= nth) {
-               idxs[nr_idxs] = get_vnode_next_idx(entries, nr_entries,
-                                                    idxs, nr_idxs);
-               nr_idxs++;
+       struct sd_vnode *next = oid_to_first_vnode(oid, root);
+
+       vnodes[0] = next;
+       for (int i = 1; i < nr_copies; i++) {
+next:
+               next = rb_entry(rb_next(&next->rb), struct sd_vnode, rb);
+               if (!next) /* Wrap around */
+                       next = rb_entry(rb_first(root), struct sd_vnode, rb);
+               if (unlikely(next == vnodes[0]))
+                       panic("can't find a valid vnode");
+               for (int j = 0; j < i; j++)
+                       if (same_zone(vnodes[j], next))
+                               goto next;
+               vnodes[i] = next;
        }
-
-       return idxs[nth];
 }
 
-static inline const struct sd_vnode *oid_to_vnode(const struct sd_vnode 
*entries,
-                                                 int nr_entries, uint64_t oid,
-                                                 int copy_idx)
+static inline const struct sd_vnode *
+oid_to_vnode(uint64_t oid, struct rb_root *root, int copy_idx)
 {
-       int idx = get_vnode_nth_idx(entries, nr_entries, oid, copy_idx);
+       struct sd_vnode *vnodes[SD_MAX_COPIES];
 
-       return &entries[idx];
+       oid_to_vnodes(oid, root, copy_idx + 1, vnodes);
+
+       return vnodes[copy_idx];
 }
 
-static inline const struct sd_node *oid_to_node(const struct sd_vnode *entries,
-                                               int nr_entries, uint64_t oid,
-                                               int copy_idx)
+static inline const struct sd_node *
+oid_to_node(uint64_t oid, struct rb_root *root, int copy_idx)
 {
        const struct sd_vnode *vnode;
 
-       vnode = oid_to_vnode(entries, nr_entries, oid, copy_idx);
+       vnode = oid_to_vnode(oid, root, copy_idx);
 
        return vnode->node;
 }
 
-static inline void oid_to_vnodes(const struct sd_vnode *entries, int 
nr_entries,
-                                uint64_t oid, int nr_copies,
-                                const struct sd_vnode **vnodes)
-{
-       int idx, idxs[SD_MAX_COPIES], i;
-
-       if (nr_entries == 0)
-               return;
-
-       idx = get_vnode_first_idx(entries, nr_entries, oid);
-       idxs[0] = idx;
-       vnodes[0] = &entries[idx];
-
-       for (i = 1; i < nr_copies; i++) {
-               idx = get_vnode_next_idx(entries, nr_entries, idxs, i);
-               idxs[i] = idx;
-               vnodes[i] = &entries[idx];
-       }
-}
-
-static inline void oid_to_nodes(const struct sd_vnode *entries, int nr_entries,
-                               uint64_t oid, int nr_copies,
+static inline void oid_to_nodes(uint64_t oid, struct rb_root *root,
+                               int nr_copies,
                                const struct sd_node **nodes)
 {
-       int i;
-       const struct sd_vnode *vnodes[SD_MAX_COPIES];
+       struct sd_vnode *vnodes[SD_MAX_COPIES];
 
-       oid_to_vnodes(entries, nr_entries, oid, nr_copies, vnodes);
-       for (i = 0; i < nr_copies; i++)
+       oid_to_vnodes(oid, root, nr_copies, vnodes);
+       for (int i = 0; i < nr_copies; i++)
                nodes[i] = vnodes[i]->node;
 }
 
@@ -273,39 +217,24 @@ static inline bool node_eq(const struct sd_node *a, const 
struct sd_node *b)
        return node_cmp(a, b) == 0;
 }
 
-static inline int vnode_cmp(const struct sd_vnode *node1,
-                           const struct sd_vnode *node2)
+static inline void
+nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, struct rb_root 
*root)
 {
-       return intcmp(node1->id, node2->id);
-}
-
-static inline int node_to_vnodes(const struct sd_node *n,
-                                struct sd_vnode *vnodes)
-{
-       int nr = 0;
-       uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr));
-
-       for (int i = 0; i < n->nr_vnodes; i++) {
-               hval = sd_hash_next(hval);
-               vnodes[nr].id = hval;
-               vnodes[nr].node = n;
-               nr++;
+       for (int j = 0; j < nr_nodes; j++) {
+               const struct sd_node *n = nodes + j;
+               uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid),
+                                                         io_addr));
+
+               for (int i = 0; i < n->nr_vnodes; i++) {
+                       struct sd_vnode *v = xmalloc(sizeof(*v));
+
+                       hval = sd_hash_next(hval);
+                       v->hash = hval;
+                       v->node = n;
+                       if (unlikely(rb_insert(root, v, rb, vnode_cmp)))
+                               panic("vdisk hash collison");
+               }
        }
-
-       return nr;
-}
-
-static inline int nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes,
-                                 struct sd_vnode *vnodes)
-{
-       int nr_vnodes = 0;
-
-       for (int i = 0; i < nr_nodes; i++)
-               nr_vnodes += node_to_vnodes(nodes + i, vnodes + nr_vnodes);
-
-       xqsort(vnodes, nr_vnodes, vnode_cmp);
-
-       return nr_vnodes;
 }
 
 #define MAX_NODE_STR_LEN 256
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 33f04e0..475b1bc 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -31,7 +31,7 @@ int gateway_read_obj(struct request *req)
        struct sd_req fwd_hdr;
        struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr;
        const struct sd_vnode *v;
-       const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+       struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
        uint64_t oid = req->rq.obj.oid;
        int nr_copies, j;
 
@@ -43,8 +43,7 @@ int gateway_read_obj(struct request *req)
 
        nr_copies = get_req_copy_number(req);
 
-       oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
-                     nr_copies, obj_vnodes);
+       oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes);
        for (i = 0; i < nr_copies; i++) {
                v = obj_vnodes[i];
                if (!vnode_is_local(v))
@@ -239,11 +238,10 @@ static int init_target_nodes(struct request *req, 
uint64_t oid,
                             const struct sd_node **target_nodes)
 {
        int nr_to_send;
-       const struct vnode_info *vinfo = req->vinfo;
+       struct vnode_info *vinfo = req->vinfo;
 
        nr_to_send = get_req_copy_number(req);
-       oid_to_nodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_to_send,
-                    target_nodes);
+       oid_to_nodes(oid, &vinfo->vroot, nr_to_send, target_nodes);
 
        return nr_to_send;
 }
diff --git a/sheep/group.c b/sheep/group.c
index e2ee266..378d084 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -101,8 +101,14 @@ main_fn struct vnode_info *get_vnode_info(void)
 void put_vnode_info(struct vnode_info *vnode_info)
 {
        if (vnode_info) {
-               if (refcount_dec(&vnode_info->refcnt) == 0)
+               if (refcount_dec(&vnode_info->refcnt) == 0) {
+                       struct sd_vnode *v;
+                       rb_for_each_entry(v, &vnode_info->vroot, rb) {
+                               rb_erase(&v->rb, &vnode_info->vroot);
+                               free(v);
+                       }
                        free(vnode_info);
+               }
        }
 }
 
@@ -139,14 +145,14 @@ struct vnode_info *alloc_vnode_info(const struct sd_node 
*nodes,
 
        vnode_info = xzalloc(sizeof(*vnode_info));
 
+       INIT_RB_ROOT(&vnode_info->vroot);
        vnode_info->nr_nodes = nr_nodes;
        memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes);
        xqsort(vnode_info->nodes, nr_nodes, node_cmp);
 
        recalculate_vnodes(vnode_info->nodes, nr_nodes);
 
-       vnode_info->nr_vnodes = nodes_to_vnodes(vnode_info->nodes, nr_nodes,
-                                               vnode_info->vnodes);
+       nodes_to_vnodes(vnode_info->nodes, nr_nodes, &vnode_info->vroot);
        vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes);
        refcount_set(&vnode_info->refcnt, 1);
        return vnode_info;
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 7cba585..68fa4e3 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -392,13 +392,12 @@ static bool oid_stale(uint64_t oid)
        struct vnode_info *vinfo;
        const struct sd_vnode *v;
        bool ret = true;
-       const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+       struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
 
        vinfo = get_vnode_info();
        nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
 
-       oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid,
-                     nr_copies, obj_vnodes);
+       oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
        for (i = 0; i < nr_copies; i++) {
                v = obj_vnodes[i];
                if (vnode_is_local(v)) {
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 6a02adf..758e4b5 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -184,7 +184,7 @@ static int recover_object_from_replica(struct 
recovery_obj_work *row,
        for (int i = 0; i < nr_copies; i++) {
                const struct sd_vnode *vnode;
 
-               vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
+               vnode = oid_to_vnode(oid, &old->vroot, i);
 
                if (vnode_is_local(vnode)) {
                        start = i;
@@ -197,7 +197,7 @@ static int recover_object_from_replica(struct 
recovery_obj_work *row,
                const struct sd_node *node;
                int idx = (i + start) % nr_copies;
 
-               node = oid_to_node(old->vnodes, old->nr_vnodes, oid, idx);
+               node = oid_to_node(oid, &old->vroot, idx);
 
                if (invalid_node(node, row->base.cur_vinfo))
                        continue;
@@ -712,7 +712,7 @@ static void screen_object_list(struct recovery_list_work 
*rlw,
                               uint64_t *oids, size_t nr_oids)
 {
        struct recovery_work *rw = &rlw->base;
-       const struct sd_vnode *vnodes[SD_MAX_COPIES];
+       struct sd_vnode *vnodes[SD_MAX_COPIES];
        uint64_t old_count = rlw->count;
        uint64_t nr_objs;
        uint64_t i, j;
@@ -724,8 +724,7 @@ static void screen_object_list(struct recovery_list_work 
*rlw,
 
                nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
 
-               oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
-                             oids[i], nr_objs, vnodes);
+               oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
                for (j = 0; j < nr_objs; j++) {
                        if (!vnode_is_local(vnodes[j]))
                                continue;
diff --git a/sheep/request.c b/sheep/request.c
index 5bd917b..161365e 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -23,14 +23,12 @@ static void del_requeue_request(struct request *req)
 
 static bool is_access_local(struct request *req, uint64_t oid)
 {
-       const struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
+       struct sd_vnode *obj_vnodes[SD_MAX_COPIES];
        int nr_copies;
        int i;
 
        nr_copies = get_req_copy_number(req);
-       oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
-                     nr_copies, obj_vnodes);
-
+       oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes);
        for (i = 0; i < nr_copies; i++) {
                if (vnode_is_local(obj_vnodes[i]))
                        return true;
@@ -310,7 +308,7 @@ static void queue_gateway_request(struct request *req)
                        return;
 
 queue_work:
-       if (req->vinfo->nr_vnodes == 0) {
+       if (RB_EMPTY_ROOT(&req->vinfo->vroot)) {
                sd_err("there is no living nodes");
                req->rp.result = SD_RES_HALT;
                put_request(req);
-- 
1.7.9.5

-- 
sheepdog mailing list
sheepdog@lists.wpkg.org
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to