Signed-off-by: Liu Yuan <namei.u...@gmail.com> --- dog/dog.c | 6 +- dog/dog.h | 4 +- dog/vdi.c | 24 +++--- include/internal_proto.h | 1 - include/sheep.h | 201 +++++++++++++++------------------------------- sheep/gateway.c | 10 +-- sheep/group.c | 12 ++- sheep/plain_store.c | 5 +- sheep/recovery.c | 9 +-- sheep/request.c | 8 +- 10 files changed, 105 insertions(+), 175 deletions(-)
diff --git a/dog/dog.c b/dog/dog.c index 9ec41c1..0009c92 100644 --- a/dog/dog.c +++ b/dog/dog.c @@ -49,8 +49,8 @@ static void usage(const struct command *commands, int status); uint32_t sd_epoch; struct sd_node sd_nodes[SD_MAX_NODES]; -struct sd_vnode sd_vnodes[SD_MAX_VNODES]; -int sd_nodes_nr, sd_vnodes_nr; +int sd_nodes_nr; +struct rb_root sd_vroot = RB_ROOT; int update_node_list(int max_nodes) { @@ -92,7 +92,7 @@ int update_node_list(int max_nodes) } memcpy(sd_nodes, buf, size); - sd_vnodes_nr = nodes_to_vnodes(sd_nodes, sd_nodes_nr, sd_vnodes); + nodes_to_vnodes(sd_nodes, sd_nodes_nr, &sd_vroot); sd_epoch = hdr.epoch; out: if (buf) diff --git a/dog/dog.h b/dog/dog.h index e1c499e..9b17eb7 100644 --- a/dog/dog.h +++ b/dog/dog.h @@ -56,8 +56,8 @@ extern bool verbose; extern uint32_t sd_epoch; extern struct sd_node sd_nodes[SD_MAX_NODES]; -extern struct sd_vnode sd_vnodes[SD_MAX_VNODES]; -extern int sd_nodes_nr, sd_vnodes_nr; +extern struct rb_root sd_vroot; +extern int sd_nodes_nr; bool is_current(const struct sd_inode *i); char *strnumber(uint64_t _size); diff --git a/dog/vdi.c b/dog/vdi.c index 82821ac..b4e8821 100644 --- a/dog/vdi.c +++ b/dog/vdi.c @@ -904,14 +904,12 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies) int i, j, ret; struct sd_req hdr; struct sd_rsp *rsp = (struct sd_rsp *)&hdr; - struct sd_vnode *vnodes; - const struct sd_vnode *vnode_buf[SD_MAX_COPIES]; + struct sd_vnode *vnode_buf[SD_MAX_COPIES]; struct epoch_log *logs; - int vnodes_nr, nr_logs, log_length; + int nr_logs, log_length; log_length = sd_epoch * sizeof(struct epoch_log); logs = xmalloc(log_length); - vnodes = xmalloc(sizeof(*vnodes) * SD_MAX_VNODES); sd_init_req(&hdr, SD_OP_STAT_CLUSTER); hdr.data_length = log_length; @@ -927,6 +925,9 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies) nr_logs = rsp->data_length / sizeof(struct epoch_log); for (i = nr_logs - 1; i >= 0; i--) { + struct rb_root vroot; + struct sd_vnode *v; + printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n", oid, logs[i].epoch, nr_copies); printf("---------------------------------------------------\n"); @@ -943,22 +944,23 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies) } continue; } - vnodes_nr = nodes_to_vnodes(logs[i].nodes, - logs[i].nr_nodes, vnodes); - oid_to_vnodes(vnodes, vnodes_nr, oid, nr_copies, vnode_buf); + nodes_to_vnodes(logs[i].nodes, logs[i].nr_nodes, &vroot); + oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf); for (j = 0; j < nr_copies; j++) { const struct node_id *n = &vnode_buf[j]->node->nid; printf("%s\n", addr_to_str(n->addr, n->port)); } + rb_for_each_entry(v, &vroot, rb) { + rb_erase(&v->rb, &vroot); + free(v); + } } free(logs); - free(vnodes); return EXIT_SUCCESS; error: free(logs); - free(vnodes); return EXIT_SYSFAIL; } @@ -1531,7 +1533,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid, uint64_t *done, struct work_queue *wq) { struct vdi_check_info *info; - const struct sd_vnode *tgt_vnodes[SD_MAX_COPIES]; + struct sd_vnode *tgt_vnodes[SD_MAX_COPIES]; int nr_copies = inode->nr_copies; info = xzalloc(sizeof(*info) + sizeof(info->vcw[0]) * nr_copies); @@ -1541,7 +1543,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid, info->done = done; info->wq = wq; - oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes); + oid_to_vnodes(oid, &sd_vroot, nr_copies, tgt_vnodes); for (int i = 0; i < nr_copies; i++) { info->vcw[i].info = info; info->vcw[i].vnode = tgt_vnodes[i]; diff --git a/include/internal_proto.h b/include/internal_proto.h index 8544c75..7d1df2c 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -29,7 +29,6 @@ #define SD_MAX_NODES 1024 #define SD_DEFAULT_VNODES 128 -#define SD_MAX_VNODES (SD_MAX_NODES * SD_DEFAULT_VNODES) /* * Operations with opcodes above 0x80 are considered part of the inter-sheep diff --git a/include/sheep.h b/include/sheep.h index bbb7721..0abd053 100644 --- a/include/sheep.h +++ b/include/sheep.h @@ -1,5 +1,6 @@ /* * Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2012-2013 Taobao Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version @@ -17,19 +18,18 @@ #include "bitops.h" #include "list.h" #include "net.h" +#include "rbtree.h" struct sd_vnode { + struct rb_node rb; const struct sd_node *node; - uint64_t id; + uint64_t hash; }; struct vnode_info { - struct sd_vnode vnodes[SD_MAX_VNODES]; - int nr_vnodes; - + struct rb_root vroot; struct sd_node nodes[SD_MAX_NODES]; int nr_nodes; - int nr_zones; refcnt_t refcnt; }; @@ -56,133 +56,77 @@ static inline void sd_init_req(struct sd_req *req, uint8_t opcode) req->proto_ver = opcode < 0x80 ? SD_PROTO_VER : SD_SHEEP_PROTO_VER; } -static inline int same_zone(const struct sd_vnode *e, int n1, int n2) +static inline int same_zone(struct sd_vnode *v1, struct sd_vnode *v2) { - return e[n1].node->zone == e[n2].node->zone; + return v1->node->zone == v2->node->zone; } -/* Get the first vnode's index which is matching the OID */ -static inline int get_vnode_first_idx(const struct sd_vnode *entries, - int nr_entries, uint64_t oid) +static inline int vnode_cmp(const struct sd_vnode *node1, + const struct sd_vnode *node2) { - uint64_t id = sd_hash_oid(oid); - int start, end, pos; - - assert(nr_entries > 0); - - start = 0; - end = nr_entries - 1; - - if (id > entries[end].id || id < entries[start].id) - return (end + 1) % nr_entries; - - for (;;) { - pos = (end - start) / 2 + start; - if (entries[pos].id < id) { - if (entries[pos + 1].id >= id) - return (pos + 1) % nr_entries; - start = pos; - } else - end = pos; - } + return intcmp(node1->hash, node2->hash); } -/* Get next vnode's index according to the PREV_IDXS */ -static inline int get_vnode_next_idx(const struct sd_vnode *entries, - int nr_entries, int *prev_idxs, - int nr_prev_idxs) +/* If v1_hash < oid_hash <= v2_hash, then oid is resident on v2 */ +static inline struct sd_vnode * +oid_to_first_vnode(uint64_t oid, struct rb_root *root) { - int i, idx, first_idx; - bool found; - - first_idx = prev_idxs[0]; - idx = prev_idxs[nr_prev_idxs - 1]; - for (;;) { - idx = (idx + 1) % nr_entries; - if (unlikely(idx == first_idx)) - panic("can't find next new idx"); - - for (found = false, i = 0; i < nr_prev_idxs; i++) { - if (same_zone(entries, idx, prev_idxs[i])) { - found = true; - break; - } - } - - if (!found) - return idx; - } + struct sd_vnode dummy = { + .hash = sd_hash_oid(oid), + }; + return rb_nsearch(root, &dummy, rb, vnode_cmp); } -/* Get the n'th vnode's index which is matching the OID */ -static inline int get_vnode_nth_idx(const struct sd_vnode *entries, - int nr_entries, uint64_t oid, int nth) +/* Replica are placed along the ring one by one with different zones */ +static inline void oid_to_vnodes(uint64_t oid, struct rb_root *root, + int nr_copies, + struct sd_vnode **vnodes) { - int nr_idxs = 0, idxs[SD_MAX_COPIES]; - - idxs[nr_idxs++] = get_vnode_first_idx(entries, nr_entries, oid); - - if (!nth) - return idxs[nth]; - - while (nr_idxs <= nth) { - idxs[nr_idxs] = get_vnode_next_idx(entries, nr_entries, - idxs, nr_idxs); - nr_idxs++; + struct sd_vnode *next = oid_to_first_vnode(oid, root); + + vnodes[0] = next; + for (int i = 1; i < nr_copies; i++) { +next: + next = rb_entry(rb_next(&next->rb), struct sd_vnode, rb); + if (!next) /* Wrap around */ + next = rb_entry(rb_first(root), struct sd_vnode, rb); + if (unlikely(next == vnodes[0])) + panic("can't find a valid vnode"); + for (int j = 0; j < i; j++) + if (same_zone(vnodes[j], next)) + goto next; + vnodes[i] = next; } - - return idxs[nth]; } -static inline const struct sd_vnode *oid_to_vnode(const struct sd_vnode *entries, - int nr_entries, uint64_t oid, - int copy_idx) +static inline const struct sd_vnode * +oid_to_vnode(uint64_t oid, struct rb_root *root, int copy_idx) { - int idx = get_vnode_nth_idx(entries, nr_entries, oid, copy_idx); + struct sd_vnode *vnodes[SD_MAX_COPIES]; - return &entries[idx]; + oid_to_vnodes(oid, root, copy_idx + 1, vnodes); + + return vnodes[copy_idx]; } -static inline const struct sd_node *oid_to_node(const struct sd_vnode *entries, - int nr_entries, uint64_t oid, - int copy_idx) +static inline const struct sd_node * +oid_to_node(uint64_t oid, struct rb_root *root, int copy_idx) { const struct sd_vnode *vnode; - vnode = oid_to_vnode(entries, nr_entries, oid, copy_idx); + vnode = oid_to_vnode(oid, root, copy_idx); return vnode->node; } -static inline void oid_to_vnodes(const struct sd_vnode *entries, int nr_entries, - uint64_t oid, int nr_copies, - const struct sd_vnode **vnodes) -{ - int idx, idxs[SD_MAX_COPIES], i; - - if (nr_entries == 0) - return; - - idx = get_vnode_first_idx(entries, nr_entries, oid); - idxs[0] = idx; - vnodes[0] = &entries[idx]; - - for (i = 1; i < nr_copies; i++) { - idx = get_vnode_next_idx(entries, nr_entries, idxs, i); - idxs[i] = idx; - vnodes[i] = &entries[idx]; - } -} - -static inline void oid_to_nodes(const struct sd_vnode *entries, int nr_entries, - uint64_t oid, int nr_copies, +static inline void oid_to_nodes(uint64_t oid, struct rb_root *root, + int nr_copies, const struct sd_node **nodes) { - int i; - const struct sd_vnode *vnodes[SD_MAX_COPIES]; + struct sd_vnode *vnodes[SD_MAX_COPIES]; - oid_to_vnodes(entries, nr_entries, oid, nr_copies, vnodes); - for (i = 0; i < nr_copies; i++) + oid_to_vnodes(oid, root, nr_copies, vnodes); + for (int i = 0; i < nr_copies; i++) nodes[i] = vnodes[i]->node; } @@ -273,39 +217,24 @@ static inline bool node_eq(const struct sd_node *a, const struct sd_node *b) return node_cmp(a, b) == 0; } -static inline int vnode_cmp(const struct sd_vnode *node1, - const struct sd_vnode *node2) +static inline void +nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, struct rb_root *root) { - return intcmp(node1->id, node2->id); -} - -static inline int node_to_vnodes(const struct sd_node *n, - struct sd_vnode *vnodes) -{ - int nr = 0; - uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr)); - - for (int i = 0; i < n->nr_vnodes; i++) { - hval = sd_hash_next(hval); - vnodes[nr].id = hval; - vnodes[nr].node = n; - nr++; + for (int j = 0; j < nr_nodes; j++) { + const struct sd_node *n = nodes + j; + uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), + io_addr)); + + for (int i = 0; i < n->nr_vnodes; i++) { + struct sd_vnode *v = xmalloc(sizeof(*v)); + + hval = sd_hash_next(hval); + v->hash = hval; + v->node = n; + if (unlikely(rb_insert(root, v, rb, vnode_cmp))) + panic("vdisk hash collison"); + } } - - return nr; -} - -static inline int nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, - struct sd_vnode *vnodes) -{ - int nr_vnodes = 0; - - for (int i = 0; i < nr_nodes; i++) - nr_vnodes += node_to_vnodes(nodes + i, vnodes + nr_vnodes); - - xqsort(vnodes, nr_vnodes, vnode_cmp); - - return nr_vnodes; } #define MAX_NODE_STR_LEN 256 diff --git a/sheep/gateway.c b/sheep/gateway.c index 33f04e0..475b1bc 100644 --- a/sheep/gateway.c +++ b/sheep/gateway.c @@ -31,7 +31,7 @@ int gateway_read_obj(struct request *req) struct sd_req fwd_hdr; struct sd_rsp *rsp = (struct sd_rsp *)&fwd_hdr; const struct sd_vnode *v; - const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; + struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; uint64_t oid = req->rq.obj.oid; int nr_copies, j; @@ -43,8 +43,7 @@ int gateway_read_obj(struct request *req) nr_copies = get_req_copy_number(req); - oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, - nr_copies, obj_vnodes); + oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (!vnode_is_local(v)) @@ -239,11 +238,10 @@ static int init_target_nodes(struct request *req, uint64_t oid, const struct sd_node **target_nodes) { int nr_to_send; - const struct vnode_info *vinfo = req->vinfo; + struct vnode_info *vinfo = req->vinfo; nr_to_send = get_req_copy_number(req); - oid_to_nodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_to_send, - target_nodes); + oid_to_nodes(oid, &vinfo->vroot, nr_to_send, target_nodes); return nr_to_send; } diff --git a/sheep/group.c b/sheep/group.c index e2ee266..378d084 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -101,8 +101,14 @@ main_fn struct vnode_info *get_vnode_info(void) void put_vnode_info(struct vnode_info *vnode_info) { if (vnode_info) { - if (refcount_dec(&vnode_info->refcnt) == 0) + if (refcount_dec(&vnode_info->refcnt) == 0) { + struct sd_vnode *v; + rb_for_each_entry(v, &vnode_info->vroot, rb) { + rb_erase(&v->rb, &vnode_info->vroot); + free(v); + } free(vnode_info); + } } } @@ -139,14 +145,14 @@ struct vnode_info *alloc_vnode_info(const struct sd_node *nodes, vnode_info = xzalloc(sizeof(*vnode_info)); + INIT_RB_ROOT(&vnode_info->vroot); vnode_info->nr_nodes = nr_nodes; memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes); xqsort(vnode_info->nodes, nr_nodes, node_cmp); recalculate_vnodes(vnode_info->nodes, nr_nodes); - vnode_info->nr_vnodes = nodes_to_vnodes(vnode_info->nodes, nr_nodes, - vnode_info->vnodes); + nodes_to_vnodes(vnode_info->nodes, nr_nodes, &vnode_info->vroot); vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes); refcount_set(&vnode_info->refcnt, 1); return vnode_info; diff --git a/sheep/plain_store.c b/sheep/plain_store.c index 7cba585..68fa4e3 100644 --- a/sheep/plain_store.c +++ b/sheep/plain_store.c @@ -392,13 +392,12 @@ static bool oid_stale(uint64_t oid) struct vnode_info *vinfo; const struct sd_vnode *v; bool ret = true; - const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; + struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; vinfo = get_vnode_info(); nr_copies = get_obj_copy_number(oid, vinfo->nr_zones); - oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid, - nr_copies, obj_vnodes); + oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { v = obj_vnodes[i]; if (vnode_is_local(v)) { diff --git a/sheep/recovery.c b/sheep/recovery.c index 6a02adf..758e4b5 100644 --- a/sheep/recovery.c +++ b/sheep/recovery.c @@ -184,7 +184,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row, for (int i = 0; i < nr_copies; i++) { const struct sd_vnode *vnode; - vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i); + vnode = oid_to_vnode(oid, &old->vroot, i); if (vnode_is_local(vnode)) { start = i; @@ -197,7 +197,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row, const struct sd_node *node; int idx = (i + start) % nr_copies; - node = oid_to_node(old->vnodes, old->nr_vnodes, oid, idx); + node = oid_to_node(oid, &old->vroot, idx); if (invalid_node(node, row->base.cur_vinfo)) continue; @@ -712,7 +712,7 @@ static void screen_object_list(struct recovery_list_work *rlw, uint64_t *oids, size_t nr_oids) { struct recovery_work *rw = &rlw->base; - const struct sd_vnode *vnodes[SD_MAX_COPIES]; + struct sd_vnode *vnodes[SD_MAX_COPIES]; uint64_t old_count = rlw->count; uint64_t nr_objs; uint64_t i, j; @@ -724,8 +724,7 @@ static void screen_object_list(struct recovery_list_work *rlw, nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones); - oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes, - oids[i], nr_objs, vnodes); + oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes); for (j = 0; j < nr_objs; j++) { if (!vnode_is_local(vnodes[j])) continue; diff --git a/sheep/request.c b/sheep/request.c index 5bd917b..161365e 100644 --- a/sheep/request.c +++ b/sheep/request.c @@ -23,14 +23,12 @@ static void del_requeue_request(struct request *req) static bool is_access_local(struct request *req, uint64_t oid) { - const struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; + struct sd_vnode *obj_vnodes[SD_MAX_COPIES]; int nr_copies; int i; nr_copies = get_req_copy_number(req); - oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid, - nr_copies, obj_vnodes); - + oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes); for (i = 0; i < nr_copies; i++) { if (vnode_is_local(obj_vnodes[i])) return true; @@ -310,7 +308,7 @@ static void queue_gateway_request(struct request *req) return; queue_work: - if (req->vinfo->nr_vnodes == 0) { + if (RB_EMPTY_ROOT(&req->vinfo->vroot)) { sd_err("there is no living nodes"); req->rp.result = SD_RES_HALT; put_request(req); -- 1.7.9.5 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog