From: levin li <xingke....@taobao.com> A newly joined node either doesn't have the vdi copy list or has an incomplete one, so we need to fetch the copy list data from the other nodes.
Signed-off-by: levin li <xingke....@taobao.com> --- include/internal_proto.h | 1 + sheep/group.c | 11 +++++-- sheep/ops.c | 24 +++++++++++++-- sheep/recovery.c | 42 +++++++++++++++++++++++++-- sheep/sheep_priv.h | 5 ++- sheep/vdi.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 142 insertions(+), 11 deletions(-) diff --git a/include/internal_proto.h b/include/internal_proto.h index 83d98f1..3d70ba9 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -63,6 +63,7 @@ #define SD_OP_ENABLE_RECOVER 0xA8 #define SD_OP_DISABLE_RECOVER 0xA9 #define SD_OP_INFO_RECOVER 0xAA +#define SD_OP_GET_VDI_COPIES 0xAB /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 diff --git a/sheep/group.c b/sheep/group.c index 05ffb3e..fd631ec 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -853,6 +853,8 @@ static void update_cluster_info(struct join_message *msg, if (msg->inc_epoch) { if (!sys->disable_recovery) { + int is_newly_joined = 0; + uatomic_inc(&sys->epoch); log_current_epoch(); clear_exceptional_node_lists(); @@ -863,8 +865,11 @@ static void update_cluster_info(struct join_message *msg, nodes, nr_nodes); } - start_recovery(current_vnode_info, - old_vnode_info); + if (node_eq(joined, &sys->this_node)) + is_newly_joined = 1; + + start_recovery(current_vnode_info, old_vnode_info, + is_newly_joined); } else prepare_recovery(joined, nodes, nr_nodes); } @@ -1148,7 +1153,7 @@ void sd_leave_handler(struct sd_node *left, struct sd_node *members, case SD_STATUS_OK: uatomic_inc(&sys->epoch); log_current_epoch(); - start_recovery(current_vnode_info, old_vnode_info); + start_recovery(current_vnode_info, old_vnode_info, 0); if (!have_enough_zones()) sys->status = SD_STATUS_HALT; diff --git a/sheep/ops.c b/sheep/ops.c index efaf979..ce0f8a4 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -282,13 +282,16 @@ static int cluster_shutdown(const struct sd_req *req, struct sd_rsp *rsp, static int cluster_enable_recover(const 
struct sd_req *req, struct sd_rsp *rsp, void *data) { - int i; + int i, is_newly_joined = 0; struct vnode_info *old_vnode_info, *vnode_info; if (nr_joining_nodes) { - for (i = 0; i < nr_joining_nodes; i++) + for (i = 0; i < nr_joining_nodes; i++) { all_nodes[nr_all_nodes++] = joining_nodes[i]; + if (node_eq(&joining_nodes[i], &sys->this_node)) + is_newly_joined = 1; + } old_vnode_info = get_vnode_info(); vnode_info = alloc_vnode_info(all_nodes, nr_all_nodes); @@ -298,7 +301,7 @@ static int cluster_enable_recover(const struct sd_req *req, log_current_epoch(); clear_exceptional_node_lists(); - start_recovery(vnode_info, old_vnode_info); + start_recovery(vnode_info, old_vnode_info, is_newly_joined); put_vnode_info(old_vnode_info); } @@ -447,6 +450,13 @@ static int local_get_obj_list(struct request *req) (struct sd_list_rsp *)&req->rp, req->data); } +static int local_get_vdi_copies(struct request *req) +{ + req->rp.data_length = fill_vdi_copy_list(req->data); + + return SD_RES_SUCCESS; +} + static int local_get_epoch(struct request *req) { uint32_t epoch = req->rq.obj.tgt_epoch; @@ -510,7 +520,7 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp, sys->status = SD_STATUS_HALT; vnode_info = get_vnode_info(); - start_recovery(vnode_info, old_vnode_info); + start_recovery(vnode_info, old_vnode_info, 1); put_vnode_info(vnode_info); out: put_vnode_info(old_vnode_info); @@ -989,6 +999,12 @@ static struct sd_op_template sd_ops[] = { .process_work = local_get_obj_list, }, + [SD_OP_GET_VDI_COPIES] = { + .name = "GET_VDI_COPIES", + .type = SD_OP_TYPE_LOCAL, + .process_work = local_get_vdi_copies, + }, + [SD_OP_GET_EPOCH] = { .name = "GET_EPOCH", .type = SD_OP_TYPE_LOCAL, diff --git a/sheep/recovery.c b/sheep/recovery.c index 5164aa7..3fdcad2 100644 --- a/sheep/recovery.c +++ b/sheep/recovery.c @@ -592,7 +592,38 @@ static inline bool node_is_gateway_only(void) return sys->this_node.nr_vnodes == 0 ? 
true : false; } -int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo) +static void prepare_vdi_copy_list(struct work *work) +{ + struct recovery_work *rw = container_of(work, struct recovery_work, + work); + struct sd_node *nodes = rw->cur_vinfo->nodes; + int i, nr_nodes = rw->cur_vinfo->nr_nodes; + + for (i = 0; i < nr_nodes; i++) { + if (node_eq(nodes + i, &sys->this_node)) + continue; + + fetch_vdi_copies_from(nodes + i); + } +} + +static void finish_vdi_copy_list(struct work *work) +{ + struct recovery_work *rw = container_of(work, struct recovery_work, + work); + + if (next_rw) { + run_next_rw(rw); + return; + } + + rw->work.fn = prepare_object_list; + rw->work.done = finish_object_list; + queue_work(sys->recovery_wqueue, &rw->work); +} + +int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, + int is_newly_joined) { struct recovery_work *rw; @@ -613,8 +644,13 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo) rw->cur_vinfo = grab_vnode_info(cur_vinfo); rw->old_vinfo = grab_vnode_info(old_vinfo); - rw->work.fn = prepare_object_list; - rw->work.done = finish_object_list; + if (is_newly_joined) { + rw->work.fn = prepare_vdi_copy_list; + rw->work.done = finish_vdi_copy_list; + } else { + rw->work.fn = prepare_object_list; + rw->work.done = finish_object_list; + } if (sd_store->begin_recover) { struct siocb iocb = { 0 }; diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 3f763c4..335e337 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -207,6 +207,8 @@ int get_max_copy_number(void); int get_req_copy_number(struct request *req); int add_vdi_copies(uint32_t vid, int nr_copies); int load_vdi_copies(void); +int fetch_vdi_copies_from(struct sd_node *node); +int fill_vdi_copy_list(void *data); int vdi_exist(uint32_t vid); int add_vdi(struct vdi_iocb *iocb, uint32_t *new_vid); @@ -272,7 +274,8 @@ uint64_t get_cluster_ctime(void); int get_obj_list(const struct sd_list_req 
*, struct sd_list_rsp *, void *); int objlist_cache_cleanup(uint32_t vid); -int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo); +int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, + int is_newly_joined); void resume_recovery_work(void); bool oid_in_recovery(uint64_t oid); int is_recovery_init(void); diff --git a/sheep/vdi.c b/sheep/vdi.c index 0581ad6..72fbd7b 100644 --- a/sheep/vdi.c +++ b/sheep/vdi.c @@ -18,6 +18,11 @@ #include "sheepdog_proto.h" #include "sheep_priv.h" +struct vdi_copy { + uint32_t vid; + uint32_t nr_copies; +}; + struct vdi_copy_entry { uint32_t vid; unsigned int nr_copies; @@ -226,6 +231,71 @@ out: return ret; } +int fetch_vdi_copies_from(struct sd_node *node) +{ + char host[128]; + struct sd_req hdr; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; + struct vdi_copy *vc; + int fd, ret, i; + unsigned int wlen, rlen; + int count = 1 << 15; + + addr_to_str(host, sizeof(host), node->nid.addr, 0); + dprintf("fetch vdi copy list from %s:%d\n", host, node->nid.port); + fd = connect_to(host, node->nid.port); + if (fd < 0) { + dprintf("fail: %m\n"); + return SD_RES_NETWORK_ERROR; + } + + sd_init_req(&hdr, SD_OP_GET_VDI_COPIES); + hdr.epoch = sys->epoch; + hdr.data_length = count * sizeof(*vc); + rlen = hdr.data_length; + wlen = 0; + + vc = xzalloc(rlen); + + ret = exec_req(fd, &hdr, (char *)vc, &wlen, &rlen); + close(fd); + + if (ret || rsp->result != SD_RES_SUCCESS) { + eprintf("fail to get VDI copy list (%d, %d)\n", + ret, rsp->result); + ret = SD_RES_NETWORK_ERROR; + goto out; + } + + count = rsp->data_length / sizeof(*vc); + dprintf("got %d vdi copy data\n", count); + for (i = 0; i < count; i++) + add_vdi_copies(vc[i].vid, vc[i].nr_copies); +out: + free(vc); + return ret; +} + +int fill_vdi_copy_list(void *data) +{ + int nr = 0; + struct rb_node *n; + struct vdi_copy *vc = data; + struct vdi_copy_entry *entry; + + pthread_rwlock_rdlock(&vdi_copy_lock); + for (n = rb_first(&vdi_copy_root); n; n = 
rb_next(n)) { + entry = rb_entry(n, struct vdi_copy_entry, node); + vc->vid = entry->vid; + vc->nr_copies = entry->nr_copies; + vc++; + nr++; + } + pthread_rwlock_unlock(&vdi_copy_lock); + + return nr * sizeof(*vc); +} + int vdi_exist(uint32_t vid) { struct sheepdog_inode *inode; -- 1.7.1 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog