We cannot call exec_req in cluster_force_recover, because it runs in the main thread. With this patch, sheep gets the epoch info in the worker thread and then notifies all the nodes of it.
Signed-off-by: MORITA Kazutaka <morita.kazut...@lab.ntt.co.jp> --- collie/cluster.c | 13 ++++++++--- sheep/group.c | 4 ++-- sheep/ops.c | 64 ++++++++++++++++++++++++++++++++++++++++-------------- sheep/sheep_priv.h | 2 ++ 4 files changed, 62 insertions(+), 21 deletions(-) diff --git a/collie/cluster.c b/collie/cluster.c index da2effa..0c5ac13 100644 --- a/collie/cluster.c +++ b/collie/cluster.c @@ -325,7 +325,9 @@ static int cluster_force_recover(int argc, char **argv) { int ret; struct sd_req hdr; + struct sd_rsp *rsp = (struct sd_rsp *)&hdr; char str[123] = {'\0'}; + struct sd_node nodes[SD_MAX_NODES]; if (!cluster_cmd_data.force) { int i, l; @@ -341,10 +343,15 @@ static int cluster_force_recover(int argc, char **argv) } sd_init_req(&hdr, SD_OP_FORCE_RECOVER); + hdr.data_length = sizeof(nodes); - ret = send_light_req(&hdr, sdhost, sdport); - if (ret) { - fprintf(stderr, "failed to execute request\n"); + ret = collie_exec_req(sdhost, sdport, &hdr, nodes); + if (ret < 0) + return EXIT_SYSFAIL; + + if (rsp->result != SD_RES_SUCCESS) { + fprintf(stderr, "failed to execute request, %s\n", + sd_strerror(rsp->result)); return EXIT_FAILURE; } diff --git a/sheep/group.c b/sheep/group.c index e70389f..85fe8d5 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -166,8 +166,8 @@ void put_vnode_info(struct vnode_info *vnode_info) } } -static struct vnode_info *alloc_vnode_info(const struct sd_node *nodes, - size_t nr_nodes) +struct vnode_info *alloc_vnode_info(const struct sd_node *nodes, + size_t nr_nodes) { struct vnode_info *vnode_info; diff --git a/sheep/ops.c b/sheep/ops.c index a6494b7..96b25eb 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -480,13 +480,10 @@ static int local_get_epoch(struct request *req) return SD_RES_SUCCESS; } -static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp, - void *data) +static int cluster_force_recover_work(struct request *req) { - struct vnode_info *old_vnode_info, *vnode_info; - int ret = SD_RES_SUCCESS; - uint8_t c; 
- uint16_t f; + struct vnode_info *old_vnode_info; + uint32_t epoch = sys_epoch(); /* * We should manually recover the cluster when @@ -494,8 +491,48 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp, * 2) some nodes are physically down (same epoch condition). * In both case, the nodes(s) stat is WAIT_FOR_JOIN. */ - if (sys->status != SD_STATUS_WAIT_FOR_JOIN) + if (sys->status != SD_STATUS_WAIT_FOR_JOIN || req->vinfo == NULL) + return SD_RES_FORCE_RECOVER; + + old_vnode_info = get_vnode_info_epoch(epoch, req->vinfo); + if (!old_vnode_info) { + sd_printf(SDOG_EMERG, "cannot get vnode info for epoch %d", + epoch); + put_vnode_info(old_vnode_info); return SD_RES_FORCE_RECOVER; + } + + if (req->rq.data_length < + sizeof(*old_vnode_info->nodes) * old_vnode_info->nr_nodes) { + sd_eprintf("too small buffer size, %d", req->rq.data_length); + return SD_RES_INVALID_PARMS; + } + + req->rp.epoch = epoch; + req->rp.data_length = sizeof(*old_vnode_info->nodes) * + old_vnode_info->nr_nodes; + memcpy(req->data, old_vnode_info->nodes, req->rp.data_length); + + put_vnode_info(old_vnode_info); + + return SD_RES_SUCCESS; +} + +static int cluster_force_recover_main(const struct sd_req *req, + struct sd_rsp *rsp, + void *data) +{ + struct vnode_info *old_vnode_info, *vnode_info; + int ret = SD_RES_SUCCESS; + uint8_t c; + uint16_t f; + struct sd_node *nodes = data; + size_t nr_nodes = rsp->data_length / sizeof(*nodes); + + if (rsp->epoch != sys->epoch) { + sd_eprintf("epoch was incremented while cluster_force_recover"); + return SD_RES_FORCE_RECOVER; + } ret = get_cluster_copies(&c); if (ret) { @@ -511,14 +548,6 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp, sys->nr_copies = c; sys->flags = f; - vnode_info = get_vnode_info(); - old_vnode_info = get_vnode_info_epoch(sys->epoch, vnode_info); - if (!old_vnode_info) { - sd_printf(SDOG_EMERG, "cannot get vnode info for epoch %d", - sys->epoch); - goto err; - } - sys->epoch++; 
/* some nodes are left, so we get a new epoch */ ret = log_current_epoch(); if (ret) { @@ -531,6 +560,8 @@ static int cluster_force_recover(const struct sd_req *req, struct sd_rsp *rsp, else sys->status = SD_STATUS_HALT; + vnode_info = get_vnode_info(); + old_vnode_info = alloc_vnode_info(nodes, nr_nodes); start_recovery(vnode_info, old_vnode_info); put_vnode_info(vnode_info); put_vnode_info(old_vnode_info); @@ -993,7 +1024,8 @@ static struct sd_op_template sd_ops[] = { .name = "FORCE_RECOVER", .type = SD_OP_TYPE_CLUSTER, .force = true, - .process_main = cluster_force_recover, + .process_work = cluster_force_recover_work, + .process_main = cluster_force_recover_main, }, [SD_OP_SNAPSHOT] = { diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 7722473..8f33e40 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -274,6 +274,8 @@ bool have_enough_zones(void); struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info); struct vnode_info *get_vnode_info(void); void put_vnode_info(struct vnode_info *vinfo); +struct vnode_info *alloc_vnode_info(const struct sd_node *nodes, + size_t nr_nodes); struct vnode_info *get_vnode_info_epoch(uint32_t epoch, struct vnode_info *cur_vinfo); void wait_get_vdis_done(void); -- 1.8.1.3.566.gaa39828 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog