Current auto recovery to consume as much as possible the resources of sheepdog node.
So, this patch intended to allow the speed throttling of auto-recovery. By speed throttling, reduce the resource consumption at auto recovery. Add new options to sheep and dog commands. Its options are "interval" and "object processing number". see examples follows. ex) sheep -R max=5,interval=1000 /var/lib/sheepdog dog node recovery set-throttle 5 1000 dog node recovery get-throttle Signed-off-by: Yasuhito Fukuda <fukuda.yasuh...@po.ntts.co.jp> --- dog/node.c | 146 +++++++++++++++++++++++++++++++++++++++++++++- include/internal_proto.h | 2 + include/sheepdog_proto.h | 6 ++ sheep/ops.c | 38 ++++++++++++ sheep/recovery.c | 139 ++++++++++++++++++++++++++++++++++++++++++- sheep/sheep.c | 41 +++++++++++++ sheep/sheep_priv.h | 4 + 7 files changed, 370 insertions(+), 6 deletions(-) diff --git a/dog/node.c b/dog/node.c index a4e9142..d4c8fe7 100644 --- a/dog/node.c +++ b/dog/node.c @@ -183,7 +183,7 @@ static int node_recovery_progress(void) return result < 0 ? EXIT_SYSFAIL : EXIT_SUCCESS; } -static int node_recovery(int argc, char **argv) +static int node_recovery_info(int argc, char **argv) { struct sd_node *n; int ret, i = 0; @@ -235,6 +235,120 @@ static int node_recovery(int argc, char **argv) return EXIT_SUCCESS; } +static int node_recovery_set(int argc, char **argv) +{ + char *p; + struct recovery_throttling *rthrottling; + + rthrottling = xmalloc(sizeof(struct recovery_throttling)); + + if (!argv[optind] || !argv[optind + 1]) { + sd_err("Invalid interval max (%s), interval (%s)", + argv[optind], argv[optind + 1]); + exit(EXIT_USAGE); + } + + rthrottling->max_exec_count = strtoul(argv[optind], &p, 10); + if (argv[optind] == p || rthrottling->max_exec_count < 0 || + UINT32_MAX <= rthrottling->max_exec_count || errno != 0 || + *p != '\0') { + sd_err("Invalid max (%s)", argv[optind]); + exit(EXIT_USAGE); + } + + optind++; + + rthrottling->queue_work_interval = strtoull(argv[optind], &p, 10); + if (argv[optind] == p || rthrottling->queue_work_interval < 0 || + UINT64_MAX <= rthrottling->queue_work_interval || errno != 0 || + *p != '\0') { + sd_err("Invalid interval (%s)", argv[optind]); + exit(EXIT_USAGE); + } + + if ((rthrottling->max_exec_count == 0 && + rthrottling->queue_work_interval != 0) || + (rthrottling->max_exec_count != 0 && + rthrottling->queue_work_interval == 0)) { + sd_err("Invalid interval max (%"PRIu32"), interval (%"PRIu64")", + rthrottling->max_exec_count, rthrottling->queue_work_interval); + exit(EXIT_USAGE); + } + + int ret = 0; + struct sd_req req; + struct sd_rsp *rsp = (struct sd_rsp *)&req; + + sd_init_req(&req, SD_OP_SET_RECOVERY); + req.flags = SD_FLAG_CMD_WRITE; + req.data_length = sizeof(struct recovery_throttling); + ret = dog_exec_req(&sd_nid, &req, rthrottling); + + if (ret < 0) + ret = EXIT_SYSFAIL; + + if (rsp->result == SD_RES_SUCCESS) + ret = EXIT_SUCCESS; + else + ret = EXIT_FAILURE; + + switch (ret) { + case EXIT_FAILURE: + case EXIT_SYSFAIL: + sd_err("Failed to execute request"); + ret = -1; + break; + case EXIT_SUCCESS: + /* do nothing */ + break; + default: + sd_err("unknown return code: %d", ret); + ret = -1; + break; + } + + free(rthrottling); + return ret; +} + +static int node_recovery_get(int argc, char **argv) +{ + struct recovery_throttling rthrottling; + int ret = 0; + + struct sd_req req; + struct sd_rsp *rsp = (struct sd_rsp *)&req; + + sd_init_req(&req, SD_OP_GET_RECOVERY); + req.data_length = sizeof(rthrottling); + + ret = dog_exec_req(&sd_nid, &req, &rthrottling); + if (ret < 0) + ret = EXIT_SYSFAIL; + + if (rsp->result == SD_RES_SUCCESS) + ret = EXIT_SUCCESS; + else + ret = EXIT_FAILURE; + + switch (ret) { + case EXIT_FAILURE: + case EXIT_SYSFAIL: + sd_err("Failed to execute request"); + ret = -1; + break; + case EXIT_SUCCESS: + sd_info("max (%"PRIu32"), interval (%"PRIu64")", + rthrottling.max_exec_count, rthrottling.queue_work_interval); + break; + default: + sd_err("unknown return code: %d", ret); + ret = -1; + break; + } + return ret; +} + static struct sd_node *idx_to_node(struct rb_root *nroot, int idx) { struct sd_node *n = rb_entry(rb_first(nroot), struct sd_node, rb); @@ -538,6 +652,31 @@ static struct sd_option node_options[] = { { 0, NULL, false, NULL }, }; +static struct subcommand node_recovery_cmd[] = { + {"info", NULL, "aphPrT", "show recovery information of nodes (default)", + NULL, CMD_NEED_NODELIST, node_recovery_info, node_options}, + {"set-throttle", "<max> <interval>", NULL, "set new throttling", NULL, + CMD_NEED_ARG|CMD_NEED_NODELIST, node_recovery_set, node_options}, + {"get-throttle", NULL, NULL, "get current throttling", NULL, + CMD_NEED_NODELIST, node_recovery_get, node_options}, + {NULL}, +}; + +static int node_recovery(int argc, char **argv) +{ + int ret; + if (argc == optind) { + ret = update_node_list(SD_MAX_NODES); + if (ret < 0) { + sd_err("Failed to get node list"); + exit(EXIT_SYSFAIL); + } + return node_recovery_info(argc, argv); + } + + return do_generic_subcommand(node_recovery_cmd, argc, argv); +} + static int node_log_level_set(int argc, char **argv) { int ret = 0; @@ -632,8 +771,9 @@ static struct subcommand node_cmd[] = { CMD_NEED_NODELIST, node_list}, {"info", NULL, "aprhT", "show information about each node", NULL, CMD_NEED_NODELIST, node_info}, - {"recovery", NULL, "aphPrT", "show recovery information of nodes", NULL, - CMD_NEED_NODELIST, node_recovery, node_options}, + {"recovery", "<max> <interval>", "aphPrT", + "show recovery information or set/get recovery speed throttling of nodes", + node_recovery_cmd, 0, node_recovery, node_options}, {"md", "[disks]", "aprAfhT", "See 'dog node md' for more information", node_md_cmd, CMD_NEED_ARG, node_md, node_options}, {"stat", NULL, "aprwhT", "show stat information about the node", NULL, diff --git a/include/internal_proto.h b/include/internal_proto.h index 3f5d77f..f6ba18e 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -111,6 +111,8 @@ #define SD_OP_VDI_STATE_SNAPSHOT_CTL 0xC7 #define SD_OP_INODE_COHERENCE 0xC8 #define SD_OP_READ_DEL_VDIS 0xC9 +#define SD_OP_GET_RECOVERY 0xCA +#define SD_OP_SET_RECOVERY 0xCB /* internal flags for hdr.flags, must be above 0x80 */ #define SD_FLAG_CMD_RECOVERY 0x0080 diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h index 4f0c48c..5f6d157 100644 --- a/include/sheepdog_proto.h +++ b/include/sheepdog_proto.h @@ -272,6 +272,12 @@ struct generation_reference { int32_t count; }; +struct recovery_throttling { + uint32_t max_exec_count; + uint64_t queue_work_interval; + bool throttling; +}; + struct sd_inode { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; diff --git a/sheep/ops.c b/sheep/ops.c index e4daca2..2b4a769 100644 --- a/sheep/ops.c +++ b/sheep/ops.c @@ -1444,6 +1444,30 @@ static int cluster_inode_coherence(const struct sd_req *req, !!req->inode_coherence.validate, &sender->nid); } +static int local_get_recovery(struct request *req) +{ + struct recovery_throttling rthrottling; + + rthrottling = get_recovery(); + memcpy(req->data, &rthrottling, sizeof(rthrottling)); + req->rp.data_length = sizeof(rthrottling); + + return SD_RES_SUCCESS; +} + +static int local_set_recovery(struct request *req) +{ + struct recovery_throttling *rthrottling; + + rthrottling = xmalloc(sizeof(struct recovery_throttling)); + + memcpy(rthrottling, req->data, sizeof(struct recovery_throttling)); + set_recovery(rthrottling); + + free(rthrottling); + return SD_RES_SUCCESS; +} + static struct sd_op_template sd_ops[] = { /* cluster operations */ @@ -1891,6 +1915,20 @@ static struct sd_op_template sd_ops[] = { .type = SD_OP_TYPE_PEER, .process_work = peer_decref_object, }, + + [SD_OP_GET_RECOVERY] = { + .name = "GET_RECOVERY", + .type = SD_OP_TYPE_LOCAL, + .force = true, + .process_work = local_get_recovery, + }, + + [SD_OP_SET_RECOVERY] = { + .name = "SET_RECOVERY", + .type = SD_OP_TYPE_LOCAL, + .force = true, + .process_work = local_set_recovery, + }, }; const struct sd_op_template *get_sd_op(uint8_t opcode) diff --git a/sheep/recovery.c b/sheep/recovery.c index 85dad21..325122b 100644 --- a/sheep/recovery.c +++ b/sheep/recovery.c @@ -78,6 +78,15 @@ struct recovery_info { struct sd_mutex vinfo_lock; struct sd_node *excluded; + + uint32_t max_exec_count; + uint64_t queue_work_interval; + bool throttling; +}; + +struct recovery_timer { + void (*callback)(void *); + void *data; }; static struct recovery_info *next_rinfo; @@ -900,6 +909,91 @@ void resume_suspended_recovery(void) } } +static void recovery_timer_handler(int fd, int events, void *data) +{ + struct recovery_timer *t = data; + uint64_t val; + + if (read(fd, &val, sizeof(val)) < 0) + return; + t->callback(t->data); + unregister_event(fd); + close(fd); +} + +static void add_recovery_timer(struct recovery_timer *t, unsigned int mseconds) +{ + struct itimerspec it; + int tfd; + + tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (tfd < 0) { + sd_err("timerfd_create: %m"); + return; + } + + memset(&it, 0, sizeof(it)); + it.it_value.tv_sec = mseconds / 1000; + it.it_value.tv_nsec = (mseconds % 1000) * 1000000; + + if (timerfd_settime(tfd, 0, &it, NULL) < 0) { + sd_err("timerfd_settime: %m"); + return; + } + + if (register_event(tfd, recovery_timer_handler, t) < 0) + sd_err("failed to register timer fd"); +} + +static void recover_next_object_delay(void *arg) +{ + struct recovery_info *rinfo = main_thread_get(current_rinfo); + uint32_t nr_threads = md_nr_disks() * 2; + double thread_unit_exec = 0; + double mod = 0; + + if (!rinfo) + return; + + thread_unit_exec = (double) rinfo->max_exec_count / nr_threads; + mod = rinfo->max_exec_count % nr_threads; + + if (rinfo->max_exec_count <= nr_threads || mod != 0) { + if (rand() % 100 + 1 <= (mod / nr_threads) * 100) + thread_unit_exec = ceil(thread_unit_exec); + else + thread_unit_exec = floor(thread_unit_exec); + } + + for (int i = 0; i < thread_unit_exec; i++) { + rinfo = main_thread_get(current_rinfo); + + if (!rinfo) + return; + + if (rinfo->next - rinfo->done > rinfo->max_exec_count) + break; + + recover_next_object(rinfo); + } + + if (rinfo->throttling != sys->rthrottling.throttling) { + rinfo->max_exec_count = sys->rthrottling.max_exec_count; + rinfo->queue_work_interval = + sys->rthrottling.queue_work_interval; + rinfo->throttling = sys->rthrottling.throttling; + } + + if (rinfo->throttling) { + static struct recovery_timer rt = { + .callback = recover_next_object_delay, + .data = &rt, + }; + add_recovery_timer(&rt, rinfo->queue_work_interval); + } else + recover_next_object(rinfo); +} + static void recover_object_main(struct work *work) { struct recovery_work *rw = container_of(work, struct recovery_work, @@ -935,7 +1029,16 @@ static void recover_object_main(struct work *work) if (rinfo->done >= rinfo->count) goto finish_recovery; - recover_next_object(rinfo); + if (!rinfo->throttling && !sys->rthrottling.throttling) + recover_next_object(rinfo); + else if (!rinfo->throttling && sys->rthrottling.throttling) { + static struct recovery_timer rt = { + .callback = recover_next_object_delay, + .data = &rt, + }; + add_recovery_timer(&rt, sys->rthrottling.queue_work_interval); + } + free_recovery_obj_work(row); return; finish_recovery: @@ -982,8 +1085,17 @@ static void finish_object_list(struct work *work) return; } - for (uint32_t i = 0; i < nr_threads; i++) - recover_next_object(rinfo); + for (uint32_t i = 0; i < nr_threads; i++) { + if (rinfo->throttling) { + static struct recovery_timer rt = { + .callback = recover_next_object_delay, + .data = &rt, + }; + add_recovery_timer(&rt, rinfo->queue_work_interval); + } else + recover_next_object(rinfo); + } + return; } @@ -1143,6 +1255,9 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo, rinfo->max_epoch = sys->cinfo.epoch; rinfo->vinfo_array = xzalloc(sizeof(struct vnode_info *) * rinfo->max_epoch); + rinfo->max_exec_count = sys->rthrottling.max_exec_count; + rinfo->queue_work_interval = sys->rthrottling.queue_work_interval; + rinfo->throttling = sys->rthrottling.throttling; sd_init_mutex(&rinfo->vinfo_lock); if (epoch_lifted) rinfo->notify_complete = true; /* Reweight or node recovery */ @@ -1236,3 +1351,21 @@ void get_recovery_state(struct recovery_state *state) state->nr_finished = rinfo->done; state->nr_total = rinfo->count; } + +void set_recovery(struct recovery_throttling *rthrottling) +{ + sys->rthrottling.max_exec_count = rthrottling->max_exec_count; + sys->rthrottling.queue_work_interval = + rthrottling->queue_work_interval; + if (rthrottling->max_exec_count > 0 && + rthrottling->queue_work_interval > 0) + sys->rthrottling.throttling = true; + else + sys->rthrottling.throttling = false; +} + +struct recovery_throttling get_recovery(void) +{ + return sys->rthrottling; +} + diff --git a/sheep/sheep.c b/sheep/sheep.c index ef45a33..9fc7610 100644 --- a/sheep/sheep.c +++ b/sheep/sheep.c @@ -115,6 +115,12 @@ static const char log_help[] = " syslog syslog of the system\n" " stdout standard output\n"; +static const char recovery_help[] = +"Available arguments:\n" +"\tmax=: object recovery process maximum count of each interval\n" +"\tinterval=: object recovery interval time (millisec)\n" +"Example:\n\t$ sheep -R max=50,interval=1000 ...\n"; + static struct sd_option sheep_options[] = { {'b', "bindaddr", true, "specify IP address of interface to listen on", bind_help}, @@ -137,6 +143,8 @@ static struct sd_option sheep_options[] = { {'P', "pidfile", true, "create a pid file"}, {'r', "http", true, "enable http service. (default: disabled)", http_help}, + {'R', "recovery", true, "specify the recovery speed throttling", + recovery_help}, {'u', "upgrade", false, "upgrade to the latest data layout"}, {'v', "version", false, "show the version"}, {'w', "cache", true, "enable object cache", cache_help}, @@ -424,6 +432,26 @@ static struct option_parser journal_parsers[] = { { NULL, NULL }, }; +static uint32_t max_exec_count; +static uint64_t queue_work_interval; +static int max_exec_count_parser(const char *s) +{ + max_exec_count = strtol(s, NULL, 10); + return 0; +} + +static int queue_work_interval_parser(const char *s) +{ + queue_work_interval = strtol(s, NULL, 10); + return 0; +} + +static struct option_parser recovery_parsers[] = { + { "max=", max_exec_count_parser }, + { "interval=", queue_work_interval_parser }, + { NULL, NULL }, +}; + static size_t get_nr_nodes(void) { struct vnode_info *vinfo; @@ -633,6 +661,10 @@ int main(int argc, char **argv) sys->node_status = SD_NODE_STATUS_INITIALIZATION; + sys->rthrottling.max_exec_count = 0; + sys->rthrottling.queue_work_interval = 0; + sys->rthrottling.throttling = false; + install_crash_handler(crash_handler); signal(SIGPIPE, SIG_IGN); @@ -751,6 +783,15 @@ int main(int argc, char **argv) case 'h': usage(0); break; + case 'R': + if (option_parse(optarg, ",", recovery_parsers) < 0) + exit(1); + sys->rthrottling.max_exec_count = max_exec_count; + sys->rthrottling.queue_work_interval + = queue_work_interval; + if (max_exec_count > 0 && queue_work_interval > 0) + sys->rthrottling.throttling = true; + break; case 'v': fprintf(stdout, "Sheepdog daemon version %s\n", PACKAGE_VERSION); diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h index 4ac08f8..170e8ff 100644 --- a/sheep/sheep_priv.h +++ b/sheep/sheep_priv.h @@ -143,6 +143,8 @@ struct system_info { bool gateway_only; bool nosync; + struct recovery_throttling rthrottling; + struct work_queue *net_wqueue; struct work_queue *gateway_wqueue; struct work_queue *io_wqueue; @@ -428,6 +430,8 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool, bool oid_in_recovery(uint64_t oid); bool node_in_recovery(void); void get_recovery_state(struct recovery_state *state); +void set_recovery(struct recovery_throttling *rthrottling); +struct recovery_throttling get_recovery(void); int read_backend_object(uint64_t oid, char *data, unsigned int datalen, uint64_t offset); -- 1.7.1 -- NTTソフトウェア株式会社 クラウド事業部 第一事業ユニット(C一BU) 福田康人(FUKUDA Yasuhito) E-mail:fukuda.yasuh...@po.ntts.co.jp 〒220-0012 横浜市西区みなとみらい4-4-5 横浜アイマークプレイス13階 TEL:045-212-7393/FAX:045-662-7856 -- sheepdog mailing list sheepdog@lists.wpkg.org http://lists.wpkg.org/mailman/listinfo/sheepdog