From: levin li <[email protected]> When the epoch changes, a new node joins, or an old node leaves, we should recalculate the vnode_info for every sd_node. The disk space is stored in sd_node and transferred to every other node together with the join message.
Signed-off-by: levin li <[email protected]> --- include/internal_proto.h | 3 ++- sheep/cluster.h | 1 + sheep/group.c | 24 +++++++++++++++++++++++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/internal_proto.h b/include/internal_proto.h index 4f1b0a0..0394e05 100644 --- a/include/internal_proto.h +++ b/include/internal_proto.h @@ -19,7 +19,7 @@ #include <stdint.h> -#define SD_SHEEP_PROTO_VER 0x05 +#define SD_SHEEP_PROTO_VER 0x06 #define SD_DEFAULT_REDUNDANCY 3 #define SD_MAX_REDUNDANCY 8 @@ -177,6 +177,7 @@ struct sd_node { struct node_id nid; uint16_t nr_vnodes; uint32_t zone; + uint32_t space; }; struct epoch_log { diff --git a/sheep/cluster.h b/sheep/cluster.h index 153e33f..75596a8 100644 --- a/sheep/cluster.h +++ b/sheep/cluster.h @@ -198,5 +198,6 @@ void sd_notify_handler(struct sd_node *sender, void *msg, size_t msg_len); bool sd_block_handler(struct sd_node *sender); enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void *opaque); +void recalculate_vnodes(struct sd_node *nodes, int nr_nodes); #endif diff --git a/sheep/group.c b/sheep/group.c index cb86050..03044cc 100644 --- a/sheep/group.c +++ b/sheep/group.c @@ -211,7 +211,9 @@ struct vnode_info *alloc_vnode_info(struct sd_node *nodes, memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes); qsort(vnode_info->nodes, nr_nodes, sizeof(*nodes), node_id_cmp); - vnode_info->nr_vnodes = nodes_to_vnodes(nodes, nr_nodes, + recalculate_vnodes(vnode_info->nodes, nr_nodes); + + vnode_info->nr_vnodes = nodes_to_vnodes(vnode_info->nodes, nr_nodes, vnode_info->vnodes); vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes); uatomic_set(&vnode_info->refcnt, 1); @@ -806,6 +808,24 @@ static void prepare_recovery(struct sd_node *joined, current_vnode_info = alloc_vnode_info(nodes, nr_nodes); } +void recalculate_vnodes(struct sd_node *nodes, int nr_nodes) +{ + int i; + uint64_t avg_size = 0; + float factor; + + for (i = 0; i < nr_nodes; i++) + avg_size += nodes[i].space; + 
avg_size /= nr_nodes; + + for (i = 0; i < nr_nodes; i++) { + factor = (float)nodes[i].space / (float)avg_size; + nodes[i].nr_vnodes = SD_DEFAULT_VNODES * factor; + dprintf("node %d has %d vnodes, free space %" PRIu32 "\n", + nodes[i].nid.port, nodes[i].nr_vnodes, nodes[i].space); + } +} + static void update_cluster_info(struct join_message *msg, struct sd_node *joined, struct sd_node *nodes, size_t nr_nodes) @@ -1196,6 +1216,8 @@ int create_cluster(int port, int64_t zone, int nr_vnodes, sys->this_node.zone = zone; dprintf("zone id = %u\n", sys->this_node.zone); + sys->this_node.space = sys->disk_space; + if (get_latest_epoch() > 0) { sys->status = SD_STATUS_WAIT_FOR_JOIN; -- 1.7.1 -- sheepdog mailing list [email protected] http://lists.wpkg.org/mailman/listinfo/sheepdog
