This patch fixes the case where you do the following: 1: start all nodes; 2: isolate node n1; 3: kill corosync on n1; 4: unisolate node n1; 5: start corosync on n1; 6: start cpg on all nodes; 7: isolate node n1; 8: kill corosync on n1; 9: unisolate node n1; 10: start corosync on n1; 11: waiting for config change on n2.
To achieve this we need to mcast some info around the cluster when the cpg service gets a sync event. We need to do this to determine the correct downlist to use. The problem is that the nodes do not have a consistent view of the cluster (a joining node thinks lots of other nodes are joining, whilst the others think only one is joining). So I have used the number of "old_members" and then the nodeid as a way of deciding the sync master. Each node sends its nodeid, number of old_members and downlist. The sync master is the node with the largest number of old_members and, on a tie, the smallest nodeid. Would it be useful to make this information public? e.g. in objdb "runtime/cpg/sync_master=<nodeid>". Then if you are interested you could read this value, get notifications etc... Regards Angus Signed-off-by: Angus Salkeld <asalk...@redhat.com> --- services/cpg.c | 215 ++++++++++++++++++++++++++++++++++++++++---------------- 1 files changed, 153 insertions(+), 62 deletions(-) diff --git a/services/cpg.c b/services/cpg.c index 8ce74ae..e076942 100644 --- a/services/cpg.c +++ b/services/cpg.c @@ -75,7 +75,8 @@ enum cpg_message_req_types { MESSAGE_REQ_EXEC_CPG_PROCLEAVE = 1, MESSAGE_REQ_EXEC_CPG_JOINLIST = 2, MESSAGE_REQ_EXEC_CPG_MCAST = 3, - MESSAGE_REQ_EXEC_CPG_DOWNLIST = 4 + MESSAGE_REQ_EXEC_CPG_DOWNLIST_OLD = 4, + MESSAGE_REQ_EXEC_CPG_DOWNLIST = 5 }; /* @@ -127,6 +128,13 @@ enum cpg_sync_state { CPGSYNC_JOINLIST }; +enum cpg_downlist_state_e { + CPG_DOWNLIST_NONE, + CPG_DOWNLIST_WAITING_FOR_MESSAGES, + CPG_DOWNLIST_APPLYING, +}; +static enum cpg_downlist_state_e downlist_state; +static struct list_head downlist_messages_head; struct cpg_pd { void *conn; @@ -202,6 +210,10 @@ static void message_handler_req_exec_cpg_mcast ( const void *message, unsigned int nodeid); +static void message_handler_req_exec_cpg_downlist_old ( + const void *message, + unsigned int nodeid); + static void message_handler_req_exec_cpg_downlist ( const void *message, unsigned int nodeid); @@ -212,6 +224,8 @@ static 
void exec_cpg_joinlist_endian_convert (void *msg); static void exec_cpg_mcast_endian_convert (void *msg); +static void exec_cpg_downlist_endian_convert_old (void *msg); + static void exec_cpg_downlist_endian_convert (void *msg); static void message_handler_req_lib_cpg_join (void *conn, const void *message); @@ -326,6 +340,10 @@ static struct corosync_exec_handler cpg_exec_engine[] = .exec_endian_convert_fn = exec_cpg_mcast_endian_convert }, { /* 4 */ + .exec_handler_fn = message_handler_req_exec_cpg_downlist_old, + .exec_endian_convert_fn = exec_cpg_downlist_endian_convert_old + }, + { /* 5 */ .exec_handler_fn = message_handler_req_exec_cpg_downlist, .exec_endian_convert_fn = exec_cpg_downlist_endian_convert }, @@ -415,34 +433,30 @@ struct req_exec_cpg_mcast { mar_uint8_t message[] __attribute__((aligned(8))); }; -struct req_exec_cpg_downlist { +struct req_exec_cpg_downlist_old { coroipc_request_header_t header __attribute__((aligned(8))); mar_uint32_t left_nodes __attribute__((aligned(8))); mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); }; -static struct req_exec_cpg_downlist g_req_exec_cpg_downlist; +struct req_exec_cpg_downlist { + coroipc_request_header_t header __attribute__((aligned(8))); + /* merge decisions */ + mar_uint32_t old_members __attribute__((aligned(8))); + /* downlist below */ + mar_uint32_t left_nodes __attribute__((aligned(8))); + mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); +}; -static int memb_list_remove_value (unsigned int *list, - size_t list_entries, int value) -{ - int j; - int found = 0; +struct downlist_msg { + mar_uint32_t sender_nodeid; + mar_uint32_t old_members __attribute__((aligned(8))); + mar_uint32_t left_nodes __attribute__((aligned(8))); + mar_uint32_t nodeids[PROCESSOR_COUNT_MAX] __attribute__((aligned(8))); + struct list_head list; +}; - for (j = 0; j < list_entries; j++) { - if (list[j] == value) { - /* mark next values to be copied down */ - found = 1; - } - else if 
(found) { - list[j-1] = list[j]; - } - } - if (found) - return (list_entries - 1); - else - return list_entries; -} +static struct req_exec_cpg_downlist g_req_exec_cpg_downlist; static void cpg_sync_init_v2 ( const unsigned int *trans_list, @@ -451,7 +465,6 @@ static void cpg_sync_init_v2 ( size_t member_list_entries, const struct memb_ring_id *ring_id) { - unsigned int lowest_nodeid = 0xffffffff; int entries; int i, j; int found; @@ -465,6 +478,11 @@ static void cpg_sync_init_v2 ( last_sync_ring_id.nodeid = ring_id->rep.nodeid; last_sync_ring_id.seq = ring_id->seq; + downlist_state = CPG_DOWNLIST_WAITING_FOR_MESSAGES; + entries = 0; + /* + * Determine list of nodeids for downlist message + */ for (i = 0; i < my_old_member_list_entries; i++) { found = 0; for (j = 0; j < trans_list_entries; j++) { @@ -474,35 +492,8 @@ static void cpg_sync_init_v2 ( } } if (found == 0) { - my_member_list_entries = memb_list_remove_value ( - my_member_list, my_member_list_entries, - my_old_member_list[i]); - } - } - - for (i = 0; i < my_member_list_entries; i++) { - if (my_member_list[i] < lowest_nodeid) { - lowest_nodeid = my_member_list[i]; - } - } - - entries = 0; - if (lowest_nodeid == api->totem_nodeid_get()) { - /* - * Determine list of nodeids for downlist message - */ - for (i = 0; i < my_old_member_list_entries; i++) { - found = 0; - for (j = 0; j < trans_list_entries; j++) { - if (my_old_member_list[i] == trans_list[j]) { - found = 1; - break; - } - } - if (found == 0) { - g_req_exec_cpg_downlist.nodeids[entries++] = - my_old_member_list[i]; - } + g_req_exec_cpg_downlist.nodeids[entries++] = + my_old_member_list[i]; } } g_req_exec_cpg_downlist.left_nodes = entries; @@ -525,17 +516,69 @@ static int cpg_sync_process (void) return (res); } +static struct downlist_msg* downlist_master_choose (void) +{ + struct downlist_msg *cmp; + struct downlist_msg *best = NULL; + struct list_head *iter; + + for (iter = downlist_messages_head.next; + iter != &downlist_messages_head; + iter = 
iter->next) { + + cmp = list_entry(iter, struct downlist_msg, list); + if (best == NULL) { + best = cmp; + continue; + } + + if (cmp->old_members < best->old_members) { + continue; + } + else if (cmp->old_members > best->old_members) { + best = cmp; + } + else if (cmp->sender_nodeid < best->sender_nodeid) { + best = cmp; + } + + } + return best; +} + +static void downlist_messages_delete (void) +{ + struct downlist_msg *stored_msg; + struct list_head *iter, *iter_next; + + for (iter = downlist_messages_head.next; + iter != &downlist_messages_head; + iter = iter_next) { + + iter_next = iter->next; + + stored_msg = list_entry(iter, struct downlist_msg, list); + list_del (&stored_msg->list); + free (stored_msg); + } +} + static void cpg_sync_activate (void) { memcpy (my_old_member_list, my_member_list, my_member_list_entries * sizeof (unsigned int)); my_old_member_list_entries = my_member_list_entries; + downlist_messages_delete (); + downlist_state = CPG_DOWNLIST_NONE; + notify_lib_totem_membership (NULL, my_member_list_entries, my_member_list); } static void cpg_sync_abort (void) { + downlist_messages_delete (); + downlist_state = CPG_DOWNLIST_NONE; } static int notify_lib_totem_membership ( @@ -711,6 +754,7 @@ static int cpg_exec_init_fn (struct corosync_api_v1 *corosync_api) #ifdef COROSYNC_SOLARIS logsys_subsys_init(); #endif + list_init (&downlist_messages_head); api = corosync_api; return (0); } @@ -817,12 +861,17 @@ static void exec_cpg_joinlist_endian_convert (void *msg_v) } } +static void exec_cpg_downlist_endian_convert_old (void *msg) +{ +} + static void exec_cpg_downlist_endian_convert (void *msg) { struct req_exec_cpg_downlist *req_exec_cpg_downlist = msg; unsigned int i; req_exec_cpg_downlist->left_nodes = swab32(req_exec_cpg_downlist->left_nodes); + req_exec_cpg_downlist->old_members = swab32(req_exec_cpg_downlist->old_members); for (i = 0; i < req_exec_cpg_downlist->left_nodes; i++) { req_exec_cpg_downlist->nodeids[i] = 
swab32(req_exec_cpg_downlist->nodeids[i]); @@ -908,7 +957,13 @@ static void do_proc_join( MESSAGE_RES_CPG_CONFCHG_CALLBACK); } -static void message_handler_req_exec_cpg_downlist ( +static void message_handler_req_exec_cpg_downlist_old ( + const void *message, + unsigned int nodeid) +{ +} + +static void message_handler_req_exec_cpg_downlist( const void *message, unsigned int nodeid) { @@ -916,27 +971,61 @@ static void message_handler_req_exec_cpg_downlist ( int i; mar_cpg_address_t left_list[1]; struct list_head *iter; + struct downlist_msg *stored_msg; + int found; - /* - FOR OPTIMALIZATION - Make list of lists - */ + if (downlist_state != CPG_DOWNLIST_WAITING_FOR_MESSAGES) { + log_printf (LOGSYS_LEVEL_WARNING, "downlist left_list: %d received in state %d", + req_exec_cpg_downlist->left_nodes, downlist_state); + return; + } + + stored_msg = malloc (sizeof (struct downlist_msg)); + stored_msg->sender_nodeid = nodeid; + stored_msg->old_members = req_exec_cpg_downlist->old_members; + stored_msg->left_nodes = req_exec_cpg_downlist->left_nodes; + memcpy (stored_msg->nodeids, req_exec_cpg_downlist->nodeids, + req_exec_cpg_downlist->left_nodes * sizeof (mar_uint32_t)); + list_init (&stored_msg->list); + list_add (&stored_msg->list, &downlist_messages_head); - log_printf (LOGSYS_LEVEL_DEBUG, "downlist left_list: %d\n", req_exec_cpg_downlist->left_nodes); + for (i = 0; i < my_member_list_entries; i++) { + found = 0; + for (iter = downlist_messages_head.next; + iter != &downlist_messages_head; + iter = iter->next) { + stored_msg = list_entry(iter, struct downlist_msg, list); + if (my_member_list[i] == stored_msg->sender_nodeid) { + found = 1; + } + } + if (!found) { + return; + } + } + + downlist_state = CPG_DOWNLIST_APPLYING; + stored_msg = downlist_master_choose (); + + log_printf (LOGSYS_LEVEL_DEBUG, "chosen downlist from node %s", + api->totem_ifaces_print(stored_msg->sender_nodeid)); + + /* send events */ for (iter = process_info_list_head.next; iter != 
&process_info_list_head; ) { struct process_info *pi = list_entry(iter, struct process_info, list); iter = iter->next; - for (i = 0; i < req_exec_cpg_downlist->left_nodes; i++) { - if (pi->nodeid == req_exec_cpg_downlist->nodeids[i]) { + for (i = 0; i < stored_msg->left_nodes; i++) { + if (pi->nodeid == stored_msg->nodeids[i]) { left_list[0].nodeid = pi->nodeid; left_list[0].pid = pi->pid; left_list[0].reason = CONFCHG_CPG_REASON_NODEDOWN; notify_lib_joinlist(&pi->group, NULL, - 0, NULL, - 1, left_list, - MESSAGE_RES_CPG_CONFCHG_CALLBACK); + 0, NULL, + 1, left_list, + MESSAGE_RES_CPG_CONFCHG_CALLBACK); list_del (&pi->list); free (pi); break; @@ -1081,6 +1170,8 @@ static int cpg_exec_send_downlist(void) g_req_exec_cpg_downlist.header.id = SERVICE_ID_MAKE(CPG_SERVICE, MESSAGE_REQ_EXEC_CPG_DOWNLIST); g_req_exec_cpg_downlist.header.size = sizeof(struct req_exec_cpg_downlist); + g_req_exec_cpg_downlist.old_members = my_old_member_list_entries; + iov.iov_base = (void *)&g_req_exec_cpg_downlist; iov.iov_len = g_req_exec_cpg_downlist.header.size; -- 1.6.6.1 _______________________________________________ Openais mailing list Openais@lists.linux-foundation.org https://lists.linux-foundation.org/mailman/listinfo/openais