Better defensive programming against invalid join messages.

We have a cluster which got into a state where the protocol mismatched,
and the malloc metadata in the heap got corrupted. This patch would
detect the potential heap corruption and make a version mismatch
more easily detectable. It is also good defensive programming
against invalid join messages.

Signed-off-by: Shevek <[email protected]>

---
 sheep/cluster.h           |    2 +-
 sheep/cluster/accord.c    |    2 +-
 sheep/cluster/corosync.c  |    2 +-
 sheep/cluster/local.c     |    2 +-
 sheep/cluster/zookeeper.c |    2 +-
 sheep/group.c             |   24 +++++++++++++++++-------
 6 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/sheep/cluster.h b/sheep/cluster.h
index d543e99..759d0eb 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -190,6 +190,6 @@ void sd_leave_handler(struct sd_node *left, struct
sd_node *members,
                size_t nr_members);
 void sd_notify_handler(struct sd_node *sender, void *msg, size_t
msg_len);
 enum cluster_join_result sd_check_join_cb(struct sd_node *joining,
-               void *opaque);
+               void *opaque, size_t opaque_len);
 
 #endif
diff --git a/sheep/cluster/accord.c b/sheep/cluster/accord.c
index 1fdca91..dad1c2f 100644
--- a/sheep/cluster/accord.c
+++ b/sheep/cluster/accord.c
@@ -576,7 +576,7 @@ static int accord_dispatch(void)
        case EVENT_JOIN:
                if (ev.blocked) {
                        if (node_cmp(&ev.nodes[0], &this_node) == 0) {
-                               res = sd_check_join_cb(&ev.sender, ev.buf);
+                               res = sd_check_join_cb(&ev.sender, ev.buf, 
ev.buf_len);
                                ev.join_result = res;
                                ev.blocked = 0;
 
diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
index 4a588e9..217d060 100644
--- a/sheep/cluster/corosync.c
+++ b/sheep/cluster/corosync.c
@@ -296,7 +296,7 @@ static int __corosync_dispatch_one(struct
corosync_event *cevent)
                                return 0;
 
                        res = sd_check_join_cb(&cevent->sender.ent,
-                                                    cevent->msg);
+                                                    cevent->msg, 
cevent->msg_len);
                        if (res == CJ_RES_MASTER_TRANSFER)
                                nr_cpg_nodes = 0;
 
diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
index fd84615..3d75e68 100644
--- a/sheep/cluster/local.c
+++ b/sheep/cluster/local.c
@@ -404,7 +404,7 @@ static int local_dispatch(void)
        case EVENT_JOIN:
                if (ev->blocked) {
                        if (node_cmp(&ev->nodes[0], &this_node) == 0) {
-                               res = sd_check_join_cb(&ev->sender, ev->buf);
+                               res = sd_check_join_cb(&ev->sender, ev->buf, 
ev->buf_len);
                                ev->join_result = res;
                                ev->blocked = 0;
                                msync(ev, sizeof(*ev), MS_SYNC);
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 491056a..335c261 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -777,7 +777,7 @@ static int zk_dispatch(void)
                        dprintf("one sheep joined[up], nr_nodes:%ld, sender:%s, 
joined:%d
\n",
                                        nr_zk_nodes, 
node_to_str(&ev.sender.node), ev.sender.joined);
                        if (is_master(zhandle, &this_node)) {
-                               res = sd_check_join_cb(&ev.sender.node, ev.buf);
+                               res = sd_check_join_cb(&ev.sender.node, ev.buf, 
ev.buf_len);
                                ev.join_result = res;
                                ev.blocked = 0;
                                ev.sender.joined = 1;
diff --git a/sheep/group.c b/sheep/group.c
index c7fd387..1017437 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -449,12 +449,6 @@ out:
 
 static void join(struct sd_node *joining, struct join_message *msg)
 {
-       if (msg->proto_ver != SD_SHEEP_PROTO_VER) {
-               eprintf("joining node sent a message with the wrong protocol 
version
\n");
-               msg->result = SD_RES_VER_MISMATCH;
-               return;
-       }
-
        msg->result = get_cluster_status(joining, msg->nodes, msg->nr_nodes,
                                         msg->ctime, msg->epoch,
                                         &msg->cluster_status, &msg->inc_epoch);
@@ -735,11 +729,27 @@ static void __sd_leave(struct event_struct
*cevent)
        }
 }
 
-enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void
*opaque)
+enum cluster_join_result sd_check_join_cb(struct sd_node *joining, void
*opaque, size_t opaque_len)
 {
        struct join_message *jm = opaque;
        struct node *node;
 
+       if (opaque_len < sizeof(*jm)) {
+               eprintf("joining node sent a message which is too short\n");
+               // msg->result = SD_RES_VER_MISMATCH;   // This may segfault 
anyway if
msg_len == 0
+               return CJ_RES_FAIL;
+       }
+       if (opaque_len != get_join_message_size(jm)) {
+               eprintf("joining node sent a message size %zu, expected %zu\n",
opaque_len, get_join_message_size(jm));
+               jm->result = SD_RES_VER_MISMATCH;
+               return CJ_RES_FAIL;
+       }
+       if (jm->proto_ver != SD_SHEEP_PROTO_VER) {
+               eprintf("joining node sent a message with the wrong protocol 
version
\n");
+               jm->result = SD_RES_VER_MISMATCH;
+               return CJ_RES_FAIL;
+       }
+
        if (node_cmp(joining, &sys->this_node) == 0) {
                struct sd_node entries[SD_MAX_NODES];
                int nr_entries;
-- 
1.7.5.4



-- 
sheepdog mailing list
[email protected]
http://lists.wpkg.org/mailman/listinfo/sheepdog

Reply via email to