Hi Gary,

Those variables e.g node_sync_window_closed have been used before headless sync complete. If there is a failover during the headless sync, the new active will start the headless sync again, so those variables have not been needed to checkpoint. But here the scenario happens in split brain, in which the new active is in separated network instead of coming from headless, so I guess we do need checkpoint it, but the checkpoint should be done after the headless sync ?

And the change in timer.h seems not much relates to this ticket?

Thanks

Minh

On 5/6/19 2:03 pm, Gary Lee wrote:
The 'lost' nodes will be rebooted, thus there is no need
to queue sync messages from these nodes.

In addition, node_sync_window_closed is not reliable as it's not
check pointed. We should remove all uses of it in another ticket?

Instead, check if the timer is running.
---
  src/amf/amfd/cb.h      |  2 ++
  src/amf/amfd/ndproc.cc | 30 ++++++++++++++++++++++--------
  src/amf/amfd/timer.h   | 12 ++++++------
  3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/src/amf/amfd/cb.h b/src/amf/amfd/cb.h
index 89cf15d..8902d78 100644
--- a/src/amf/amfd/cb.h
+++ b/src/amf/amfd/cb.h
@@ -237,6 +237,8 @@ typedef struct cl_cb_tag {
     */
    bool active_services_exist;
    bool all_nodes_synced;
+  // @todo this should be checkpointed to standby? otherwise
+  // after a controller failover, it will still be false?
    bool node_sync_window_closed;
/*
diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc
index 5f5cbcd..20008d9 100644
--- a/src/amf/amfd/ndproc.cc
+++ b/src/amf/amfd/ndproc.cc
@@ -345,19 +345,26 @@ void avd_nd_sisu_state_info_evh(AVD_CL_CB *cb, AVD_EVT 
*evt) {
        evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id,
        evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.msg_id);
- if (cb->node_sync_window_closed == false) {
+  const SaClmNodeIdT node_id =
+    evt->info.avnd_msg->msg_info.n2d_nd_sisu_state_info.node_id;
+
+  if (cb->failover_list.find(node_id) != cb->failover_list.end()) {
+    // ignore msg
+    LOG_WA("sisu_state_info messages received from lost node (%x)",
+           node_id);
+  } else if (cb->node_sync_tmr.is_active == true) {
      AVD_EVT_QUEUE *state_info_evt = new AVD_EVT_QUEUE();
      state_info_evt->evt = new AVD_EVT{};
      state_info_evt->evt->rcv_evt = evt->rcv_evt;
      state_info_evt->evt->info.avnd_msg = n2d_msg;
      cb->evt_queue.push(state_info_evt);
+    return;
    } else {
      LOG_WA(
-        "Ignore this sisu_state_info message since node sync window has 
closed");
-    avsv_dnd_msg_free(n2d_msg);
+      "Ignore this sisu_state_info message since node sync window has closed");
    }
- TRACE_LEAVE();
+  avsv_dnd_msg_free(n2d_msg);
  }
/*****************************************************************************
@@ -387,19 +394,26 @@ void avd_nd_compcsi_state_info_evh(AVD_CL_CB *cb, AVD_EVT 
*evt) {
        evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id,
        evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.msg_id);
- if (cb->node_sync_window_closed == false) {
+  const SaClmNodeIdT node_id =
+    evt->info.avnd_msg->msg_info.n2d_nd_csicomp_state_info.node_id;
+
+  if (cb->failover_list.find(node_id) != cb->failover_list.end()) {
+    // ignore msg
+    LOG_WA("compcsi_state_info messages received from lost node (%x)",
+           node_id);
+  } else if (cb->node_sync_tmr.is_active == true) {
      AVD_EVT_QUEUE *state_info_evt = new AVD_EVT_QUEUE();
      state_info_evt->evt = new AVD_EVT{};
      state_info_evt->evt->rcv_evt = evt->rcv_evt;
      state_info_evt->evt->info.avnd_msg = n2d_msg;
      cb->evt_queue.push(state_info_evt);
+    return;
    } else {
      LOG_WA(
-        "Ignore this compcsi_state_info message since node sync window has 
closed");
-    avsv_dnd_msg_free(n2d_msg);
+      "Ignore this compcsi_state_info message since node sync window has 
closed");
    }
- TRACE_LEAVE();
+  avsv_dnd_msg_free(n2d_msg);
  }
/**
diff --git a/src/amf/amfd/timer.h b/src/amf/amfd/timer.h
index 5316879..6db04c7 100644
--- a/src/amf/amfd/timer.h
+++ b/src/amf/amfd/timer.h
@@ -52,12 +52,12 @@ typedef enum avd_tmr_type {
/* AVD Timer definition */
  typedef struct avd_tmr_tag {
-  tmr_t tmr_id;
-  AVD_TMR_TYPE type;
-  SaClmNodeIdT node_id;
-  std::string spons_si_name;
-  std::string dep_si_name;
-  bool is_active;
+  tmr_t tmr_id{};
+  AVD_TMR_TYPE type{AVD_TMR_MAX};
+  SaClmNodeIdT node_id{};
+  std::string spons_si_name{};
+  std::string dep_si_name{};
+  bool is_active{};
  } AVD_TMR;
/* macro to start the cluster init timer. The cb structure


_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to