* increase timeout in etcd3 plugin
* reduce retry frequency on KV store. This can overload the
  KV store even more when the KV store is already timing out
* when 'watch key' returns, try to use the value returned instead of
  unnecessarily reading it again
---
 src/osaf/consensus/consensus.cc         | 53 ++++++++++++++++---------
 src/osaf/consensus/consensus.h          |  7 +++-
 src/osaf/consensus/plugins/etcd3.plugin |  2 +-
 src/rde/rded/rde_cb.h                   |  1 +
 src/rde/rded/rde_main.cc                |  7 +++-
 src/rde/rded/role.cc                    |  5 +++
 6 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/src/osaf/consensus/consensus.cc b/src/osaf/consensus/consensus.cc
index ac5573585..800b776e6 100644
--- a/src/osaf/consensus/consensus.cc
+++ b/src/osaf/consensus/consensus.cc
@@ -432,24 +432,17 @@ SaAisErrorT Consensus::WriteTakeoverResult(
   return rc;
 }
 
-SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
+SaAisErrorT Consensus::ParseTakeoverRequest(const std::string& request,
+                                            std::vector<std::string>& tokens) {
   TRACE_ENTER();
 
-  std::string request;
-  SaAisErrorT rc;
-
-  rc = KeyValue::Get(kTakeoverRequestKeyname, request);
-  if (rc != SA_AIS_OK) {
-    // it doesn't always exist, don't log an error
-    TRACE("Could not read takeover request (%d)", rc);
-    return SA_AIS_ERR_FAILED_OPERATION;
-  }
-
   if (request.empty() == true) {
     // on node shutdown, this could be empty
     return SA_AIS_ERR_UNAVAILABLE;
   }
 
+  TRACE("Found '%s'", request.c_str());
+
   tokens.clear();
   Split(request, tokens);
   if (tokens.size() != 4) {
@@ -457,12 +450,28 @@ SaAisErrorT 
Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
     return SA_AIS_ERR_LIBRARY;
   }
 
-  TRACE("Found '%s'", request.c_str());
   return SA_AIS_OK;
 }
 
+SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
+  TRACE_ENTER();
+
+  std::string request;
+  SaAisErrorT rc;
+
+  rc = KeyValue::Get(kTakeoverRequestKeyname, request);
+  if (rc != SA_AIS_OK) {
+    // it doesn't always exist, don't log an error
+    TRACE("Could not read takeover request (%d)", rc);
+    return SA_AIS_ERR_FAILED_OPERATION;
+  }
+
+  return ParseTakeoverRequest(request, tokens);
+}
+
 Consensus::TakeoverState Consensus::HandleTakeoverRequest(
-    const uint64_t cluster_size) {
+    const uint64_t cluster_size,
+    const std::string& request) {
   TRACE_ENTER();
 
   if (use_consensus_ == false) {
@@ -470,15 +479,21 @@ Consensus::TakeoverState Consensus::HandleTakeoverRequest(
   }
 
   SaAisErrorT rc;
-  uint32_t retries = 0;
   std::vector<std::string> tokens;
 
-  // get request from KV store
-  rc = ReadTakeoverRequest(tokens);
-  while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) {
-    ++retries;
-    std::this_thread::sleep_for(kSleepInterval);
+  if (request.empty() == true) {
+    LOG_NO("Empty takeover request from watch command. Read it again.");
+    // if the plugin did not return a value with 'watch',
+    // read request from KV store
+    uint32_t retries = 0;
     rc = ReadTakeoverRequest(tokens);
+    while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) {
+      ++retries;
+      std::this_thread::sleep_for(kSleepInterval);
+      rc = ReadTakeoverRequest(tokens);
+    }
+  } else {
+    rc = ParseTakeoverRequest(request, tokens);
   }
 
   if (rc != SA_AIS_OK) {
diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
index 865078349..fbcbb2037 100644
--- a/src/osaf/consensus/consensus.h
+++ b/src/osaf/consensus/consensus.h
@@ -76,14 +76,15 @@ class Consensus {
   const std::string TakeoverStateStr[4] = {"UNDEFINED", "NEW", "ACCEPTED",
                                            "REJECTED"};
 
-  TakeoverState HandleTakeoverRequest(const uint64_t cluster_size);
+  TakeoverState HandleTakeoverRequest(const uint64_t cluster_size,
+                                      const std::string& request);
 
  private:
   bool use_consensus_ = false;
   bool use_remote_fencing_ = false;
   const std::string kTestKeyname = "opensaf_write_test";
   const std::chrono::milliseconds kSleepInterval =
-      std::chrono::milliseconds(500);  // in ms
+      std::chrono::milliseconds(1000);  // in ms
   static constexpr uint32_t kLockTimeout = 0;  // lock is persistent by default
   static constexpr uint32_t kMaxTakeoverRetry = 20;
   static constexpr uint32_t kMaxRetry = 30;
@@ -95,6 +96,8 @@ class Consensus {
                                     const std::string& proposed_owner,
                                     const uint64_t cluster_size);
 
+  SaAisErrorT ParseTakeoverRequest(const std::string& request,
+                                   std::vector<std::string>& tokens);
   SaAisErrorT ReadTakeoverRequest(std::vector<std::string>& tokens);
 
   SaAisErrorT WriteTakeoverResult(const std::string& current_owner,
diff --git a/src/osaf/consensus/plugins/etcd3.plugin 
b/src/osaf/consensus/plugins/etcd3.plugin
index 1023ea08f..75ef8bb15 100644
--- a/src/osaf/consensus/plugins/etcd3.plugin
+++ b/src/osaf/consensus/plugins/etcd3.plugin
@@ -19,7 +19,7 @@
 readonly keyname="opensaf_consensus_lock"
 readonly directory="/opensaf/"
 readonly etcd_options=""
-readonly etcd_timeout="5s"
+readonly etcd_timeout="10s"
 
 export ETCDCTL_API=3
 
diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
index 877687341..d3f5a24a5 100644
--- a/src/rde/rded/rde_cb.h
+++ b/src/rde/rded/rde_cb.h
@@ -68,6 +68,7 @@ struct rde_msg {
   NODE_ID fr_node_id;
   union {
     rde_peer_info peer_info;
+    char* takeover_request = nullptr;
   } info;
 };
 
diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
index c59aa4536..e5813e414 100644
--- a/src/rde/rded/rde_main.cc
+++ b/src/rde/rded/rde_main.cc
@@ -161,13 +161,16 @@ static void handle_mbx_event() {
       rde_cb->monitor_takeover_req_thread_running = false;
 
       if (role->role() == PCS_RDA_ACTIVE) {
-        LOG_NO("Received takeover request. Our network size is %zu",
+        LOG_NO("Received takeover request '%s'. Our network size is %zu",
+                msg->info.takeover_request,
                rde_cb->cluster_members.size());
 
         Consensus consensus_service;
         Consensus::TakeoverState state =
             consensus_service.HandleTakeoverRequest(
-                rde_cb->cluster_members.size());
+                rde_cb->cluster_members.size(),
+                msg->info.takeover_request);
+        delete[] msg->info.takeover_request;
 
         if (state == Consensus::TakeoverState::ACCEPTED) {
           LOG_NO("Accepted takeover request");
diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
index a03372413..0567fdfcf 100644
--- a/src/rde/rded/role.cc
+++ b/src/rde/rded/role.cc
@@ -54,6 +54,11 @@ void Role::MonitorCallback(const std::string& key, const 
std::string& new_value,
     // don't send this to the main thread straight away, as it will
     // need some time to process topology changes.
     msg->type = RDE_MSG_TAKEOVER_REQUEST_CALLBACK;
+    size_t len = new_value.length() + 1;
+    msg->info.takeover_request = new char[len];
+    strncpy(msg->info.takeover_request, new_value.c_str(), len);
+    LOG_NO("Sending takeover request '%s' to main thread",
+          msg->info.takeover_request);
     std::this_thread::sleep_for(std::chrono::seconds(4));
   } else {
     msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK;
-- 
2.17.1


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to