Hi

I will push this next Monday if no one has comments.

Thanks
Gary

On 25/6/18, 1:10 pm, "Gary Lee" <gary....@dektech.com.au> wrote:

    * increase timeout in etcd3 plugin
    * reduce retry frequency on KV store. This can overload the
      KV store even more when the KV store is already timing out
    * when 'watch key' returns, try to use the value returned instead of
      unnecessarily reading it again
    ---
     src/osaf/consensus/consensus.cc         | 53 ++++++++++++++++---------
     src/osaf/consensus/consensus.h          |  7 +++-
     src/osaf/consensus/plugins/etcd3.plugin |  2 +-
     src/rde/rded/rde_cb.h                   |  1 +
     src/rde/rded/rde_main.cc                |  7 +++-
     src/rde/rded/role.cc                    |  5 +++
     6 files changed, 51 insertions(+), 24 deletions(-)
    
    diff --git a/src/osaf/consensus/consensus.cc 
b/src/osaf/consensus/consensus.cc
    index ac5573585..800b776e6 100644
    --- a/src/osaf/consensus/consensus.cc
    +++ b/src/osaf/consensus/consensus.cc
    @@ -432,24 +432,17 @@ SaAisErrorT Consensus::WriteTakeoverResult(
       return rc;
     }
     
    -SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& 
tokens) {
    +SaAisErrorT Consensus::ParseTakeoverRequest(const std::string& request,
    +                                            std::vector<std::string>& 
tokens) {
       TRACE_ENTER();
     
    -  std::string request;
    -  SaAisErrorT rc;
    -
    -  rc = KeyValue::Get(kTakeoverRequestKeyname, request);
    -  if (rc != SA_AIS_OK) {
    -    // it doesn't always exist, don't log an error
    -    TRACE("Could not read takeover request (%d)", rc);
    -    return SA_AIS_ERR_FAILED_OPERATION;
    -  }
    -
       if (request.empty() == true) {
         // on node shutdown, this could be empty
         return SA_AIS_ERR_UNAVAILABLE;
       }
     
    +  TRACE("Found '%s'", request.c_str());
    +
       tokens.clear();
       Split(request, tokens);
       if (tokens.size() != 4) {
    @@ -457,12 +450,28 @@ SaAisErrorT 
Consensus::ReadTakeoverRequest(std::vector<std::string>& tokens) {
         return SA_AIS_ERR_LIBRARY;
       }
     
    -  TRACE("Found '%s'", request.c_str());
       return SA_AIS_OK;
     }
     
    +SaAisErrorT Consensus::ReadTakeoverRequest(std::vector<std::string>& 
tokens) {
    +  TRACE_ENTER();
    +
    +  std::string request;
    +  SaAisErrorT rc;
    +
    +  rc = KeyValue::Get(kTakeoverRequestKeyname, request);
    +  if (rc != SA_AIS_OK) {
    +    // it doesn't always exist, don't log an error
    +    TRACE("Could not read takeover request (%d)", rc);
    +    return SA_AIS_ERR_FAILED_OPERATION;
    +  }
    +
    +  return ParseTakeoverRequest(request, tokens);
    +}
    +
     Consensus::TakeoverState Consensus::HandleTakeoverRequest(
    -    const uint64_t cluster_size) {
    +    const uint64_t cluster_size,
    +    const std::string& request) {
       TRACE_ENTER();
     
       if (use_consensus_ == false) {
    @@ -470,15 +479,21 @@ Consensus::TakeoverState 
Consensus::HandleTakeoverRequest(
       }
     
       SaAisErrorT rc;
    -  uint32_t retries = 0;
       std::vector<std::string> tokens;
     
    -  // get request from KV store
    -  rc = ReadTakeoverRequest(tokens);
    -  while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) {
    -    ++retries;
    -    std::this_thread::sleep_for(kSleepInterval);
    +  if (request.empty() == true) {
    +    LOG_NO("Empty takeover request from watch command. Read it again.");
    +    // if the plugin did not return a value with 'watch',
    +    // read request from KV store
    +    uint32_t retries = 0;
         rc = ReadTakeoverRequest(tokens);
    +    while (rc == SA_AIS_ERR_FAILED_OPERATION && retries < kMaxRetry) {
    +      ++retries;
    +      std::this_thread::sleep_for(kSleepInterval);
    +      rc = ReadTakeoverRequest(tokens);
    +    }
    +  } else {
    +    rc = ParseTakeoverRequest(request, tokens);
       }
     
       if (rc != SA_AIS_OK) {
    diff --git a/src/osaf/consensus/consensus.h b/src/osaf/consensus/consensus.h
    index 865078349..fbcbb2037 100644
    --- a/src/osaf/consensus/consensus.h
    +++ b/src/osaf/consensus/consensus.h
    @@ -76,14 +76,15 @@ class Consensus {
       const std::string TakeoverStateStr[4] = {"UNDEFINED", "NEW", "ACCEPTED",
                                                "REJECTED"};
     
    -  TakeoverState HandleTakeoverRequest(const uint64_t cluster_size);
    +  TakeoverState HandleTakeoverRequest(const uint64_t cluster_size,
    +                                      const std::string& request);
     
      private:
       bool use_consensus_ = false;
       bool use_remote_fencing_ = false;
       const std::string kTestKeyname = "opensaf_write_test";
       const std::chrono::milliseconds kSleepInterval =
    -      std::chrono::milliseconds(500);  // in ms
    +      std::chrono::milliseconds(1000);  // in ms
       static constexpr uint32_t kLockTimeout = 0;  // lock is persistent by 
default
       static constexpr uint32_t kMaxTakeoverRetry = 20;
       static constexpr uint32_t kMaxRetry = 30;
    @@ -95,6 +96,8 @@ class Consensus {
                                         const std::string& proposed_owner,
                                         const uint64_t cluster_size);
     
    +  SaAisErrorT ParseTakeoverRequest(const std::string& request,
    +                                   std::vector<std::string>& tokens);
       SaAisErrorT ReadTakeoverRequest(std::vector<std::string>& tokens);
     
       SaAisErrorT WriteTakeoverResult(const std::string& current_owner,
    diff --git a/src/osaf/consensus/plugins/etcd3.plugin 
b/src/osaf/consensus/plugins/etcd3.plugin
    index 1023ea08f..75ef8bb15 100644
    --- a/src/osaf/consensus/plugins/etcd3.plugin
    +++ b/src/osaf/consensus/plugins/etcd3.plugin
    @@ -19,7 +19,7 @@
     readonly keyname="opensaf_consensus_lock"
     readonly directory="/opensaf/"
     readonly etcd_options=""
    -readonly etcd_timeout="5s"
    +readonly etcd_timeout="10s"
     
     export ETCDCTL_API=3
     
    diff --git a/src/rde/rded/rde_cb.h b/src/rde/rded/rde_cb.h
    index 877687341..d3f5a24a5 100644
    --- a/src/rde/rded/rde_cb.h
    +++ b/src/rde/rded/rde_cb.h
    @@ -68,6 +68,7 @@ struct rde_msg {
       NODE_ID fr_node_id;
       union {
         rde_peer_info peer_info;
    +    char* takeover_request = nullptr;
       } info;
     };
     
    diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc
    index c59aa4536..e5813e414 100644
    --- a/src/rde/rded/rde_main.cc
    +++ b/src/rde/rded/rde_main.cc
    @@ -161,13 +161,16 @@ static void handle_mbx_event() {
           rde_cb->monitor_takeover_req_thread_running = false;
     
           if (role->role() == PCS_RDA_ACTIVE) {
    -        LOG_NO("Received takeover request. Our network size is %zu",
    +        LOG_NO("Received takeover request '%s'. Our network size is %zu",
    +                msg->info.takeover_request,
                    rde_cb->cluster_members.size());
     
             Consensus consensus_service;
             Consensus::TakeoverState state =
                 consensus_service.HandleTakeoverRequest(
    -                rde_cb->cluster_members.size());
    +                rde_cb->cluster_members.size(),
    +                msg->info.takeover_request);
    +        delete[] msg->info.takeover_request;
     
             if (state == Consensus::TakeoverState::ACCEPTED) {
               LOG_NO("Accepted takeover request");
    diff --git a/src/rde/rded/role.cc b/src/rde/rded/role.cc
    index a03372413..0567fdfcf 100644
    --- a/src/rde/rded/role.cc
    +++ b/src/rde/rded/role.cc
    @@ -54,6 +54,11 @@ void Role::MonitorCallback(const std::string& key, const 
std::string& new_value,
         // don't send this to the main thread straight away, as it will
         // need some time to process topology changes.
         msg->type = RDE_MSG_TAKEOVER_REQUEST_CALLBACK;
    +    size_t len = new_value.length() + 1;
    +    msg->info.takeover_request = new char[len];
    +    strncpy(msg->info.takeover_request, new_value.c_str(), len);
    +    LOG_NO("Sending takeover request '%s' to main thread",
    +          msg->info.takeover_request);
         std::this_thread::sleep_for(std::chrono::seconds(4));
       } else {
         msg->type = RDE_MSG_NEW_ACTIVE_CALLBACK;
    -- 
    2.17.1
    
    



------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to