Sometimes the etcd server takes a long time to handle a request, and the
"etcdctl get" command times out. Consensus then assumes that etcd may be down
and that the node needs to self-fence, even though the etcd server is still
handling the request.
The "etcdctl get" command in "watch" should be retried, and the dial timeout
should be increased.
---
src/osaf/consensus/plugins/etcd3.plugin | 119 ++++++++++++++++++++------------
1 file changed, 76 insertions(+), 43 deletions(-)
diff --git a/src/osaf/consensus/plugins/etcd3.plugin
b/src/osaf/consensus/plugins/etcd3.plugin
index d92688557..56f15a06a 100644
--- a/src/osaf/consensus/plugins/etcd3.plugin
+++ b/src/osaf/consensus/plugins/etcd3.plugin
@@ -21,7 +21,7 @@ readonly takeover_request="takeover_request"
readonly node_name_file="/etc/opensaf/node_name"
readonly directory="/opensaf/"
readonly etcd_options=""
-readonly etcd_timeout="3s"
+readonly etcd_timeout="5s"
readonly heartbeat_interval=2
export ETCDCTL_API=3
@@ -131,9 +131,13 @@ create_key() {
put \""$directory$key"\" \""$value"\" "$lease_param"
"
- output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
- if [[ "$output" == *"OK"* ]]; then
- return 0
+ if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
+ then
+ if [[ "$output" == *"OK"* ]]; then
+ return 0
+ fi
+ else
+ return 3
fi
if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout get
"$directory$key" | tail -n1)
@@ -227,9 +231,15 @@ lock() {
put \""$directory$keyname"\" \""$owner"\"
"
- output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
- if [[ "$output" == *"OK"* ]]; then
- return 0
+ if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
+ then
+ if [[ "$output" == *"OK"* ]]; then
+ return 0
+ fi
+ else
+ # Lost connectivity with etcd server
+ echo "$output"
+ return 2
fi
# key already exists, make sure it's empty
@@ -238,9 +248,15 @@ lock() {
put \""$directory$keyname"\" \""$owner"\"
"
- output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
- if [[ "$output" == *"OK"* ]]; then
- return 0
+ if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
+ then
+ if [[ "$output" == *"OK"* ]]; then
+ return 0
+ fi
+ else
+ # Lost connectivity with etcd server
+ echo "$output"
+ return 2
fi
current_owner=$(etcdctl $etcd_options --dial-timeout $etcd_timeout get
"$directory$keyname" | tail -n1)
@@ -294,9 +310,15 @@ unlock() {
put \""$directory$keyname"\" \"\"
"
- output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
- if [[ "$output" == *"OK"* ]]; then
- return 0
+ if output=$(etcdctl $etcd_options --dial-timeout $etcd_timeout txn <<<
"$transaction")
+ then
+ if [[ "$output" == *"OK"* ]]; then
+ return 0
+ fi
+ else
+ # Lost connectivity with etcd server
+ echo "$output"
+ return 2
fi
# failed! check we own the lock
@@ -332,41 +354,52 @@ unlock() {
# non-zero - failure
watch() {
readonly watch_key="$1"
-
- # get baseline
- orig_value=$(get "$watch_key")
- result=$?
-
- if [ "$result" -le "1" ]; then
- while true
- do
- sleep $heartbeat_interval
- current_value=$(get "$watch_key")
- result=$?
- if [ "$result" -gt "1" ]; then
- # etcd down?
- if [ "$watch_key" == "$takeover_request" ]; then
- hostname=`cat $node_name_file`
- echo "$hostname SC-0 10000000 UNDEFINED"
+ orig_value=0
+ num_tried=0
+
+ while true
+ do
+ # get baseline
+ output=$(get "$watch_key")
+ result=$?
+
+ if [ "$result" -le "1" ]; then
+ if [ "$orig_value" == "0" ]; then
+ orig_value=$output
+ fi
+ while true
+ do
+ sleep $heartbeat_interval
+ current_value=$(get "$watch_key")
+ result=$?
+ if [ "$result" -gt "1" ]; then
+ # Lost connectivity with etcd server. Try again
+ if [ "$watch_key" == "$takeover_request" ]; then
+ num_tried=1
+ break
+ else
+ return 1
+ fi
+ elif [ "$orig_value" != "$current_value" ]; then
+ echo $current_value
return 0
- else
- return 1
fi
- elif [ "$orig_value" != "$current_value" ]; then
- echo $current_value
+ done
+ else
+ # Lost connectivity with etcd server
+ num_tried=$((num_tried + 1))
+ if [ $num_tried -lt 2 ]; then
+ continue
+ fi
+ if [ "$watch_key" == "$takeover_request" ]; then
+ hostname=`cat $node_name_file`
+ echo "$hostname SC-0 10000000 UNDEFINED"
return 0
+ else
+ return 1
fi
- done
- else
- # etcd down?
- if [ "$watch_key" == "$takeover_request" ]; then
- hostname=`cat $node_name_file`
- echo "$hostname SC-0 10000000 UNDEFINED"
- return 0
- else
- return 1
fi
- fi
+ done
}
# argument parsing
--
2.15.1
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel