In the event of a network partition that results in an etcd leader
change, the 'get' API in the bigger partition is unavailable for a
few seconds. As a result, the SC in the bigger partition cannot
promote and self-fences instead.
This patch adds etcd_tolerance_timeout so that the SC in the bigger
partition can retry the promotion. However, the SC in the smaller
partition shares the same etcd_tolerance_timeout retries, so the
timeout would also delay the self-fencing of the SC in the smaller
partition. The patch therefore checks the health of the SC's own etcd
endpoint, and the SC applies the etcd_tolerance_timeout retries only
while that endpoint is healthy.
---
src/osaf/consensus/plugins/etcd3.plugin | 44 +++++++++++++++----------
1 file changed, 26 insertions(+), 18 deletions(-)
diff --git a/src/osaf/consensus/plugins/etcd3.plugin
b/src/osaf/consensus/plugins/etcd3.plugin
index 6252eedcb..34a975e05 100644
--- a/src/osaf/consensus/plugins/etcd3.plugin
+++ b/src/osaf/consensus/plugins/etcd3.plugin
@@ -23,6 +23,7 @@ readonly directory="/opensaf/"
readonly etcd_options=""
readonly etcd_timeout="3s"
readonly heartbeat_interval=2
+readonly etcd_tolerance_timeout=6
export ETCDCTL_API=3
@@ -332,11 +333,10 @@ unlock() {
# non-zero - failure
watch() {
readonly watch_key="$1"
-
# get baseline
orig_value=$(get "$watch_key")
result=$?
-
+ tol_counter=0
if [ "$result" -le 1 ]; then
if [ "$result" -eq 0 ] && [ "$watch_key" == "$takeover_request" ]; then
state=$(echo $orig_value | awk '{print $4}')
@@ -353,25 +353,33 @@ watch() {
current_value=$(get "$watch_key")
result=$?
if [ "$result" -gt 1 ]; then
- # etcd down?
- if [ "$watch_key" == "$takeover_request" ]; then
- hostname=`cat $node_name_file`
- echo "$hostname SC-0 10000000 UNDEFINED"
- return 0
- else
- return 1
+ # etcd down?, check the healthiness of self endpoint
+ $(etcdctl endpoint health >/dev/null 2>&1)
+ is_healthy=$?
+ ((tol_counter=tol_counter+heartbeat_interval))
+ if [ $tol_counter -ge $etcd_tolerance_timeout ] || [ $is_healthy -ne 0
]; then
+ if [ "$watch_key" == "$takeover_request" ]; then
+ hostname=`cat $node_name_file`
+ echo "$hostname SC-0 10000000 UNDEFINED"
+ return 0
+ else
+ return 1
+ fi
fi
- elif [ "$orig_value" != "$current_value" ]; then
- if [ "$watch_key" == "$takeover_request" ]; then
- state=$(echo $orig_value | awk '{print $4}')
- if [ "$state" == "REJECTED" ] && [ -z "$current_value" ]; then
- # value is cleared after lease time, keep watching
- orig_value=""
- continue
+ else
+ tol_counter=0
+ if [ "$orig_value" != "$current_value" ]; then
+ if [ "$watch_key" == "$takeover_request" ]; then
+ state=$(echo $orig_value | awk '{print $4}')
+ if [ "$state" == "REJECTED" ] && [ -z "$current_value" ]; then
+ # value is cleared after lease time, keep watching
+ orig_value=""
+ continue
+ fi
fi
+ echo $current_value
+ return 0
fi
- echo $current_value
- return 0
fi
done
else
--
2.20.1
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel