Gehel has submitted this change and it was merged.

Change subject: Improve robustness of es-tool
......................................................................


Improve robustness of es-tool

Make set_allocation_state retry up to REPLICATION_ENABLE_ATTEMPTS times
before failing definitively, and add --timeout and --master_timeout
arguments that are passed to the Elasticsearch cluster calls.

Bug: T128786
Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
---
M modules/elasticsearch/files/es-tool
1 file changed, 28 insertions(+), 20 deletions(-)

Approvals:
  Gehel: Looks good to me, approved
  Nicko: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/modules/elasticsearch/files/es-tool b/modules/elasticsearch/files/es-tool
index 7138433..b2469f7 100755
--- a/modules/elasticsearch/files/es-tool
+++ b/modules/elasticsearch/files/es-tool
@@ -31,12 +31,14 @@
 # Helper functions go here
 def cluster_health():
     es = Elasticsearch(args.server)
-    return es.cluster.health()["status"]
+    return es.cluster.health(master_timeout=args.master_timeout,
+                             timeout=args.timeout)["status"]
 
 
 def cluster_status(columns=None):
     es = Elasticsearch(args.server)
-    cluster_health = es.cluster.health()
+    cluster_health = es.cluster.health(master_timeout=args.master_timeout,
+                                       timeout=args.timeout)
     if columns is None:
         columns = sorted(cluster_health)
     values = [cluster_health[x] for x in columns]
@@ -51,6 +53,8 @@
 def set_setting(setting, value, settingtype="transient"):
         es = Elasticsearch(args.server)
         res = es.cluster.put_settings(
+            timeout=args.timeout,
+            master_timeout=args.master_timeout,
             body={
                 settingtype: {
                     setting: value
@@ -64,7 +68,15 @@
 
 
 def set_allocation_state(status):
-    return set_setting("cluster.routing.allocation.enable", status)
+    for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
+        try:
+            if set_setting("cluster.routing.allocation.enable", status):
+                return True
+        except:
+            time.sleep(3)
+            print "failed! -- retrying (%d/%d)" % (attempt,
+                                                   REPLICATION_ENABLE_ATTEMPTS)
+    return False
 
 
 def set_banned_nodes(nodelist, node_type):
@@ -74,7 +86,8 @@
 
 def get_banned_nodes(node_type):
     es = Elasticsearch(args.server)
-    res = es.cluster.get_settings()
+    res = es.cluster.get_settings(master_timeout=args.master_timeout,
+                                  timeout=args.timeout)
     try:
         bannedstr = res["transient"]["cluster"]["routing"]["allocation"][
             "exclude"][node_type]
@@ -221,23 +234,13 @@
 
     # Turn replication back on so things will recover fully
     printu("Enabling all replication...")
-    for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
-        try:
-            if not set_allocation_state("all"):
-                print "failed! -- You will still need to enable replication",
-                print "again with `es-tool start-replication`"
-                return os.EX_UNAVAILABLE
-            else:
-                break
-        except ConnectionError:
-            print "failed! -- retrying (%d/%d)" % (attempt,
-                                                   REPLICATION_ENABLE_ATTEMPTS)
-            time.sleep(3)
-    else:
-        print "failed! -- You will still need to enable replication again",
-        print "with `es-tool start-replication`"
+
+    if not set_allocation_state("all"):
+        print "failed! -- You will still need to enable replication",
+        print "again with `es-tool start-replication`"
         return os.EX_UNAVAILABLE
-    printu("ok\n")
+    else:
+        printu("ok\n")
 
     # Wait a bit
     time.sleep(5)
@@ -345,6 +348,11 @@
                     help="IP address or hostname, used by (un)ban-node")
 parser.add_argument("--server", metavar='S', type=str, default="localhost",
                     help="Server to work on, default localhost")
+parser.add_argument("--timeout", metavar='T', type=int, default=10,
+                    help="Timeout (in second), default 10")
+parser.add_argument("--master_timeout", metavar='MT', type=int, default=30,
+                    help="Timeout to connect to the master node (in second), "
+                         "default 30")
 args = parser.parse_args()
 
 try:

-- 
To view, visit https://gerrit.wikimedia.org/r/282472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
Gerrit-PatchSet: 4
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Adedommelin <adedomme...@tuxz.net>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Nicko <nicko.gla...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to