Gehel has submitted this change and it was merged.

Change subject: Improve robustness of es-tool
......................................................................

Improve robustness of es-tool

Make set_allocation_state try REPLICATION_ENABLE_ATTEMPTS times
before definitive failure + added timeout arguments to be passed.

Bug: T128786
Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
---
M modules/elasticsearch/files/es-tool
1 file changed, 28 insertions(+), 20 deletions(-)

Approvals:
  Gehel: Looks good to me, approved
  Nicko: Looks good to me, but someone else must approve
  jenkins-bot: Verified

diff --git a/modules/elasticsearch/files/es-tool b/modules/elasticsearch/files/es-tool
index 7138433..b2469f7 100755
--- a/modules/elasticsearch/files/es-tool
+++ b/modules/elasticsearch/files/es-tool
@@ -31,12 +31,14 @@
 # Helper functions go here
 def cluster_health():
     es = Elasticsearch(args.server)
-    return es.cluster.health()["status"]
+    return es.cluster.health(master_timeout=args.master_timeout,
+                             timeout=args.timeout)["status"]
 
 
 def cluster_status(columns=None):
     es = Elasticsearch(args.server)
-    cluster_health = es.cluster.health()
+    cluster_health = es.cluster.health(master_timeout=args.master_timeout,
+                                       timeout=args.timeout)
     if columns is None:
         columns = sorted(cluster_health)
     values = [cluster_health[x] for x in columns]
@@ -51,6 +53,8 @@
 def set_setting(setting, value, settingtype="transient"):
     es = Elasticsearch(args.server)
     res = es.cluster.put_settings(
+        timeout=args.timeout,
+        master_timeout=args.master_timeout,
         body={
             settingtype: {
                 setting: value
@@ -64,7 +68,15 @@
 
 
 def set_allocation_state(status):
-    return set_setting("cluster.routing.allocation.enable", status)
+    for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
+        try:
+            if set_setting("cluster.routing.allocation.enable", status):
+                return True
+        except:
+            time.sleep(3)
+            print "failed! -- retrying (%d/%d)" % (attempt,
+                                                   REPLICATION_ENABLE_ATTEMPTS)
+    return False
 
 
 def set_banned_nodes(nodelist, node_type):
@@ -74,7 +86,8 @@
 
 def get_banned_nodes(node_type):
     es = Elasticsearch(args.server)
-    res = es.cluster.get_settings()
+    res = es.cluster.get_settings(master_timeout=args.master_timeout,
+                                  timeout=args.timeout)
     try:
         bannedstr = res["transient"]["cluster"]["routing"]["allocation"][
             "exclude"][node_type]
@@ -221,23 +234,13 @@
 
     # Turn replication back on so things will recover fully
     printu("Enabling all replication...")
-    for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
-        try:
-            if not set_allocation_state("all"):
-                print "failed! -- You will still need to enable replication",
-                print "again with `es-tool start-replication`"
-                return os.EX_UNAVAILABLE
-            else:
-                break
-        except ConnectionError:
-            print "failed! -- retrying (%d/%d)" % (attempt,
-                                                   REPLICATION_ENABLE_ATTEMPTS)
-            time.sleep(3)
-    else:
-        print "failed! -- You will still need to enable replication again",
-        print "with `es-tool start-replication`"
+
+    if not set_allocation_state("all"):
+        print "failed! -- You will still need to enable replication",
+        print "again with `es-tool start-replication`"
         return os.EX_UNAVAILABLE
-    printu("ok\n")
+    else:
+        printu("ok\n")
 
     # Wait a bit
     time.sleep(5)
@@ -345,6 +348,11 @@
                     help="IP address or hostname, used by (un)ban-node")
 parser.add_argument("--server", metavar='S', type=str, default="localhost",
                     help="Server to work on, default localhost")
+parser.add_argument("--timeout", metavar='T', type=int, default=10,
+                    help="Timeout (in second), default 10")
+parser.add_argument("--master_timeout", metavar='MT', type=int, default=30,
+                    help="Timeout to connect to the master node (in second), "
+                         "default 30")
 args = parser.parse_args()
 
 try:

-- 
To view, visit https://gerrit.wikimedia.org/r/282472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
Gerrit-PatchSet: 4
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Adedommelin <adedomme...@tuxz.net>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Nicko <nicko.gla...@gmail.com>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits