[MediaWiki-commits] [Gerrit] Improve robustness of es-tool - change (operations/puppet)

2016-04-25 Thread Gehel (Code Review)
Gehel has submitted this change and it was merged.

Change subject: Improve robustness of es-tool
..


Improve robustness of es-tool

Make set_allocation_state retry up to REPLICATION_ENABLE_ATTEMPTS times
before failing definitively, and add timeout arguments (--timeout,
--master_timeout) that are passed to the Elasticsearch cluster calls.

Bug: T128786
Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
---
M modules/elasticsearch/files/es-tool
1 file changed, 28 insertions(+), 20 deletions(-)

Approvals:
  Gehel: Looks good to me, approved
  Nicko: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/modules/elasticsearch/files/es-tool b/modules/elasticsearch/files/es-tool
index 7138433..b2469f7 100755
--- a/modules/elasticsearch/files/es-tool
+++ b/modules/elasticsearch/files/es-tool
@@ -31,12 +31,14 @@
 # Helper functions go here
 def cluster_health():
 es = Elasticsearch(args.server)
-return es.cluster.health()["status"]
+return es.cluster.health(master_timeout=args.master_timeout,
+ timeout=args.timeout)["status"]
 
 
 def cluster_status(columns=None):
 es = Elasticsearch(args.server)
-cluster_health = es.cluster.health()
+cluster_health = es.cluster.health(master_timeout=args.master_timeout,
+   timeout=args.timeout)
 if columns is None:
 columns = sorted(cluster_health)
 values = [cluster_health[x] for x in columns]
@@ -51,6 +53,8 @@
 def set_setting(setting, value, settingtype="transient"):
 es = Elasticsearch(args.server)
 res = es.cluster.put_settings(
+timeout=args.timeout,
+master_timeout=args.master_timeout,
 body={
 settingtype: {
 setting: value
@@ -64,7 +68,15 @@
 
 
 def set_allocation_state(status):
-return set_setting("cluster.routing.allocation.enable", status)
+for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
+try:
+if set_setting("cluster.routing.allocation.enable", status):
+return True
+except:
+time.sleep(3)
+print "failed! -- retrying (%d/%d)" % (attempt,
+   REPLICATION_ENABLE_ATTEMPTS)
+return False
 
 
 def set_banned_nodes(nodelist, node_type):
@@ -74,7 +86,8 @@
 
 def get_banned_nodes(node_type):
 es = Elasticsearch(args.server)
-res = es.cluster.get_settings()
+res = es.cluster.get_settings(master_timeout=args.master_timeout,
+  timeout=args.timeout)
 try:
 bannedstr = res["transient"]["cluster"]["routing"]["allocation"][
 "exclude"][node_type]
@@ -221,23 +234,13 @@
 
 # Turn replication back on so things will recover fully
 printu("Enabling all replication...")
-for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
-try:
-if not set_allocation_state("all"):
-print "failed! -- You will still need to enable replication",
-print "again with `es-tool start-replication`"
-return os.EX_UNAVAILABLE
-else:
-break
-except ConnectionError:
-print "failed! -- retrying (%d/%d)" % (attempt,
-   REPLICATION_ENABLE_ATTEMPTS)
-time.sleep(3)
-else:
-print "failed! -- You will still need to enable replication again",
-print "with `es-tool start-replication`"
+
+if not set_allocation_state("all"):
+print "failed! -- You will still need to enable replication",
+print "again with `es-tool start-replication`"
 return os.EX_UNAVAILABLE
-printu("ok\n")
+else:
+printu("ok\n")
 
 # Wait a bit
 time.sleep(5)
@@ -345,6 +348,11 @@
 help="IP address or hostname, used by (un)ban-node")
 parser.add_argument("--server", metavar='S', type=str, default="localhost",
 help="Server to work on, default localhost")
+parser.add_argument("--timeout", metavar='T', type=int, default=10,
+help="Timeout (in second), default 10")
+parser.add_argument("--master_timeout", metavar='MT', type=int, default=30,
+help="Timeout to connect to the master node (in second), "
+ "default 30")
 args = parser.parse_args()
 
 try:

-- 
To view, visit https://gerrit.wikimedia.org/r/282472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
Gerrit-PatchSet: 4
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Adedommelin 
Gerrit-Reviewer: DCausse 
Gerrit-Reviewer: EBernhardson 
Gerrit-Reviewer: Gehel 
Gerrit-Reviewer: Nicko 

[MediaWiki-commits] [Gerrit] Improve robustness of es-tool - change (operations/puppet)

2016-04-09 Thread Adedommelin (Code Review)
Adedommelin has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/282472

Change subject: Improve robustness of es-tool
..

Improve robustness of es-tool

Make set_allocation_state retry up to REPLICATION_ENABLE_ATTEMPTS times
before failing definitively.

Bug: T128786
Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
---
M modules/elasticsearch/files/es-tool
1 file changed, 15 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/72/282472/1

diff --git a/modules/elasticsearch/files/es-tool b/modules/elasticsearch/files/es-tool
index 7138433..889ee18 100755
--- a/modules/elasticsearch/files/es-tool
+++ b/modules/elasticsearch/files/es-tool
@@ -64,7 +64,15 @@
 
 
 def set_allocation_state(status):
-return set_setting("cluster.routing.allocation.enable", status)
+for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
+try:
+if set_setting("cluster.routing.allocation.enable", status):
+return True
+except:
+time.sleep(3)
+print "failed! -- retrying (%d/%d)" % (attempt,
+   REPLICATION_ENABLE_ATTEMPTS)
+return False
 
 
 def set_banned_nodes(nodelist, node_type):
@@ -221,23 +229,13 @@
 
 # Turn replication back on so things will recover fully
 printu("Enabling all replication...")
-for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
-try:
-if not set_allocation_state("all"):
-print "failed! -- You will still need to enable replication",
-print "again with `es-tool start-replication`"
-return os.EX_UNAVAILABLE
-else:
-break
-except ConnectionError:
-print "failed! -- retrying (%d/%d)" % (attempt,
-   REPLICATION_ENABLE_ATTEMPTS)
-time.sleep(3)
-else:
-print "failed! -- You will still need to enable replication again",
-print "with `es-tool start-replication`"
+
+if not set_allocation_state("all"):
+print "failed! -- You will still need to enable replication",
+print "again with `es-tool start-replication`"
 return os.EX_UNAVAILABLE
-printu("ok\n")
+else:
+printu("ok\n")
 
 # Wait a bit
 time.sleep(5)

-- 
To view, visit https://gerrit.wikimedia.org/r/282472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Adedommelin 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits