Gehel has submitted this change and it was merged.
Change subject: Improve robustness of es-tool
..
Improve robustness of es-tool
Make set_allocation_state retry up to REPLICATION_ENABLE_ATTEMPTS times
before failing definitively, and add --timeout and --master_timeout
arguments that are passed on to the Elasticsearch cluster calls.
Bug: T128786
Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
---
M modules/elasticsearch/files/es-tool
1 file changed, 28 insertions(+), 20 deletions(-)
Approvals:
Gehel: Looks good to me, approved
Nicko: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/modules/elasticsearch/files/es-tool b/modules/elasticsearch/files/es-tool
index 7138433..b2469f7 100755
--- a/modules/elasticsearch/files/es-tool
+++ b/modules/elasticsearch/files/es-tool
@@ -31,12 +31,14 @@
# Helper functions go here
def cluster_health():
es = Elasticsearch(args.server)
-return es.cluster.health()["status"]
+return es.cluster.health(master_timeout=args.master_timeout,
+ timeout=args.timeout)["status"]
def cluster_status(columns=None):
es = Elasticsearch(args.server)
-cluster_health = es.cluster.health()
+cluster_health = es.cluster.health(master_timeout=args.master_timeout,
+ timeout=args.timeout)
if columns is None:
columns = sorted(cluster_health)
values = [cluster_health[x] for x in columns]
@@ -51,6 +53,8 @@
def set_setting(setting, value, settingtype="transient"):
es = Elasticsearch(args.server)
res = es.cluster.put_settings(
+timeout=args.timeout,
+master_timeout=args.master_timeout,
body={
settingtype: {
setting: value
@@ -64,7 +68,15 @@
def set_allocation_state(status):
-return set_setting("cluster.routing.allocation.enable", status)
+for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
+try:
+if set_setting("cluster.routing.allocation.enable", status):
+return True
+except:
+time.sleep(3)
+print "failed! -- retrying (%d/%d)" % (attempt,
+ REPLICATION_ENABLE_ATTEMPTS)
+return False
def set_banned_nodes(nodelist, node_type):
@@ -74,7 +86,8 @@
def get_banned_nodes(node_type):
es = Elasticsearch(args.server)
-res = es.cluster.get_settings()
+res = es.cluster.get_settings(master_timeout=args.master_timeout,
+ timeout=args.timeout)
try:
bannedstr = res["transient"]["cluster"]["routing"]["allocation"][
"exclude"][node_type]
@@ -221,23 +234,13 @@
# Turn replication back on so things will recover fully
printu("Enabling all replication...")
-for attempt in range(REPLICATION_ENABLE_ATTEMPTS):
-try:
-if not set_allocation_state("all"):
-print "failed! -- You will still need to enable replication",
-print "again with `es-tool start-replication`"
-return os.EX_UNAVAILABLE
-else:
-break
-except ConnectionError:
-print "failed! -- retrying (%d/%d)" % (attempt,
- REPLICATION_ENABLE_ATTEMPTS)
-time.sleep(3)
-else:
-print "failed! -- You will still need to enable replication again",
-print "with `es-tool start-replication`"
+
+if not set_allocation_state("all"):
+print "failed! -- You will still need to enable replication",
+print "again with `es-tool start-replication`"
return os.EX_UNAVAILABLE
-printu("ok\n")
+else:
+printu("ok\n")
# Wait a bit
time.sleep(5)
@@ -345,6 +348,11 @@
help="IP address or hostname, used by (un)ban-node")
parser.add_argument("--server", metavar='S', type=str, default="localhost",
help="Server to work on, default localhost")
+parser.add_argument("--timeout", metavar='T', type=int, default=10,
+help="Timeout (in second), default 10")
+parser.add_argument("--master_timeout", metavar='MT', type=int, default=30,
+help="Timeout to connect to the master node (in second), "
+ "default 30")
args = parser.parse_args()
try:
--
To view, visit https://gerrit.wikimedia.org/r/282472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ib5a6888a4b257be04ffc9a84b56477a88a22efd4
Gerrit-PatchSet: 4
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Adedommelin
Gerrit-Reviewer: DCausse
Gerrit-Reviewer: EBernhardson
Gerrit-Reviewer: Gehel
Gerrit-Reviewer: Nicko