Repository: spark Updated Branches: refs/heads/master f6bcd3e53 -> 8eb5609d8
[SPARK-22754][DEPLOY] Check whether spark.executor.heartbeatInterval bigger than spark.network.timeout or not ## What changes were proposed in this pull request? If spark.executor.heartbeatInterval is bigger than spark.network.timeout, it will almost always cause the exception below. `Job aborted due to stage failure: Task 4763 in stage 3.0 failed 4 times, most recent failure: Lost task 4763.3 in stage 3.0 (TID 22383, executor id: 4761, host: xxx): ExecutorLostFailure (executor 4761 exited caused by one of the running tasks) Reason: Executor heartbeat timed out after 154022 ms` Many users are not aware of this and may set spark.executor.heartbeatInterval incorrectly. This patch checks for this case when an application is submitted. ## How was this patch tested? Tested in a cluster. Author: zhoukang <zhoukang199...@gmail.com> Closes #19942 from caneGuy/zhoukang/check-heartbeat. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8eb5609d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8eb5609d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8eb5609d Branch: refs/heads/master Commit: 8eb5609d8d961e54aa1ed0632f15f5e570fa627a Parents: f6bcd3e Author: zhoukang <zhoukang199...@gmail.com> Authored: Wed Dec 13 11:47:33 2017 -0800 Committer: Marcelo Vanzin <van...@cloudera.com> Committed: Wed Dec 13 11:47:33 2017 -0800 ---------------------------------------------------------------------- core/src/main/scala/org/apache/spark/SparkConf.scala | 8 ++++++++ core/src/test/scala/org/apache/spark/SparkConfSuite.scala | 10 ++++++++++ 2 files changed, 18 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/8eb5609d/core/src/main/scala/org/apache/spark/SparkConf.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala 
b/core/src/main/scala/org/apache/spark/SparkConf.scala index 4b1286d..d77303e 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -564,6 +564,14 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging with Seria val encryptionEnabled = get(NETWORK_ENCRYPTION_ENABLED) || get(SASL_ENCRYPTION_ENABLED) require(!encryptionEnabled || get(NETWORK_AUTH_ENABLED), s"${NETWORK_AUTH_ENABLED.key} must be enabled when enabling encryption.") + + val executorTimeoutThreshold = getTimeAsSeconds("spark.network.timeout", "120s") + val executorHeartbeatInterval = getTimeAsSeconds("spark.executor.heartbeatInterval", "10s") + // If spark.executor.heartbeatInterval bigger than spark.network.timeout, + // it will almost always cause ExecutorLostFailure. See SPARK-22754. + require(executorTimeoutThreshold > executorHeartbeatInterval, "The value of " + + s"spark.network.timeout=${executorTimeoutThreshold}s must be no less than the value of " + + s"spark.executor.heartbeatInterval=${executorHeartbeatInterval}s.") } /** http://git-wip-us.apache.org/repos/asf/spark/blob/8eb5609d/core/src/test/scala/org/apache/spark/SparkConfSuite.scala ---------------------------------------------------------------------- diff --git a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala index c771eb4..bff808e 100644 --- a/core/src/test/scala/org/apache/spark/SparkConfSuite.scala +++ b/core/src/test/scala/org/apache/spark/SparkConfSuite.scala @@ -329,6 +329,16 @@ class SparkConfSuite extends SparkFunSuite with LocalSparkContext with ResetSyst conf.validateSettings() } + test("spark.network.timeout should bigger than spark.executor.heartbeatInterval") { + val conf = new SparkConf() + conf.validateSettings() + + conf.set("spark.network.timeout", "5s") + intercept[IllegalArgumentException] { + conf.validateSettings() + } + } + } class Class1 {} 
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org