Github user jiangxb1987 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/21898#discussion_r205960887
  
    --- Diff: core/src/main/scala/org/apache/spark/BarrierTaskContextImpl.scala ---
    @@ -39,8 +44,51 @@ private[spark] class BarrierTaskContextImpl(
           taskMemoryManager, localProperties, metricsSystem, taskMetrics)
         with BarrierTaskContext {
     
    -  // TODO SPARK-24817 implement global barrier.
    -  override def barrier(): Unit = {}
    +  private val barrierCoordinator: RpcEndpointRef = {
    +    val env = SparkEnv.get
    +    RpcUtils.makeDriverRef("barrierSync", env.conf, env.rpcEnv)
    +  }
    +
    +  private val timer = new Timer("Barrier task timer for barrier() calls.")
    +
    +  private var barrierEpoch = 0
    +
    +  private lazy val numTasks = localProperties.getProperty("numTasks", "0").toInt
    +
    +  override def barrier(): Unit = {
    +    logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt 
$stageAttemptNumber) has entered " +
    +      s"the global sync, current barrier epoch is $barrierEpoch.")
    +
    +    val startTime = System.currentTimeMillis()
    +    val timerTask = new TimerTask {
    +      override def run(): Unit = {
    +        logInfo(s"Task $taskAttemptId from Stage $stageId(Attempt 
$stageAttemptNumber) waiting " +
    +          s"under the global sync since $startTime, has been waiting for " 
+
    +          s"${(System.currentTimeMillis() - startTime) / 1000} seconds, 
current barrier epoch " +
    +          s"is $barrierEpoch.")
    +      }
    +    }
    +    // Log the update of global sync every 60 seconds.
    +    timer.schedule(timerTask, 60000, 60000)
    +
    +    try {
    +      barrierCoordinator.askSync[Unit](
    +        message = RequestToSync(numTasks, stageId, stageAttemptNumber, taskAttemptId, barrierEpoch),
    +        timeout = new RpcTimeout(31536000 /** = 3600 * 24 * 365 */ seconds, "barrierTimeout"))
    --- End diff ---
    
    I set a fixed timeout for the RPC intentionally, so that users get a
    SparkException thrown by the BarrierCoordinator instead of an
    RpcTimeoutException from the RPC framework.
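
    To illustrate the intent, here's a minimal sketch of the coordinator side
    (the names failPendingSyncs, pendingRequesters and syncTimeoutSecs are
    illustrative, not necessarily what this PR uses): when the coordinator's
    own sync timeout fires before every task has checked in, it fails each
    pending request with a SparkException, and the blocking askSync rethrows
    that on the task side long before the one-year RPC timeout could trigger.

        import org.apache.spark.SparkException
        import org.apache.spark.rpc.RpcCallContext

        // Illustrative only: fail every requester still blocked in barrier()
        // once the barrier sync times out on the coordinator side.
        def failPendingSyncs(
            pendingRequesters: Seq[RpcCallContext],
            syncTimeoutSecs: Long): Unit = {
          pendingRequesters.foreach { requester =>
            // sendFailure() makes askSync[Unit] on the task side throw this
            // SparkException rather than an RpcTimeoutException.
            requester.sendFailure(new SparkException(
              s"The coordinator didn't receive all barrier sync requests " +
                s"within $syncTimeoutSecs second(s)."))
          }
        }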

