Github user squito commented on a diff in the pull request:

    https://github.com/apache/spark/pull/16901#discussion_r101064021
  
    --- Diff: 
core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala ---
    @@ -2161,6 +2161,63 @@ class DAGSchedulerSuite extends SparkFunSuite with 
LocalSparkContext with Timeou
         }
       }
     
    +  test("After fetching failed, success of old attempt of stage should be 
taken as valid.") {
    +    // Create 3 RDDs with shuffle dependencies on each other: rddA <--- 
rddB <--- rddC
    +    val rddA = new MyRDD(sc, 2, Nil)
    +    val shuffleDepA = new ShuffleDependency(rddA, new HashPartitioner(2))
    +    val shuffleIdA = shuffleDepA.shuffleId
    +
    +    val rddB = new MyRDD(sc, 2, List(shuffleDepA))
    +    val shuffleDepB = new ShuffleDependency(rddB, new HashPartitioner(2))
    +
    +    val rddC = new MyRDD(sc, 2, List(shuffleDepB))
    +
    +    submit(rddC, Array(0, 1))
    +    assert(taskSets(0).stageId === 0 && taskSets(0).stageAttemptId === 0)
    +
    +    // Complete both tasks in rddA.
    +    complete(taskSets(0), Seq(
    +      (Success, makeMapStatus("hostA", 2)),
    +      (Success, makeMapStatus("hostA", 2))))
    +
    +    // Fetch failed on hostA for task(partitionId=0) and 
task(partitionId=1) is still running.
    +    runEvent(makeCompletionEvent(
    +      taskSets(1).tasks(0),
    +      FetchFailed(makeBlockManagerId("hostA"), shuffleIdA, 0, 0,
    +        "Fetch failure of task: stageId=1, stageAttempt=0, partitionId=0"),
    +      result = null))
    +
    +    // Both original tasks in rddA should be marked as failed, because 
they ran on the
    +    // failed hostA, so both should be resubmitted. Complete them 
successfully.
    +    scheduler.resubmitFailedStages()
    +    assert(taskSets(2).stageId === 0 && taskSets(2).stageAttemptId === 1
    +      && taskSets(2).tasks.size === 2)
    +    complete(taskSets(2), Seq(
    +      (Success, makeMapStatus("hostB", 2)),
    +      (Success, makeMapStatus("hostB", 2))))
    +
    +    // Both tasks in rddB should be resubmitted, because none of them has 
succeeded.
    +    // Complete the task(partitionId=0) successfully. Task(partitionId=1) 
is still running.
    +    assert(taskSets(3).stageId === 1 && taskSets(3).stageAttemptId === 1 &&
    +      taskSets(3).tasks.size === 2)
    +    runEvent(makeCompletionEvent(
    +      taskSets(3).tasks(0), Success, makeMapStatus("hostB", 2)))
    +
    +    // Complete the task(partition=1) which is from the old 
attempt(stageId=1, stageAttempt=0)
    +    // successfully.
    +    runEvent(makeCompletionEvent(
    +      taskSets(1).tasks(1), Success, makeMapStatus("hostB", 2)))
    +
    +    // Thanks to the success from old attempt of stage(stageId=1, 
stageAttempt=0), there's no
    +    // pending partitions for stage(stageId=1) now, thus downstream stage 
should be submitted,
    +    // though there's still a running task(stageId=1, stageAttempt=1, 
partitionId=1)
    +    // in the active stage attempt.
    +    assert(taskSets.size === 5 && 
taskSets(4).tasks(0).isInstanceOf[ResultTask[_, _]])
    +    complete(taskSets(4), Seq(
    --- End diff --
    
    https://issues.apache.org/jira/browse/SPARK-19596


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and you would like it to be, or if the feature is enabled but not
working, please contact infrastructure at infrastructure@apache.org or file
a JIRA ticket with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org

Reply via email to