[GitHub] [spark] sarutak commented on a change in pull request #33253: [SPARK-36038][CORE] Speculation metrics summary at stage level

GitBox Wed, 08 Sep 2021 23:19:10 -0700


sarutak commented on a change in pull request #33253:
URL: https://github.com/apache/spark/pull/33253#discussion_r704911903




##########
File path: 
core/src/test/resources/HistoryServerExpectations/stage_with_speculation_summary_expectation.json
##########
@@ -0,0 +1,510 @@
+{
+  "status" : "COMPLETE",
+  "stageId" : 0,
+  "attemptId" : 0,
+  "numTasks" : 4,
+  "numActiveTasks" : 0,
+  "numCompleteTasks" : 4,
+  "numFailedTasks" : 0,
+  "numKilledTasks" : 1,
+  "numCompletedIndices" : 4,
+  "submissionTime" : "2021-08-10T23:27:53.488GMT",
+  "firstTaskLaunchedTime" : "2021-08-10T23:27:53.885GMT",
+  "completionTime" : "2021-08-10T23:28:57.679GMT",
+  "executorDeserializeTime" : 12793,
+  "executorDeserializeCpuTime" : 5317155711,
+  "executorRunTime" : 113648,
+  "executorCpuTime" : 284330976,
+  "resultSize" : 3360,
+  "jvmGcTime" : 0,
+  "resultSerializationTime" : 4,
+  "memoryBytesSpilled" : 0,
+  "diskBytesSpilled" : 0,
+  "peakExecutionMemory" : 0,
+  "inputBytes" : 0,
+  "inputRecords" : 0,
+  "outputBytes" : 0,
+  "outputRecords" : 0,
+  "shuffleRemoteBlocksFetched" : 0,
+  "shuffleLocalBlocksFetched" : 0,
+  "shuffleFetchWaitTime" : 0,
+  "shuffleRemoteBytesRead" : 0,
+  "shuffleRemoteBytesReadToDisk" : 0,
+  "shuffleLocalBytesRead" : 0,
+  "shuffleReadBytes" : 0,
+  "shuffleReadRecords" : 0,
+  "shuffleWriteBytes" : 0,
+  "shuffleWriteTime" : 0,
+  "shuffleWriteRecords" : 0,
+  "name" : "collect at <console>:27",
+  "details" : 
"org.apache.spark.rdd.RDD.collect(RDD.scala:1029)\n$line17.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:27)\n$line17.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:36)\n$line17.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:38)\n$line17.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:40)\n$line17.$read$$iw$$iw$$iw$$iw.<init>(<console>:42)\n$line17.$read$$iw$$iw$$iw.<init>(<console>:44)\n$line17.$read$$iw$$iw.<init>(<console>:46)\n$line17.$read$$iw.<init>(<console>:48)\n$line17.$read.<init>(<console>:50)\n$line17.$read$.<init>(<console>:54)\n$line17.$read$.<clinit>(<console>)\n$line17.$eval$.$print$lzycompute(<console>:7)\n$line17.$eval$.$print(<console>:6)\n$line17.$eval.$print(<console>)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native
 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)
 \nscala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:745)",
+  "schedulingPool" : "default",
+  "rddIds" : [ 1, 0 ],
+  "accumulatorUpdates" : [ ],
+  "tasks" : {
+    "0" : {
+      "taskId" : 0,
+      "index" : 0,
+      "attempt" : 0,
+      "launchTime" : "2021-08-10T23:27:53.885GMT",
+      "duration" : 2234,
+      "executorId" : "7",
+      "host" : "ltx1-hcl12291.grid.linkedin.com",
+      "status" : "SUCCESS",
+      "taskLocality" : "PROCESS_LOCAL",
+      "speculative" : false,
+      "accumulatorUpdates" : [ ],
+      "taskMetrics" : {
+        "executorDeserializeTime" : 2048,
+        "executorDeserializeCpuTime" : 1171756284,
+        "executorRunTime" : 74,
+        "executorCpuTime" : 65263482,
+        "resultSize" : 840,
+        "jvmGcTime" : 0,
+        "resultSerializationTime" : 1,
+        "memoryBytesSpilled" : 0,
+        "diskBytesSpilled" : 0,
+        "peakExecutionMemory" : 0,
+        "inputMetrics" : {
+          "bytesRead" : 0,
+          "recordsRead" : 0
+        },
+        "outputMetrics" : {
+          "bytesWritten" : 0,
+          "recordsWritten" : 0
+        },
+        "shuffleReadMetrics" : {
+          "remoteBlocksFetched" : 0,
+          "localBlocksFetched" : 0,
+          "fetchWaitTime" : 0,
+          "remoteBytesRead" : 0,
+          "remoteBytesReadToDisk" : 0,
+          "localBytesRead" : 0,
+          "recordsRead" : 0
+        },
+        "shuffleWriteMetrics" : {
+          "bytesWritten" : 0,
+          "writeTime" : 0,
+          "recordsWritten" : 0
+        }
+      },
+      "executorLogs" : {
+        "stdout" : 
"http://ltx1-hcl12291.grid.linkedin.com:8042/node/containerlogs/container_e18_1628109047826_1317105_01_000008/vsowrira/stdout?start=-4096",

Review comment:
       This URL exposes an internal host name and a user name. Is it OK to commit it?

##########
File path: 
core/src/test/resources/HistoryServerExpectations/stage_with_speculation_summary_expectation.json
##########
@@ -0,0 +1,510 @@
+{
+  "status" : "COMPLETE",
+  "stageId" : 0,
+  "attemptId" : 0,
+  "numTasks" : 4,
+  "numActiveTasks" : 0,
+  "numCompleteTasks" : 4,
+  "numFailedTasks" : 0,
+  "numKilledTasks" : 1,
+  "numCompletedIndices" : 4,
+  "submissionTime" : "2021-08-10T23:27:53.488GMT",
+  "firstTaskLaunchedTime" : "2021-08-10T23:27:53.885GMT",
+  "completionTime" : "2021-08-10T23:28:57.679GMT",
+  "executorDeserializeTime" : 12793,
+  "executorDeserializeCpuTime" : 5317155711,
+  "executorRunTime" : 113648,
+  "executorCpuTime" : 284330976,
+  "resultSize" : 3360,
+  "jvmGcTime" : 0,
+  "resultSerializationTime" : 4,
+  "memoryBytesSpilled" : 0,
+  "diskBytesSpilled" : 0,
+  "peakExecutionMemory" : 0,
+  "inputBytes" : 0,
+  "inputRecords" : 0,
+  "outputBytes" : 0,
+  "outputRecords" : 0,
+  "shuffleRemoteBlocksFetched" : 0,
+  "shuffleLocalBlocksFetched" : 0,
+  "shuffleFetchWaitTime" : 0,
+  "shuffleRemoteBytesRead" : 0,
+  "shuffleRemoteBytesReadToDisk" : 0,
+  "shuffleLocalBytesRead" : 0,
+  "shuffleReadBytes" : 0,
+  "shuffleReadRecords" : 0,
+  "shuffleWriteBytes" : 0,
+  "shuffleWriteTime" : 0,
+  "shuffleWriteRecords" : 0,
+  "name" : "collect at <console>:27",
+  "details" : 
"org.apache.spark.rdd.RDD.collect(RDD.scala:1029)\n$line17.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:27)\n$line17.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:36)\n$line17.$read$$iw$$iw$$iw$$iw$$iw$$iw.<init>(<console>:38)\n$line17.$read$$iw$$iw$$iw$$iw$$iw.<init>(<console>:40)\n$line17.$read$$iw$$iw$$iw$$iw.<init>(<console>:42)\n$line17.$read$$iw$$iw$$iw.<init>(<console>:44)\n$line17.$read$$iw$$iw.<init>(<console>:46)\n$line17.$read$$iw.<init>(<console>:48)\n$line17.$read.<init>(<console>:50)\n$line17.$read$.<init>(<console>:54)\n$line17.$read$.<clinit>(<console>)\n$line17.$eval$.$print$lzycompute(<console>:7)\n$line17.$eval$.$print(<console>:6)\n$line17.$eval.$print(<console>)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native
 
Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)
 \nscala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:745)",
+  "schedulingPool" : "default",
+  "rddIds" : [ 1, 0 ],
+  "accumulatorUpdates" : [ ],
+  "tasks" : {
+    "0" : {
+      "taskId" : 0,
+      "index" : 0,
+      "attempt" : 0,
+      "launchTime" : "2021-08-10T23:27:53.885GMT",
+      "duration" : 2234,
+      "executorId" : "7",
+      "host" : "ltx1-hcl12291.grid.linkedin.com",
+      "status" : "SUCCESS",
+      "taskLocality" : "PROCESS_LOCAL",
+      "speculative" : false,
+      "accumulatorUpdates" : [ ],
+      "taskMetrics" : {
+        "executorDeserializeTime" : 2048,
+        "executorDeserializeCpuTime" : 1171756284,
+        "executorRunTime" : 74,
+        "executorCpuTime" : 65263482,
+        "resultSize" : 840,
+        "jvmGcTime" : 0,
+        "resultSerializationTime" : 1,
+        "memoryBytesSpilled" : 0,
+        "diskBytesSpilled" : 0,
+        "peakExecutionMemory" : 0,
+        "inputMetrics" : {
+          "bytesRead" : 0,
+          "recordsRead" : 0
+        },
+        "outputMetrics" : {
+          "bytesWritten" : 0,
+          "recordsWritten" : 0
+        },
+        "shuffleReadMetrics" : {
+          "remoteBlocksFetched" : 0,
+          "localBlocksFetched" : 0,
+          "fetchWaitTime" : 0,
+          "remoteBytesRead" : 0,
+          "remoteBytesReadToDisk" : 0,
+          "localBytesRead" : 0,
+          "recordsRead" : 0
+        },
+        "shuffleWriteMetrics" : {
+          "bytesWritten" : 0,
+          "writeTime" : 0,
+          "recordsWritten" : 0
+        }
+      },
+      "executorLogs" : {
+        "stdout" : 
"http://ltx1-hcl12291.grid.linkedin.com:8042/node/containerlogs/container_e18_1628109047826_1317105_01_000008/vsowrira/stdout?start=-4096",
+        "stderr" : 
"http://ltx1-hcl12291.grid.linkedin.com:8042/node/containerlogs/container_e18_1628109047826_1317105_01_000008/vsowrira/stderr?start=-4096"
+      },
+      "schedulerDelay" : 111,
+      "gettingResultTime" : 0
+    },
+    "1" : {
+      "taskId" : 1,
+      "index" : 1,
+      "attempt" : 0,
+      "launchTime" : "2021-08-10T23:27:53.903GMT",
+      "duration" : 2647,
+      "executorId" : "5",
+      "host" : "ltx1-hcl5290.grid.linkedin.com",

Review comment:
       Ditto: this internal host name is exposed as well. Is it OK?

##########
File path: core/src/main/scala/org/apache/spark/status/AppStatusListener.scala
##########
@@ -1208,6 +1232,33 @@ private[spark] class AppStatusListener(
     }
   }
 
+  private def killedTaskSummaryForSpeculationStageSummary(
+      reason: TaskEndReason,
+      oldSummary: Map[String, Int],
+      isSpeculative: Boolean): Map[String, Int] = {
+    reason match {
+      case k: TaskKilled if k.reason.contains("another attempt succeeded") =>
+        if (isSpeculative) {
+          oldSummary.updated("original attempt succeeded",
+            oldSummary.getOrElse("original attempt succeeded", 0) + 1)
+        } else {
+          oldSummary.updated("speculated attempt succeeded",
+            oldSummary.getOrElse("speculated attempt succeeded", 0) + 1)
+        }
+      // If the stage is finished and speculative tasks get killed, then the
+      // kill reason is "stage finished"
+      case k: TaskKilled if k.reason.contains("Stage finished") =>
+        if (isSpeculative) {
+          oldSummary.updated("original attempt succeeded",
+            oldSummary.getOrElse("original attempt succeeded", 0) + 1)
+        } else {
+          oldSummary

Review comment:
       I reconsidered this logic and, as I commented 
[here](https://github.com/apache/spark/pull/33253/files#diff-0d63c25e93eb1fbec25754f0bee7714c7e3c1538bc7370d94b147524db5c1f63R762),
 I'd like to suggest that `killedTaskSummaryForSpeculationStageSummary` be 
applied only when `isSpeculative == true`, and that it simply use the messages 
`another attempt succeeded` and `Stage finished` rather than `original attempt 
succeeded` and `speculated attempt succeeded`.
   
   Then we wouldn't need `killedTaskSummaryForSpeculationStageSummary` at all 
and could use the existing `killedTaskSummary`.

##########
File path: core/src/main/scala/org/apache/spark/status/LiveEntity.scala
##########
@@ -392,6 +392,30 @@ private class LiveExecutorStageSummary(
 
 }
 
+private class LiveSpeculationStageSummary(
+    stageId: Int,
+    attemptId: Int) extends LiveEntity {
+
+  var numTasks = 0
+  var numActiveTasks = 0
+  var numCompletedTasks = 0
+  var numFailedTasks = 0
+  var numKilledTasks = 0
+  var killedTasksSummary : Map[String, Int] = Map()

Review comment:
       Nit: how about naming this `killedSummary`, like the other `Live*` classes?

##########
File path: core/src/main/scala/org/apache/spark/status/AppStatusListener.scala
##########
@@ -746,6 +752,24 @@ private[spark] class AppStatusListener(
         maybeUpdate(esummary, now)
       }
 
+      val speculationStageSummary = stage.speculationStageSummary
+      if (event.taskInfo.speculative) {
+        speculationStageSummary.numActiveTasks -= 1
+        speculationStageSummary.numCompletedTasks += completedDelta
+        speculationStageSummary.numFailedTasks += failedDelta
+        speculationStageSummary.numKilledTasks += killedDelta
+      }
+      speculationStageSummary.killedTasksSummary = 
killedTaskSummaryForSpeculationStageSummary(

Review comment:
       `SpeculationStageSummary` is about speculative tasks, so shouldn't 
`speculationStageSummary.killedTasksSummary` be updated only when 
`event.taskInfo.speculative == true`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] sarutak commented on a change in pull request #33253: [SPARK-36038][CORE] Speculation metrics summary at stage level

Reply via email to