pan3793 commented on code in PR #6997:
URL: https://github.com/apache/kyuubi/pull/6997#discussion_r2010661031
##########
kyuubi-server/src/main/scala/org/apache/kyuubi/operation/BatchJobSubmission.scala:
##########
@@ -250,50 +252,58 @@ class BatchJobSubmission(
private def submitAndMonitorBatchJob(): Unit = {
var appStatusFirstUpdated = false
var lastStarvationCheckTime = createTime
+
+ def doUpdateApplicationInfoMetadataIfNeeded(): Unit = {
+ updateApplicationInfoMetadataIfNeeded()
+ if (!appStatusFirstUpdated) {
+ // only the ApplicationInfo with non-empty id indicates that batch is
RUNNING
+ if (applicationId(_applicationInfo).isDefined) {
+ setStateIfNotCanceled(OperationState.RUNNING)
+ updateBatchMetadata()
+ appStatusFirstUpdated = true
+ } else {
+ val currentTime = System.currentTimeMillis()
+ if (currentTime - lastStarvationCheckTime >
applicationStarvationTimeout) {
+ lastStarvationCheckTime = currentTime
+ warn(s"Batch[$batchId] has not started, check the Kyuubi server to
ensure" +
+ s" that batch jobs can be submitted.")
+ }
+ }
+ }
+ }
+
try {
info(s"Submitting $batchType batch[$batchId] job:\n$builder")
val process = builder.start
- while (!applicationFailed(_applicationInfo) && process.isAlive) {
- updateApplicationInfoMetadataIfNeeded()
- if (!appStatusFirstUpdated) {
- // only the ApplicationInfo with non-empty id indicates that batch
is RUNNING
- if (applicationId(_applicationInfo).isDefined) {
- setStateIfNotCanceled(OperationState.RUNNING)
- updateBatchMetadata()
- appStatusFirstUpdated = true
- } else {
- val currentTime = System.currentTimeMillis()
- if (currentTime - lastStarvationCheckTime >
applicationStarvationTimeout) {
- lastStarvationCheckTime = currentTime
- warn(s"Batch[$batchId] has not started, check the Kyuubi server
to ensure" +
- s" that batch jobs can be submitted.")
- }
- }
- }
+ while (process.isAlive && !applicationFailed(_applicationInfo)) {
+ doUpdateApplicationInfoMetadataIfNeeded()
process.waitFor(applicationCheckInterval, TimeUnit.MILLISECONDS)
}
+ if (!process.isAlive) {
+ doUpdateApplicationInfoMetadataIfNeeded()
Review Comment:
Just a note: this is the key change.
In the current round, the app state is `NOT_FOUND` because the submit stage
exceeded `kyuubi.engine.yarn.submit.timeout`. If the submission then succeeds
during the `process.waitFor` period, `process.isAlive` returns false and the
loop exits, so without this extra update after the loop there is no chance to
retrieve the app state from the cluster manager.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]