[ https://issues.apache.org/jira/browse/FLINK-26255?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17494833#comment-17494833 ]
Roman Khachatryan commented on FLINK-26255: ------------------------------------------- Likely a duplicate of FLINK-26231. > SplitAggregateITCase.testAggWithJoin failed on azure > ---------------------------------------------------- > > Key: FLINK-26255 > URL: https://issues.apache.org/jira/browse/FLINK-26255 > Project: Flink > Issue Type: Bug > Components: Runtime / State Backends > Affects Versions: 1.15.0 > Reporter: Roman Khachatryan > Assignee: Roman Khachatryan > Priority: Blocker > Fix For: 1.15.0 > > > [https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=31850&view=logs&j=0c940707-2659-5648-cbe6-a1ad63045f0a&t=075c2716-8010-5565-fe08-3c4bb45824a4&l=10497] > Acknowledgement of a checkpoint failed, then the checkpoint expired, then > the checkpoint failure threshold was reached and the job failed. > {code} > Randomly selected true for execution.checkpointing.unaligned > Randomly selected PT2S for execution.checkpointing.alignment-timeout > Randomly selected true for state.backend.changelog.enabled > Randomly selected PT0.1S for > state.backend.changelog.periodic-materialize.interval > {code} > {code} > [ERROR] Tests run: 64, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: > 700.545 s <<< FAILURE! - in > org.apache.flink.table.planner.runtime.stream.sql.SplitAggregateITCase > [ERROR] SplitAggregateITCase.testAggWithJoin Time elapsed: 601.77 s <<< > ERROR! > org.apache.flink.runtime.client.JobExecutionException: Job execution failed. 
> at > org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144) > at > org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniCl > usterJobClient.java:141) > at > java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616) > at > java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591) > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > at > java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) > at > org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandle > r.java:259) > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > at > java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) > at > org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1389) > at > org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java > :93) > at > org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadi > ngUtils.java:68) > at > org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextCla > ssLoader$2(ClassLoadingUtils.java:92) > at > java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774) > at > java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750) > at > java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488) > at > java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975) > at > org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47) > at 
akka.dispatch.OnComplete.internal(Future.scala:300) > at akka.dispatch.OnComplete.internal(Future.scala:297) > at akka.dispatch.japi$CallbackBridge.apply(Future.scala:224) > at akka.dispatch.japi$CallbackBridge.apply(Future.scala:221) > at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60) > at > org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$DirectExecutionContext.execute(AkkaFut > ureUtils.java:65) > at > scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68) > at > scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:284) > at > scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:284) > at > scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:284) > ... > Caused by: org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint > tolerable failure threshold. > at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.checkFailureAgainstCounter(Checkpo > intFailureManager.java:160) > at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException( > CheckpointFailureManager.java:123) > at > org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(Checkpoi > ntFailureManager.java:90) > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoor > dinator.java:2046) > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoor > dinator.java:2025) > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$600(CheckpointCoordinator.java > :98) > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoo > rdinator.java:2104) > at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > 
java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThread > PoolExecutor.java:180) > at > java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExe > cutor.java:293) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > at java.lang.Thread.run(Thread.java:748) > {code} > {code} > 12:18:11,760 [jobmanager-io-thread-5] WARN > org.apache.flink.runtime.jobmaster.JobMaster [] - Error while > processing AcknowledgeCheckpoint message > java.lang.IllegalStateException: Attempt to reference unknown state: > 4a798990-1428-424c-813a-2ec1c4fcee8f-KeyGroupRange{startKeyGroup=0, > endKeyGroup=31}-000019.sst > at > org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) > ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > 
org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119) > ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT] > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) > [?:1.8.0_292] > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) > [?:1.8.0_292] > at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292] > {code} -- This message was sent by Atlassian Jira (v8.20.1#820001)