[ 
https://issues.apache.org/jira/browse/FLINK-26239?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17503456#comment-17503456
 ] 

Yun Gao commented on FLINK-26239:
---------------------------------

I'll close this issue for now, since it seems to have already been fixed and 
has indeed not been reproduced for a long time. We can reopen it if it reoccurs. 

> EventTimeWindowCheckpointingITCase.testSlidingTimeWindow failed on azure
> ------------------------------------------------------------------------
>
>                 Key: FLINK-26239
>                 URL: https://issues.apache.org/jira/browse/FLINK-26239
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / State Backends
>    Affects Versions: 1.15.0
>            Reporter: Yun Gao
>            Assignee: Roman Khachatryan
>            Priority: Critical
>              Labels: test-stability
>
> {code:java}
> 2022-02-17T11:46:39.1850375Z Feb 17 11:46:39 Starting 
> org.apache.flink.test.checkpointing.EventTimeWindowCheckpointingITCase#testSlidingTimeWindow[statebackend
>  type =ROCKSDB_INCREMENTAL, buffersPerChannel = 2].
> 2022-02-17T11:46:39.1854584Z 
> org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
> 2022-02-17T11:46:39.1855470Z  at 
> org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
> 2022-02-17T11:46:39.1856444Z  at 
> org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:141)
> 2022-02-17T11:46:39.1857393Z  at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> 2022-02-17T11:46:39.1858400Z  at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> 2022-02-17T11:46:39.1865249Z  at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> 2022-02-17T11:46:39.1866299Z  at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> 2022-02-17T11:46:39.1867590Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandler.java:259)
> 2022-02-17T11:46:39.1868546Z  at 
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
> 2022-02-17T11:46:39.1869254Z  at 
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
> 2022-02-17T11:46:39.1869828Z  at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> 2022-02-17T11:46:39.1870367Z  at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> 2022-02-17T11:46:39.1871131Z  at 
> org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1389)
> 2022-02-17T11:46:39.1872123Z  at 
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
> 2022-02-17T11:46:39.1875765Z  at 
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
> 2022-02-17T11:46:39.1877055Z  at 
> org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
> 2022-02-17T11:46:39.1878032Z  at 
> java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
> 2022-02-17T11:46:39.1879084Z  at 
> java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
> 2022-02-17T11:46:39.1879697Z  at 
> java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
> 2022-02-17T11:46:39.1880252Z  at 
> java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
> 2022-02-17T11:46:39.1880840Z  at 
> org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47)
> 2022-02-17T11:46:39.1881357Z  at 
> akka.dispatch.OnComplete.internal(Future.scala:300)
> 2022-02-17T11:46:39.1881788Z  at 
> akka.dispatch.OnComplete.internal(Future.scala:297)
> ...
> 2022-02-17T11:46:39.1915003Z Caused by: 
> org.apache.flink.util.FlinkRuntimeException: Exceeded checkpoint tolerable 
> failure threshold.
> 2022-02-17T11:46:39.1915653Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointFailureManager.checkFailureAgainstCounter(CheckpointFailureManager.java:160)
> 2022-02-17T11:46:39.1916393Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleJobLevelCheckpointException(CheckpointFailureManager.java:123)
> 2022-02-17T11:46:39.1917125Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointFailureManager.handleCheckpointException(CheckpointFailureManager.java:90)
> 2022-02-17T11:46:39.1917819Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2046)
> 2022-02-17T11:46:39.1918594Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.abortPendingCheckpoint(CheckpointCoordinator.java:2025)
> 2022-02-17T11:46:39.1919268Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.access$600(CheckpointCoordinator.java:98)
> 2022-02-17T11:46:39.1920052Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator$CheckpointCanceller.run(CheckpointCoordinator.java:2104)
> 2022-02-17T11:46:39.1920852Z  at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> 2022-02-17T11:46:39.1921390Z  at 
> java.util.concurrent.FutureTask.run(FutureTask.java:266)
> 2022-02-17T11:46:39.1922079Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> 2022-02-17T11:46:39.1922785Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> 2022-02-17T11:46:39.1923541Z  at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 2022-02-17T11:46:39.1924108Z  at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 2022-02-17T11:46:39.1924585Z  at java.lang.Thread.run(Thread.java:748) {code}
> From the log, the checkpoint seems to fail due to 
>  
> {code:java}
> java.lang.IllegalStateException: Attempt to reference unknown state: 
> f1a3e68c-3bac-4bd9-b68f-7968a1411a06-KeyGroupRange{startKeyGroup=0, 
> endKeyGroup=1}-000058.sst
>     at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) 
> ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  [?:1.8.0_292]
>     at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  [?:1.8.0_292]
>     at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292]
> 11:36:48,770 [jobmanager-io-thread-14] WARN  
> org.apache.flink.runtime.jobmaster.JobMaster                 [] - Error while 
> processing AcknowledgeCheckpoint message 
> java.lang.IllegalStateException: Attempt to reference unknown state: 
> e0c386e6-8fdd-4277-8f5c-d4eb942c97e0-KeyGroupRange{startKeyGroup=6, 
> endKeyGroup=7}-000057.sst
>     at org.apache.flink.util.Preconditions.checkState(Preconditions.java:193) 
> ~[flink-core-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.SharedStateRegistryImpl.registerReference(SharedStateRegistryImpl.java:82)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle.registerSharedStates(IncrementalRemoteKeyedStateHandle.java:317)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.SharedStateRegistryImpl.registerAll(SharedStateRegistryImpl.java:172)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.state.changelog.ChangelogStateBackendHandle$ChangelogStateBackendHandleImpl.registerSharedStates(ChangelogStateBackendHandle.java:124)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedState(OperatorSubtaskState.java:229)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.OperatorSubtaskState.registerSharedStates(OperatorSubtaskState.java:219)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.TaskStateSnapshot.registerSharedStates(TaskStateSnapshot.java:189)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.receiveAcknowledgeMessage(CheckpointCoordinator.java:1114)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$acknowledgeCheckpoint$1(ExecutionGraphHandler.java:89)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> org.apache.flink.runtime.scheduler.ExecutionGraphHandler.lambda$processCheckpointCoordinatorMessage$3(ExecutionGraphHandler.java:119)
>  ~[flink-runtime-1.15-SNAPSHOT.jar:1.15-SNAPSHOT]
>     at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  [?:1.8.0_292]
>     at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  [?:1.8.0_292]
>     at java.lang.Thread.run(Thread.java:748) [?:1.8.0_292] {code}
> https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=31736&view=logs&j=5c8e7682-d68f-54d1-16a2-a09310218a49&t=86f654fa-ab48-5c1a-25f4-7e7f6afb9bba&l=6325



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to