[ 
https://issues.apache.org/jira/browse/FLINK-22084?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17315618#comment-17315618
 ] 

Chesnay Schepler commented on FLINK-22084:
------------------------------------------

The test seems to use different maxParallelism values for subsequent job 
submissions: the first submission uses 13, while one of the test cases later 
uses -1 to let Flink decide, in which case Flink derives a value of 128.

I would think that this use-case shouldn't even be supported, because it 
technically modifies the maxParallelism, but apparently it has been supported 
so far.

> RescalingITCase fails with adaptive scheduler
> ---------------------------------------------
>
>                 Key: FLINK-22084
>                 URL: https://issues.apache.org/jira/browse/FLINK-22084
>             Project: Flink
>          Issue Type: Bug
>          Components: Runtime / Checkpointing, Runtime / Coordination
>    Affects Versions: 1.13.0
>            Reporter: Dawid Wysakowicz
>            Assignee: Austin Cawley-Edwards
>            Priority: Blocker
>              Labels: test-stability
>             Fix For: 1.13.0
>
>
> https://dev.azure.com/apache-flink/apache-flink/_build/results?buildId=15934&view=logs&j=8fd9202e-fd17-5b26-353c-ac1ff76c8f28&t=a0a633b8-47ef-5c5a-2806-3c13b9e48228&l=4472
> {code}
> 2021-03-31T22:16:07.8416407Z [ERROR] 
> testSavepointRescalingOutKeyedStateDerivedMaxParallelism[backend = 
> rocksdb](org.apache.flink.test.checkpointing.RescalingITCase)  Time elapsed: 
> 9.945 s  <<< ERROR!
> 2021-03-31T22:16:07.8417534Z 
> org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
> 2021-03-31T22:16:07.8418516Z  at 
> org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
> 2021-03-31T22:16:07.8419281Z  at 
> org.apache.flink.test.util.TestUtils.submitJobAndWaitForResult(TestUtils.java:63)
> 2021-03-31T22:16:07.8420142Z  at 
> org.apache.flink.test.checkpointing.RescalingITCase.testSavepointRescalingKeyedState(RescalingITCase.java:251)
> 2021-03-31T22:16:07.8421173Z  at 
> org.apache.flink.test.checkpointing.RescalingITCase.testSavepointRescalingOutKeyedStateDerivedMaxParallelism(RescalingITCase.java:168)
> 2021-03-31T22:16:07.8421985Z  at 
> sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 2021-03-31T22:16:07.8422651Z  at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> 2021-03-31T22:16:07.8423649Z  at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> 2021-03-31T22:16:07.8424231Z  at 
> java.lang.reflect.Method.invoke(Method.java:498)
> 2021-03-31T22:16:07.8424657Z  at 
> org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:50)
> 2021-03-31T22:16:07.8425147Z  at 
> org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
> 2021-03-31T22:16:07.8425609Z  at 
> org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:47)
> 2021-03-31T22:16:07.8426183Z  at 
> org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
> 2021-03-31T22:16:07.8569060Z  at 
> org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
> 2021-03-31T22:16:07.8569781Z  at 
> org.apache.flink.util.TestNameProvider$1.evaluate(TestNameProvider.java:45)
> 2021-03-31T22:16:07.8570451Z  at 
> org.junit.rules.TestWatcher$1.evaluate(TestWatcher.java:55)
> 2021-03-31T22:16:07.8571040Z  at 
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-03-31T22:16:07.8571604Z  at 
> org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:325)
> 2021-03-31T22:16:07.8572303Z  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:78)
> 2021-03-31T22:16:07.8573259Z  at 
> org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:57)
> 2021-03-31T22:16:07.8573975Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8574660Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8575359Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8576037Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8576728Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8577588Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8578181Z  at 
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-03-31T22:16:07.8578771Z  at 
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-03-31T22:16:07.8579402Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8580061Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8580774Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8581480Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8582148Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8582896Z  at 
> org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
> 2021-03-31T22:16:07.8583762Z  at 
> org.junit.rules.ExternalResource$1.evaluate(ExternalResource.java:48)
> 2021-03-31T22:16:07.8584427Z  at 
> org.junit.rules.RunRules.evaluate(RunRules.java:20)
> 2021-03-31T22:16:07.8585069Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8585671Z  at 
> org.junit.runners.Suite.runChild(Suite.java:128)
> 2021-03-31T22:16:07.8586254Z  at 
> org.junit.runners.Suite.runChild(Suite.java:27)
> 2021-03-31T22:16:07.8586875Z  at 
> org.junit.runners.ParentRunner$3.run(ParentRunner.java:290)
> 2021-03-31T22:16:07.8587643Z  at 
> org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:71)
> 2021-03-31T22:16:07.8779731Z  at 
> org.junit.runners.ParentRunner.runChildren(ParentRunner.java:288)
> 2021-03-31T22:16:07.8780398Z  at 
> org.junit.runners.ParentRunner.access$000(ParentRunner.java:58)
> 2021-03-31T22:16:07.8781024Z  at 
> org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:268)
> 2021-03-31T22:16:07.8781702Z  at 
> org.junit.runners.ParentRunner.run(ParentRunner.java:363)
> 2021-03-31T22:16:07.8782346Z  at 
> org.apache.maven.surefire.junitcore.JUnitCore.run(JUnitCore.java:55)
> 2021-03-31T22:16:07.8783166Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.createRequestAndRun(JUnitCoreWrapper.java:137)
> 2021-03-31T22:16:07.8784006Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.executeEager(JUnitCoreWrapper.java:107)
> 2021-03-31T22:16:07.8784796Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.execute(JUnitCoreWrapper.java:83)
> 2021-03-31T22:16:07.8785556Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreWrapper.execute(JUnitCoreWrapper.java:75)
> 2021-03-31T22:16:07.8786346Z  at 
> org.apache.maven.surefire.junitcore.JUnitCoreProvider.invoke(JUnitCoreProvider.java:158)
> 2021-03-31T22:16:07.8787299Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.invokeProviderInSameClassLoader(ForkedBooter.java:384)
> 2021-03-31T22:16:07.8788104Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:345)
> 2021-03-31T22:16:07.8815851Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.execute(ForkedBooter.java:126)
> 2021-03-31T22:16:07.8816576Z  at 
> org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:418)
> 2021-03-31T22:16:07.8819737Z Caused by: 
> java.util.concurrent.CompletionException: 
> java.util.concurrent.CompletionException: java.lang.IllegalStateException: 
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max 
> parallelism mismatch between checkpoint/savepoint state and new program. 
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13 
> to new program with max parallelism 128. This indicates that the program has 
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8821554Z  at 
> org.apache.flink.runtime.concurrent.FutureUtils.lambda$switchExecutor$23(FutureUtils.java:1362)
> 2021-03-31T22:16:07.8822349Z  at 
> java.util.concurrent.CompletableFuture.uniHandle(CompletableFuture.java:836)
> 2021-03-31T22:16:07.8823178Z  at 
> java.util.concurrent.CompletableFuture$UniHandle.tryFire(CompletableFuture.java:811)
> 2021-03-31T22:16:07.8823948Z  at 
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> 2021-03-31T22:16:07.8824698Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRunAsync(AkkaRpcActor.java:440)
> 2021-03-31T22:16:07.8825485Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:208)
> 2021-03-31T22:16:07.8826318Z  at 
> org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
> 2021-03-31T22:16:07.8827203Z  at 
> org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
> 2021-03-31T22:16:07.8827925Z  at 
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
> 2021-03-31T22:16:07.8828561Z  at 
> akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
> 2021-03-31T22:16:07.8829192Z  at 
> scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
> 2021-03-31T22:16:07.8829860Z  at 
> akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
> 2021-03-31T22:16:07.8830536Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
> 2021-03-31T22:16:07.8831187Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-03-31T22:16:07.8831853Z  at 
> scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
> 2021-03-31T22:16:07.8832477Z  at 
> akka.actor.Actor$class.aroundReceive(Actor.scala:517)
> 2021-03-31T22:16:07.8833155Z  at 
> akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
> 2021-03-31T22:16:07.8833784Z  at 
> akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
> 2021-03-31T22:16:07.8834351Z  at 
> akka.actor.ActorCell.invoke(ActorCell.scala:561)
> 2021-03-31T22:16:07.8835134Z  at 
> akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
> 2021-03-31T22:16:07.8835693Z  at akka.dispatch.Mailbox.run(Mailbox.scala:225)
> 2021-03-31T22:16:07.8836208Z  at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
> 2021-03-31T22:16:07.8836805Z  at 
> akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
> 2021-03-31T22:16:07.8837571Z  at 
> akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
> 2021-03-31T22:16:07.8838263Z  at 
> akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
> 2021-03-31T22:16:07.8838962Z  at 
> akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
> 2021-03-31T22:16:07.8841344Z Caused by: 
> java.util.concurrent.CompletionException: java.lang.IllegalStateException: 
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max 
> parallelism mismatch between checkpoint/savepoint state and new program. 
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13 
> to new program with max parallelism 128. This indicates that the program has 
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8843188Z  at 
> org.apache.flink.runtime.scheduler.adaptive.BackgroundTask.lambda$new$0(BackgroundTask.java:59)
> 2021-03-31T22:16:07.8843956Z  at 
> java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
> 2021-03-31T22:16:07.8844829Z  at 
> java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
> 2021-03-31T22:16:07.8845596Z  at 
> java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
> 2021-03-31T22:16:07.8846306Z  at 
> java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
> 2021-03-31T22:16:07.8846964Z  at 
> java.util.concurrent.FutureTask.run(FutureTask.java:266)
> 2021-03-31T22:16:07.8847833Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
> 2021-03-31T22:16:07.8848782Z  at 
> java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
> 2021-03-31T22:16:07.8849613Z  at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> 2021-03-31T22:16:07.8850360Z  at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> 2021-03-31T22:16:07.8850983Z  at java.lang.Thread.run(Thread.java:748)
> 2021-03-31T22:16:07.8852976Z Caused by: java.lang.IllegalStateException: 
> Failed to rollback to checkpoint/savepoint Checkpoint Metadata. Max 
> parallelism mismatch between checkpoint/savepoint state and new program. 
> Cannot map operator 20ba6b65f97481d5570070de90e4e791 with max parallelism 13 
> to new program with max parallelism 128. This indicates that the program has 
> been changed in a non-compatible way after the checkpoint/savepoint.
> 2021-03-31T22:16:07.8854549Z  at 
> org.apache.flink.runtime.checkpoint.Checkpoints.loadAndValidateCheckpoint(Checkpoints.java:181)
> 2021-03-31T22:16:07.8855424Z  at 
> org.apache.flink.runtime.checkpoint.CheckpointCoordinator.restoreSavepoint(CheckpointCoordinator.java:1630)
> 2021-03-31T22:16:07.8856440Z  at 
> org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory.tryRestoreExecutionGraphFromSavepoint(DefaultExecutionGraphFactory.java:163)
> 2021-03-31T22:16:07.8862100Z  at 
> org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory.createAndRestoreExecutionGraph(DefaultExecutionGraphFactory.java:138)
> 2021-03-31T22:16:07.8863316Z  at 
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.createExecutionGraphAndRestoreState(AdaptiveScheduler.java:971)
> 2021-03-31T22:16:07.8864391Z  at 
> org.apache.flink.runtime.scheduler.adaptive.AdaptiveScheduler.lambda$createExecutionGraphAndRestoreStateAsync$24(AdaptiveScheduler.java:961)
> 2021-03-31T22:16:07.8865366Z  at 
> org.apache.flink.runtime.scheduler.adaptive.BackgroundTask.lambda$new$0(BackgroundTask.java:57)
> 2021-03-31T22:16:07.8865913Z  ... 10 more
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to