[ https://issues.apache.org/jira/browse/IGNITE-17036?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Alexander Lapin updated IGNITE-17036: ------------------------------------- Description: Reproducer on a feature branch ignite-14209 : {code:java} void testOneRebalance(@WorkDirectory Path workDir, TestInfo testInfo) throws Exception { TableDefinition schTbl1 = SchemaBuilders.tableBuilder("PUBLIC", "tbl1").columns( SchemaBuilders.column("key", ColumnType.INT64).build(), SchemaBuilders.column("val", ColumnType.INT32).asNullable(true).build() ).withPrimaryKey("key").build(); nodes.get(0).tableManager.createTable( "PUBLIC.tbl1", tblChanger -> SchemaConfigurationConverter.convert(schTbl1, tblChanger) .changeReplicas(1) .changePartitions(1)); assertEquals(1, nodes.get(0).clusterCfgMgr.configurationRegistry().getConfiguration(TablesConfiguration.KEY) .tables().get("PUBLIC.TBL1").replicas().value()); nodes.get(0).tableManager.alterTable("PUBLIC.TBL1", ch -> ch.changeReplicas(3)); waitPartitionAssignmentsSyncedToExpected(0, 3); nodes.get(0).tableManager.alterTable("PUBLIC.TBL1", ch -> ch.changeReplicas(2)); waitPartitionAssignmentsSyncedToExpected(0, 2); assertEquals(3, getAssignments(0, 0).size()); assertEquals(3, getAssignments(1, 0).size()); assertEquals(3, getAssignments(2, 0).size()); } {code} This code hangs with {noformat} 2022-05-26 12:14:00:871 +0300 [ERROR][Thread-89][MetaStorageServiceImpl] Unexpected exception java.util.concurrent.CompletionException: class org.apache.ignite.raft.jraft.rpc.impl.RaftException: ESTATEMACHINE:null at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331) at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346) at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088) at 
org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:617) at org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:556) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859) at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837) at java.base/java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:479) at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) Caused by: class org.apache.ignite.raft.jraft.rpc.impl.RaftException: ESTATEMACHINE:null at org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:618) ... 9 more {noformat} The root cause is a bug in {{SimpleInMemoryKeyValueStorage}}: the method {{SimpleInMemoryKeyValueStorage#doGetValue}} throws an NPE when it tries to get the revision on the line {{if (lastVal.tombstone())}}, because {{Value lastVal = lastRevVals.get(key)}} is null. *Upd* The actual root cause was the incorrect use of exact revision matching. We don't actually need such exact matching at all. 
was: Reproducer on a feature branch ignite-14209 : {code:java} void testOneRebalance(@WorkDirectory Path workDir, TestInfo testInfo) throws Exception { TableDefinition schTbl1 = SchemaBuilders.tableBuilder("PUBLIC", "tbl1").columns( SchemaBuilders.column("key", ColumnType.INT64).build(), SchemaBuilders.column("val", ColumnType.INT32).asNullable(true).build() ).withPrimaryKey("key").build(); nodes.get(0).tableManager.createTable( "PUBLIC.tbl1", tblChanger -> SchemaConfigurationConverter.convert(schTbl1, tblChanger) .changeReplicas(1) .changePartitions(1)); assertEquals(1, nodes.get(0).clusterCfgMgr.configurationRegistry().getConfiguration(TablesConfiguration.KEY) .tables().get("PUBLIC.TBL1").replicas().value()); nodes.get(0).tableManager.alterTable("PUBLIC.TBL1", ch -> ch.changeReplicas(3)); waitPartitionAssignmentsSyncedToExpected(0, 3); nodes.get(0).tableManager.alterTable("PUBLIC.TBL1", ch -> ch.changeReplicas(2)); waitPartitionAssignmentsSyncedToExpected(0, 2); assertEquals(3, getAssignments(0, 0).size()); assertEquals(3, getAssignments(1, 0).size()); assertEquals(3, getAssignments(2, 0).size()); } {code} This code hangs with {noformat} 2022-05-26 12:14:00:871 +0300 [ERROR][Thread-89][MetaStorageServiceImpl] Unexpected exception java.util.concurrent.CompletionException: class org.apache.ignite.raft.jraft.rpc.impl.RaftException: ESTATEMACHINE:null at java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331) at java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346) at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632) at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) at java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088) at org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:617) at 
org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:556) at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859) at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837) at java.base/java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:479) at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) Caused by: class org.apache.ignite.raft.jraft.rpc.impl.RaftException: ESTATEMACHINE:null at org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:618) ... 9 more {noformat} Root cause is the bug in {{{}SimpleInMemoryKeyValueStorage{}}}, method {{SimpleInMemoryKeyValueStorage#doGetValue}} throws NPE when tries to get revision on the line {{if (lastVal.tombstone())}} because {{Value lastVal = lastRevVals.get(key)}} is null. 
> RaftException: ESTATEMACHINE:null when change replicas from 3 to 2 for a table > ------------------------------------------------------------------------------ > > Key: IGNITE-17036 > URL: https://issues.apache.org/jira/browse/IGNITE-17036 > Project: Ignite > Issue Type: Bug > Reporter: Mirza Aliev > Assignee: Alexander Lapin > Priority: Major > Labels: ignite-3 > > Reproducer on a feature branch ignite-14209 : > {code:java} > void testOneRebalance(@WorkDirectory Path workDir, TestInfo testInfo) throws > Exception { > TableDefinition schTbl1 = SchemaBuilders.tableBuilder("PUBLIC", > "tbl1").columns( > SchemaBuilders.column("key", ColumnType.INT64).build(), > SchemaBuilders.column("val", > ColumnType.INT32).asNullable(true).build() > ).withPrimaryKey("key").build(); > nodes.get(0).tableManager.createTable( > "PUBLIC.tbl1", > tblChanger -> SchemaConfigurationConverter.convert(schTbl1, > tblChanger) > .changeReplicas(1) > .changePartitions(1)); > assertEquals(1, > nodes.get(0).clusterCfgMgr.configurationRegistry().getConfiguration(TablesConfiguration.KEY) > .tables().get("PUBLIC.TBL1").replicas().value()); > nodes.get(0).tableManager.alterTable("PUBLIC.TBL1", ch -> > ch.changeReplicas(3)); > waitPartitionAssignmentsSyncedToExpected(0, 3); > nodes.get(0).tableManager.alterTable("PUBLIC.TBL1", ch -> > ch.changeReplicas(2)); > waitPartitionAssignmentsSyncedToExpected(0, 2); > assertEquals(3, getAssignments(0, 0).size()); > assertEquals(3, getAssignments(1, 0).size()); > assertEquals(3, getAssignments(2, 0).size()); > } > {code} > This code hangs with > {noformat} > 2022-05-26 12:14:00:871 +0300 [ERROR][Thread-89][MetaStorageServiceImpl] > Unexpected exception > java.util.concurrent.CompletionException: class > org.apache.ignite.raft.jraft.rpc.impl.RaftException: ESTATEMACHINE:null > at > java.base/java.util.concurrent.CompletableFuture.encodeThrowable(CompletableFuture.java:331) > at > 
java.base/java.util.concurrent.CompletableFuture.completeThrowable(CompletableFuture.java:346) > at > java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:632) > at > java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506) > at > java.base/java.util.concurrent.CompletableFuture.completeExceptionally(CompletableFuture.java:2088) > at > org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:617) > at > org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:556) > at > java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859) > at > java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837) > at > java.base/java.util.concurrent.CompletableFuture$Completion.exec(CompletableFuture.java:479) > at > java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290) > at > java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020) > at > java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656) > at > java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594) > at > java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183) > Caused by: class org.apache.ignite.raft.jraft.rpc.impl.RaftException: > ESTATEMACHINE:null > at > org.apache.ignite.raft.jraft.rpc.impl.RaftGroupServiceImpl$1.accept(RaftGroupServiceImpl.java:618) > ... 9 more > {noformat} > The root cause is a bug in {{SimpleInMemoryKeyValueStorage}}: the method > {{SimpleInMemoryKeyValueStorage#doGetValue}} throws an NPE when it tries to get > the revision on the line {{if (lastVal.tombstone())}}, because {{Value lastVal = > lastRevVals.get(key)}} is null. > > *Upd* > The actual root cause was the incorrect use of exact revision matching. We > don't actually need such exact matching at all. 
-- This message was sent by Atlassian Jira (v8.20.7#820007)