[ https://issues.apache.org/jira/browse/CASSANDRA-15131?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
lujie updated CASSANDRA-15131: ------------------------------ Description: Reproduce: # start a three nodes cluster(A, B, C) by : ./bin/cassandra -f # shutdown node A # In Node B, removing node A by:./bin/nodetool removenode 2331c0c1-f799-4f35-9323-c57ad020732b # But this process is too slow, so we force remove A by:./bin/nodetool removenode force # NPE happens in client # {code:java} RemovalStatus: Removing token (-9206149340638432876). Waiting for replication confirmation from [/10.3.1.11,/10.3.1.14]. error: null -- StackTrace -- java.lang.NullPointerException at org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214) at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556) at org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71) at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275) at com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112) at com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46) at com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237) at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138) at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252) at com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819) at 
com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801) at javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471) at javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76) at javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312) at javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404) at javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832) at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323) at sun.rmi.transport.Transport$1.run(Transport.java:200) at sun.rmi.transport.Transport$1.run(Transport.java:197) at java.security.AccessController.doPrivileged(Native Method) at sun.rmi.transport.Transport.serviceCall(Transport.java:196) at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568) at sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826) at sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683) at java.security.AccessController.doPrivileged(Native Method) at sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) {code} Code Analysis 1. removeNode will mark the node as Leaving {code:java} tokenMetadata.addLeavingEndpoint(endpoint); {code} 2. forceRemoveNode then step into remove {code:java} 1. if (!replicatingNodes.isEmpty() || !tokenMetadata.getLeavingEndpoints().isEmpty()) 2. { 3. 
logger.warn("Removal not confirmed for for {}", StringUtils.join(this.replicatingNodes, ",")); 4. for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints()) 5. { 6. UUID hostId = tokenMetadata.getHostId(endpoint); 7. Gossiper.instance.advertiseTokenRemoved(endpoint, hostId); 8. excise(tokenMetadata.getTokens(endpoint), endpoint); 10 } 11 replicatingNodes.clear(); 12 removingNode = null; 13 } {code} 3. Code line #6 looks up the hostId, but if removeNode completes at that moment it has already removed the host via *tokenMetadata.removeEndpoint(endpoint);* so hostId is null. 4. Code line #7 then calls *hostId.toString()*, hence the NPE. The NPE also prevents the remaining leaving nodes from being force removed. was: Reproduce: # start a three nodes cluster(A, B, C) by : ./bin/cassandra -f # shutdown node A # In Node B, removing node A by:./bin/nodetool removenode 2331c0c1-f799-4f35-9323-c57ad020732b # But this process is too slow, so we force remove A by:./bin/nodetool removenode force # NPE happens in client # {code:java} RemovalStatus: Removing token (-9206149340638432876). Waiting for replication confirmation from [/10.3.1.11,/10.3.1.14]. 
error: null -- StackTrace -- java.lang.NullPointerException at org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214) at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556) at org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71) at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275) at com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112) at com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46) at com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237) at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138) at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252) at com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819) at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801) at javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471) at javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76) at javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312) at javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404) at 
javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832) at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323) at sun.rmi.transport.Transport$1.run(Transport.java:200) at sun.rmi.transport.Transport$1.run(Transport.java:197) at java.security.AccessController.doPrivileged(Native Method) at sun.rmi.transport.Transport.serviceCall(Transport.java:196) at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568) at sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826) at sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683) at java.security.AccessController.doPrivileged(Native Method) at sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) {code} Code Analysis 1. removeNode will mark the node as Leaving {code:java} tokenMetadata.addLeavingEndpoint(endpoint); {code} 2. forceRemoveNode then step into remove {code:java} 1. if (!replicatingNodes.isEmpty() || !tokenMetadata.getLeavingEndpoints().isEmpty()) 2. { 3. logger.warn("Removal not confirmed for for {}", StringUtils.join(this.replicatingNodes, ",")); 4. for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints()) 5. { 6. UUID hostId = tokenMetadata.getHostId(endpoint); 7. Gossiper.instance.advertiseTokenRemoved(endpoint, hostId); 8. 
excise(tokenMetadata.getTokens(endpoint), endpoint); 10 } 11 replicatingNodes.clear(); 12 removingNode = null; 13 } {code} 3 .code line#6,will get hostId, but if removeNode execute completely right now and it will remove host : *tokenMetadata.removeEndpoint(endpoint);* So hostId is null. 4. code line#7 will call *hostId.toString(),* hence NPE happens. The ugly NPE can't show what happens in force remove request and we should fix it. We found this bug in version 3.11.4, the trunk also has this bug. I will give the patch soon. > Data Race between force remove and remove > ----------------------------------------- > > Key: CASSANDRA-15131 > URL: https://issues.apache.org/jira/browse/CASSANDRA-15131 > Project: Cassandra > Issue Type: Bug > Components: Consistency/Bootstrap and Decommission > Reporter: lujie > Assignee: lujie > Priority: Normal > Labels: pull-request-available > Attachments: 0001-fix-CASSANDRA-15131.patch > > Time Spent: 10m > Remaining Estimate: 0h > > Reproduce: > # start a three nodes cluster(A, B, C) by : ./bin/cassandra -f > # shutdown node A > # In Node B, removing node A by:./bin/nodetool removenode > 2331c0c1-f799-4f35-9323-c57ad020732b > # But this process is too slow, so we force remove A by:./bin/nodetool > removenode force > # NPE happens in client > # > {code:java} > RemovalStatus: Removing token (-9206149340638432876). Waiting for replication > confirmation from [/10.3.1.11,/10.3.1.14]. 
> error: null > -- StackTrace -- > java.lang.NullPointerException > at > org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214) > at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556) > at > org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:497) > at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71) > at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:497) > at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275) > at > com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112) > at > com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46) > at > com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237) > at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138) > at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252) > at > com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819) > at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801) > at > javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471) > at > javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76) > at > javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312) > at > javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404) > 
at > javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832) > at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:497) > at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323) > at sun.rmi.transport.Transport$1.run(Transport.java:200) > at sun.rmi.transport.Transport$1.run(Transport.java:197) > at java.security.AccessController.doPrivileged(Native Method) > at sun.rmi.transport.Transport.serviceCall(Transport.java:196) > at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568) > at > sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826) > at > sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683) > at java.security.AccessController.doPrivileged(Native Method) > at > sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > {code} > Code Analysis > 1. removeNode will mark the node as Leaving > {code:java} > tokenMetadata.addLeavingEndpoint(endpoint); > {code} > 2. forceRemoveNode then step into remove > {code:java} > 1. if (!replicatingNodes.isEmpty() || > !tokenMetadata.getLeavingEndpoints().isEmpty()) > 2. { > 3. logger.warn("Removal not confirmed for for {}", > StringUtils.join(this.replicatingNodes, ",")); > 4. for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints()) > 5. { > 6. UUID hostId = tokenMetadata.getHostId(endpoint); > 7. Gossiper.instance.advertiseTokenRemoved(endpoint, hostId); > 8. 
excise(tokenMetadata.getTokens(endpoint), endpoint); > 10 } > 11 replicatingNodes.clear(); > 12 removingNode = null; > 13 } > {code} > 3. Code line #6 looks up the hostId, but if removeNode completes at that > moment it has already removed the host via *tokenMetadata.removeEndpoint(endpoint);* > so hostId is null. > 4. Code line #7 then calls *hostId.toString()*, hence the NPE. > The NPE also prevents the remaining leaving nodes from being force removed. > -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org For additional commands, e-mail: commits-h...@cassandra.apache.org