[ https://issues.apache.org/jira/browse/CASSANDRA-15131?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16861001#comment-16861001 ]
lujie commented on CASSANDRA-15131: ----------------------------------- ping-----> > Data Race between force remove and remove > ----------------------------------------- > > Key: CASSANDRA-15131 > URL: https://issues.apache.org/jira/browse/CASSANDRA-15131 > Project: Cassandra > Issue Type: Bug > Components: Consistency/Bootstrap and Decommission > Reporter: lujie > Assignee: lujie > Priority: Normal > Labels: pull-request-available > Attachments: 0001-fix-CASSANDRA-15131.patch > > Time Spent: 10m > Remaining Estimate: 0h > > Reproduce: > # Start a three-node cluster (A, B, C) with: ./bin/cassandra -f > # Shut down node A > # On node B, remove node A with: ./bin/nodetool removenode > 2331c0c1-f799-4f35-9323-c57ad020732b > # This process is too slow, so on node B we force-remove A > with: ./bin/nodetool removenode force > # But we hit an NPE: > # > {code:java} > RemovalStatus: Removing token (-9206149340638432876). Waiting for replication > confirmation from [/10.3.1.11,/10.3.1.14]. > error: null > -- StackTrace -- > java.lang.NullPointerException > at > org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214) > at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556) > at > org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353) > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > at > sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:497) > at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71) > at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:497) > at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275) > at > 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112) > at > com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46) > at > com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237) > at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138) > at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252) > at > com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819) > at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801) > at > javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471) > at > javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76) > at > javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312) > at > javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404) > at > javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832) > at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.lang.reflect.Method.invoke(Method.java:497) > at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323) > at sun.rmi.transport.Transport$1.run(Transport.java:200) > at sun.rmi.transport.Transport$1.run(Transport.java:197) > at java.security.AccessController.doPrivileged(Native Method) > at sun.rmi.transport.Transport.serviceCall(Transport.java:196) > at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568) > at > sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826) > at > sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683) > at java.security.AccessController.doPrivileged(Native Method) > at > 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > {code} > Code Analysis > 1. removeNode marks the node as leaving: > {code:java} > tokenMetadata.addLeavingEndpoint(endpoint); > {code} > 2. So forceRemoveNode can enter the removal branch (lines 3-12): > {code:java} > 1. if (!replicatingNodes.isEmpty() || > !tokenMetadata.getLeavingEndpoints().isEmpty()) > 2. { > 3. logger.warn("Removal not confirmed for for {}", > StringUtils.join(this.replicatingNodes, ",")); > 4. for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints()) > 5. { > 6. UUID hostId = tokenMetadata.getHostId(endpoint); > 7. Gossiper.instance.advertiseTokenRemoved(endpoint, hostId); > 8. excise(tokenMetadata.getTokens(endpoint), endpoint); > 10 } > 11 replicatingNodes.clear(); > 12 removingNode = null; > 13 } > {code} > 3. At code line 6, forceRemoveNode looks up the hostId, but if removeNode has > just removed the host, the hostId at line 6 will be null. > 4. Code line 7 then calls *hostId.toString()*, hence the NPE. > If two or more nodes should be force-removed, this NPE causes the remaining > ones to be skipped, so they still exist in the cluster. -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org For additional commands, e-mail: commits-h...@cassandra.apache.org