[ 
https://issues.apache.org/jira/browse/CASSANDRA-15131?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

lujie updated CASSANDRA-15131:
------------------------------
    Description: 
Reproduce:
 # Start a three-node cluster (A, B, C) with: ./bin/cassandra -f
 # Shut down node A.
 # On node B, remove node A with: ./bin/nodetool removenode 
2331c0c1-f799-4f35-9323-c57ad020732b
 # This process is too slow, so on node B we force the removal of A 
with: ./bin/nodetool removenode force 
 # We then hit an NPE:
 # 
{code:java}
RemovalStatus: Removing token (-9206149340638432876). Waiting for replication 
confirmation from [/10.3.1.11,/10.3.1.14].
error: null
-- StackTrace --
java.lang.NullPointerException
at 
org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214)
at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556)
at 
org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71)
at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46)
at com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237)
at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138)
at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252)
at 
com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819)
at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801)
at 
javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471)
at 
javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76)
at 
javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312)
at 
javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404)
at 
javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832)
at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323)
at sun.rmi.transport.Transport$1.run(Transport.java:200)
at sun.rmi.transport.Transport$1.run(Transport.java:197)
at java.security.AccessController.doPrivileged(Native Method)
at sun.rmi.transport.Transport.serviceCall(Transport.java:196)
at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683)
at java.security.AccessController.doPrivileged(Native Method)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
{code}

Code Analysis 

1. removeNode will mark the node as Leaving
{code:java}
tokenMetadata.addLeavingEndpoint(endpoint);
{code}
2. As a result, forceRemoveNode can enter the removal block (lines 3-12 below):
{code:java}
1. if (!replicatingNodes.isEmpty() || 
!tokenMetadata.getLeavingEndpoints().isEmpty())
2. {
3.   logger.warn("Removal not confirmed for for {}", 
StringUtils.join(this.replicatingNodes, ","));
4.   for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
5.   {
6.      UUID hostId = tokenMetadata.getHostId(endpoint);
7.      Gossiper.instance.advertiseTokenRemoved(endpoint, hostId);
8.      excise(tokenMetadata.getTokens(endpoint), endpoint);
10   }
11   replicatingNodes.clear();
12   removingNode = null;
13 }
{code}
3. At code line #6, forceRemoveNode looks up the hostId, but if removeNode has 
just finished removing the host, the hostId at line 6 will be null.

4. Code line #7 will call *hostId.toString()*, hence the NPE.

 If two or more nodes should be force-removed, this NPE aborts the loop, so the 
remaining nodes are skipped and still exist in the cluster. 

  was:
Reproduce:
 # start a three nodes cluster(A, B, C) by : ./bin/cassandra -f
 # shutdown node A
 # In Node B,  removing node A by:./bin/nodetool  removenode 
2331c0c1-f799-4f35-9323-c57ad020732b
 # But this process is too slow, so we force remove A by:./bin/nodetool  
removenode force 
 # NPE happens in client
 # 
{code:java}
RemovalStatus: Removing token (-9206149340638432876). Waiting for replication 
confirmation from [/10.3.1.11,/10.3.1.14].
error: null
-- StackTrace --
java.lang.NullPointerException
at 
org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214)
at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556)
at 
org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71)
at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46)
at com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237)
at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138)
at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252)
at 
com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819)
at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801)
at 
javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471)
at 
javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76)
at 
javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312)
at 
javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404)
at 
javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832)
at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323)
at sun.rmi.transport.Transport$1.run(Transport.java:200)
at sun.rmi.transport.Transport$1.run(Transport.java:197)
at java.security.AccessController.doPrivileged(Native Method)
at sun.rmi.transport.Transport.serviceCall(Transport.java:196)
at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683)
at java.security.AccessController.doPrivileged(Native Method)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
{code}

Code Analysis 

1. removeNode will mark the node as Leaving
{code:java}
tokenMetadata.addLeavingEndpoint(endpoint);
{code}
2. forceRemoveNode then step into remove
{code:java}
1. if (!replicatingNodes.isEmpty() || 
!tokenMetadata.getLeavingEndpoints().isEmpty())
2. {
3.   logger.warn("Removal not confirmed for for {}", 
StringUtils.join(this.replicatingNodes, ","));
4.   for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
5.   {
6.      UUID hostId = tokenMetadata.getHostId(endpoint);
7.      Gossiper.instance.advertiseTokenRemoved(endpoint, hostId);
8.      excise(tokenMetadata.getTokens(endpoint), endpoint);
10   }
11   replicatingNodes.clear();
12   removingNode = null;
13 }
{code}
3 .code line#6,will get hostId, but if removeNode execute completely right now 
and it will remove host : *tokenMetadata.removeEndpoint(endpoint);*

So hostId is null.

4. code line#7 will call *hostId.toString(),* hence NPE happens.

 The NPE will prevent other nodes being force removed.

 


> Data Race between force remove and remove
> -----------------------------------------
>
>                 Key: CASSANDRA-15131
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-15131
>             Project: Cassandra
>          Issue Type: Bug
>          Components: Consistency/Bootstrap and Decommission
>            Reporter: lujie
>            Assignee: lujie
>            Priority: Normal
>              Labels: pull-request-available
>         Attachments: 0001-fix-CASSANDRA-15131.patch
>
>          Time Spent: 10m
>  Remaining Estimate: 0h
>
> Reproduce:
>  # Start a three-node cluster (A, B, C) with: ./bin/cassandra -f
>  # Shut down node A.
>  # On node B, remove node A with: ./bin/nodetool removenode 
> 2331c0c1-f799-4f35-9323-c57ad020732b
>  # This process is too slow, so on node B we force the removal of A 
> with: ./bin/nodetool removenode force 
>  # We then hit an NPE:
>  # 
> {code:java}
> RemovalStatus: Removing token (-9206149340638432876). Waiting for replication 
> confirmation from [/10.3.1.11,/10.3.1.14].
> error: null
> -- StackTrace --
> java.lang.NullPointerException
> at 
> org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214)
> at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556)
> at 
> org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:497)
> at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71)
> at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:497)
> at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275)
> at 
> com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112)
> at 
> com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46)
> at 
> com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237)
> at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138)
> at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252)
> at 
> com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819)
> at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76)
> at 
> javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832)
> at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:497)
> at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323)
> at sun.rmi.transport.Transport$1.run(Transport.java:200)
> at sun.rmi.transport.Transport$1.run(Transport.java:197)
> at java.security.AccessController.doPrivileged(Native Method)
> at sun.rmi.transport.Transport.serviceCall(Transport.java:196)
> at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568)
> at 
> sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826)
> at 
> sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683)
> at java.security.AccessController.doPrivileged(Native Method)
> at 
> sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682)
> at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> {code}
> Code Analysis 
> 1. removeNode will mark the node as Leaving
> {code:java}
> tokenMetadata.addLeavingEndpoint(endpoint);
> {code}
> 2. As a result, forceRemoveNode can enter the removal block (lines 3-12 below):
> {code:java}
> 1. if (!replicatingNodes.isEmpty() || 
> !tokenMetadata.getLeavingEndpoints().isEmpty())
> 2. {
> 3.   logger.warn("Removal not confirmed for for {}", 
> StringUtils.join(this.replicatingNodes, ","));
> 4.   for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
> 5.   {
> 6.      UUID hostId = tokenMetadata.getHostId(endpoint);
> 7.      Gossiper.instance.advertiseTokenRemoved(endpoint, hostId);
> 8.      excise(tokenMetadata.getTokens(endpoint), endpoint);
> 10   }
> 11   replicatingNodes.clear();
> 12   removingNode = null;
> 13 }
> {code}
> 3. At code line #6, forceRemoveNode looks up the hostId, but if removeNode has 
> just finished removing the host, the hostId at line 6 will be null.
> 4. Code line #7 will call *hostId.toString()*, hence the NPE.
>  If two or more nodes should be force-removed, this NPE aborts the loop, so 
> the remaining nodes are skipped and still exist in the cluster. 



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org
For additional commands, e-mail: commits-h...@cassandra.apache.org

Reply via email to