[ 
https://issues.apache.org/jira/browse/CASSANDRA-15131?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

lujie updated CASSANDRA-15131:
------------------------------
    Description: 
Reproduce:
 # Start a three-node cluster (A, B, C) with: ./bin/cassandra -f
 # Shut down node A.
 # On node B, remove node A with: ./bin/nodetool removenode 
2331c0c1-f799-4f35-9323-c57ad020732b
 # This removal is too slow, so we force-remove A with: ./bin/nodetool 
removenode force 
 # An NPE is reported in the client output:
 # 
{code:java}
RemovalStatus: Removing token (-9206149340638432876). Waiting for replication 
confirmation from [/10.3.1.11,/10.3.1.14].
error: null
-- StackTrace --
java.lang.NullPointerException
at 
org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214)
at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556)
at 
org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71)
at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46)
at com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237)
at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138)
at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252)
at 
com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819)
at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801)
at 
javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471)
at 
javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76)
at 
javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312)
at 
javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404)
at 
javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832)
at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323)
at sun.rmi.transport.Transport$1.run(Transport.java:200)
at sun.rmi.transport.Transport$1.run(Transport.java:197)
at java.security.AccessController.doPrivileged(Native Method)
at sun.rmi.transport.Transport.serviceCall(Transport.java:196)
at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683)
at java.security.AccessController.doPrivileged(Native Method)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
{code}

Code Analysis 

1. removeNode will mark the node as Leaving
{code:java}
tokenMetadata.addLeavingEndpoint(endpoint);
{code}
2. The force remove (forceRemoveCompletion) then enters the following code:
{code:java}
1. if (!replicatingNodes.isEmpty() || 
!tokenMetadata.getLeavingEndpoints().isEmpty())
2. {
3.   logger.warn("Removal not confirmed for for {}", 
StringUtils.join(this.replicatingNodes, ","));
4.   for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
5.   {
6.      UUID hostId = tokenMetadata.getHostId(endpoint);
7.      Gossiper.instance.advertiseTokenRemoved(endpoint, hostId);
8.      excise(tokenMetadata.getTokens(endpoint), endpoint);
10   }
11   replicatingNodes.clear();
12   removingNode = null;
13 }
{code}
3. Code line #6 looks up hostId. However, if the concurrent removeNode has 
just completed, it has already removed the host via 
*tokenMetadata.removeEndpoint(endpoint);*

So hostId is null.

4. Code line #7 then leads to a call to *hostId.toString()*, hence the NPE.

 The NPE aborts the loop, which prevents the other leaving nodes from being 
force removed.

 

  was:
Reproduce:
 # start a three nodes cluster(A, B, C) by : ./bin/cassandra -f
 # shutdown node A
 # In Node B,  removing node A by:./bin/nodetool  removenode 
2331c0c1-f799-4f35-9323-c57ad020732b
 # But this process is too slow, so we force remove A by:./bin/nodetool  
removenode force 
 # NPE happens in client
 # 
{code:java}
RemovalStatus: Removing token (-9206149340638432876). Waiting for replication 
confirmation from [/10.3.1.11,/10.3.1.14].
error: null
-- StackTrace --
java.lang.NullPointerException
at 
org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214)
at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556)
at 
org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71)
at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112)
at 
com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46)
at com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237)
at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138)
at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252)
at 
com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819)
at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801)
at 
javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471)
at 
javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76)
at 
javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312)
at 
javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404)
at 
javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832)
at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source)
at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323)
at sun.rmi.transport.Transport$1.run(Transport.java:200)
at sun.rmi.transport.Transport$1.run(Transport.java:197)
at java.security.AccessController.doPrivileged(Native Method)
at sun.rmi.transport.Transport.serviceCall(Transport.java:196)
at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683)
at java.security.AccessController.doPrivileged(Native Method)
at 
sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682)
at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
{code}

Code Analysis 

1. removeNode will mark the node as Leaving
{code:java}
tokenMetadata.addLeavingEndpoint(endpoint);
{code}
2. forceRemoveNode then step into remove
{code:java}
1. if (!replicatingNodes.isEmpty() || 
!tokenMetadata.getLeavingEndpoints().isEmpty())
2. {
3.   logger.warn("Removal not confirmed for for {}", 
StringUtils.join(this.replicatingNodes, ","));
4.   for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
5.   {
6.      UUID hostId = tokenMetadata.getHostId(endpoint);
7.      Gossiper.instance.advertiseTokenRemoved(endpoint, hostId);
8.      excise(tokenMetadata.getTokens(endpoint), endpoint);
10   }
11   replicatingNodes.clear();
12   removingNode = null;
13 }
{code}
3 .code line#6,will get hostId, but if removeNode execute completely right now 
and it will remove host : *tokenMetadata.removeEndpoint(endpoint);*

So hostId is null.

4. code line#7 will call *hostId.toString(),* hence NPE happens.

 The ugly NPE can't show what happens in force remove request and we should fix 
it. We found this bug in version 3.11.4, the trunk also has this bug. I will 
give the patch soon.

 


> Data Race between force remove and remove
> -----------------------------------------
>
>                 Key: CASSANDRA-15131
>                 URL: https://issues.apache.org/jira/browse/CASSANDRA-15131
>             Project: Cassandra
>          Issue Type: Bug
>          Components: Consistency/Bootstrap and Decommission
>            Reporter: lujie
>            Assignee: lujie
>            Priority: Normal
>              Labels: pull-request-available
>         Attachments: 0001-fix-CASSANDRA-15131.patch
>
>          Time Spent: 10m
>  Remaining Estimate: 0h
>
> Reproduce:
>  # Start a three-node cluster (A, B, C) with: ./bin/cassandra -f
>  # Shut down node A.
>  # On node B, remove node A with: ./bin/nodetool removenode 
> 2331c0c1-f799-4f35-9323-c57ad020732b
>  # This removal is too slow, so we force-remove A with: ./bin/nodetool 
> removenode force 
>  # An NPE is reported in the client output:
>  # 
> {code:java}
> RemovalStatus: Removing token (-9206149340638432876). Waiting for replication 
> confirmation from [/10.3.1.11,/10.3.1.14].
> error: null
> -- StackTrace --
> java.lang.NullPointerException
> at 
> org.apache.cassandra.gms.VersionedValue$VersionedValueFactory.removedNonlocal(VersionedValue.java:214)
> at org.apache.cassandra.gms.Gossiper.advertiseTokenRemoved(Gossiper.java:556)
> at 
> org.apache.cassandra.service.StorageService.forceRemoveCompletion(StorageService.java:4353)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:497)
> at sun.reflect.misc.Trampoline.invoke(MethodUtil.java:71)
> at sun.reflect.GeneratedMethodAccessor4.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:497)
> at sun.reflect.misc.MethodUtil.invoke(MethodUtil.java:275)
> at 
> com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:112)
> at 
> com.sun.jmx.mbeanserver.StandardMBeanIntrospector.invokeM2(StandardMBeanIntrospector.java:46)
> at 
> com.sun.jmx.mbeanserver.MBeanIntrospector.invokeM(MBeanIntrospector.java:237)
> at com.sun.jmx.mbeanserver.PerInterface.invoke(PerInterface.java:138)
> at com.sun.jmx.mbeanserver.MBeanSupport.invoke(MBeanSupport.java:252)
> at 
> com.sun.jmx.interceptor.DefaultMBeanServerInterceptor.invoke(DefaultMBeanServerInterceptor.java:819)
> at com.sun.jmx.mbeanserver.JmxMBeanServer.invoke(JmxMBeanServer.java:801)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.doOperation(RMIConnectionImpl.java:1471)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.access$300(RMIConnectionImpl.java:76)
> at 
> javax.management.remote.rmi.RMIConnectionImpl$PrivilegedOperation.run(RMIConnectionImpl.java:1312)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.doPrivilegedOperation(RMIConnectionImpl.java:1404)
> at 
> javax.management.remote.rmi.RMIConnectionImpl.invoke(RMIConnectionImpl.java:832)
> at sun.reflect.GeneratedMethodAccessor6.invoke(Unknown Source)
> at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:497)
> at sun.rmi.server.UnicastServerRef.dispatch(UnicastServerRef.java:323)
> at sun.rmi.transport.Transport$1.run(Transport.java:200)
> at sun.rmi.transport.Transport$1.run(Transport.java:197)
> at java.security.AccessController.doPrivileged(Native Method)
> at sun.rmi.transport.Transport.serviceCall(Transport.java:196)
> at sun.rmi.transport.tcp.TCPTransport.handleMessages(TCPTransport.java:568)
> at 
> sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run0(TCPTransport.java:826)
> at 
> sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.lambda$run$81(TCPTransport.java:683)
> at java.security.AccessController.doPrivileged(Native Method)
> at 
> sun.rmi.transport.tcp.TCPTransport$ConnectionHandler.run(TCPTransport.java:682)
> at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
> at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
> at java.lang.Thread.run(Thread.java:745)
> {code}
> Code Analysis 
> 1. removeNode will mark the node as Leaving
> {code:java}
> tokenMetadata.addLeavingEndpoint(endpoint);
> {code}
> 2. The force remove (forceRemoveCompletion) then enters the following code:
> {code:java}
> 1. if (!replicatingNodes.isEmpty() || 
> !tokenMetadata.getLeavingEndpoints().isEmpty())
> 2. {
> 3.   logger.warn("Removal not confirmed for for {}", 
> StringUtils.join(this.replicatingNodes, ","));
> 4.   for (InetAddress endpoint : tokenMetadata.getLeavingEndpoints())
> 5.   {
> 6.      UUID hostId = tokenMetadata.getHostId(endpoint);
> 7.      Gossiper.instance.advertiseTokenRemoved(endpoint, hostId);
> 8.      excise(tokenMetadata.getTokens(endpoint), endpoint);
> 10   }
> 11   replicatingNodes.clear();
> 12   removingNode = null;
> 13 }
> {code}
> 3. Code line #6 looks up hostId. However, if the concurrent removeNode has 
> just completed, it has already removed the host via 
> *tokenMetadata.removeEndpoint(endpoint);*
> So hostId is null.
> 4. Code line #7 then leads to a call to *hostId.toString()*, hence the NPE.
>  The NPE aborts the loop, which prevents the other leaving nodes from being 
> force removed.
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org
For additional commands, e-mail: commits-h...@cassandra.apache.org

Reply via email to