[ https://issues.apache.org/jira/browse/CASSANDRA-11340?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15189841#comment-15189841 ]
Jeff Jirsa commented on CASSANDRA-11340: ---------------------------------------- We've manually worked around this by building a new jar: {code} diff --git a/src/java/org/apache/cassandra/service/ClientState.java b/src/java/org/apache/cassandra/service/ClientState.java index 23eec73..2adbcc5 100644 --- a/src/java/org/apache/cassandra/service/ClientState.java +++ b/src/java/org/apache/cassandra/service/ClientState.java @@ -264,13 +264,13 @@ public class ClientState return; // prevent system keyspace modification - if (Keyspace.SYSTEM_KS.equalsIgnoreCase(keyspace)) - throw new UnauthorizedException(keyspace + " keyspace is not user-modifiable."); + // if (Keyspace.SYSTEM_KS.equalsIgnoreCase(keyspace)) + // throw new UnauthorizedException(keyspace + " keyspace is not user-modifiable."); // we want to allow altering AUTH_KS and TRACING_KS. - Set<String> allowAlter = Sets.newHashSet(Auth.AUTH_KS, Tracing.TRACE_KS); - if (allowAlter.contains(keyspace.toLowerCase()) && !(resource.isKeyspaceLevel() && perm.equals(Permission.ALTER))) - throw new UnauthorizedException(String.format("Cannot %s %s", perm, resource)); + // Set<String> allowAlter = Sets.newHashSet(Auth.AUTH_KS, Tracing.TRACE_KS); + // if (allowAlter.contains(keyspace.toLowerCase()) && !(resource.isKeyspaceLevel() && perm.equals(Permission.ALTER))) + // throw new UnauthorizedException(String.format("Cannot %s %s", perm, resource)); } public void validateLogin() throws UnauthorizedException {code} Then: {code} cqlsh> alter table system_auth.credentials with speculative_retry = 'NONE'; alter table system_auth.users with speculative_retry = 'NONE'; alter table system_auth.permissions with speculative_retry = 'NONE'; {code} We then restored the original jar with the restrictions in place. We initiated a mass reconnect and saw load jump into the 20s and then descend. Prior to disabling speculative retry (SR) on system_auth, such a reconnect could cause load to go into the 300s, and availability would be impacted. 
> Speculative retry on system_auth tables can cause deadlock > ---------------------------------------------------------- > > Key: CASSANDRA-11340 > URL: https://issues.apache.org/jira/browse/CASSANDRA-11340 > Project: Cassandra > Issue Type: Bug > Reporter: Jeff Jirsa > > Reproduced in at least 2.1.9. > It appears possible for queries against system_auth tables to trigger > speculative retry, which causes auth to block on traffic going off node. In > some cases, it appears possible for threads to become deadlocked, causing > load on the nodes to increase sharply. This happens even in clusters with RF > of system_auth == N, as all requests being served locally puts the bar for > 99% SR pretty low. > Incomplete stack trace below, but we haven't yet figured out what exactly is > blocking: > {code} > Thread 82291: (state = BLOCKED) > - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information > may be imprecise) > - java.util.concurrent.locks.LockSupport.parkNanos(long) @bci=11, line=338 > (Compiled frame) > - > org.apache.cassandra.utils.concurrent.WaitQueue$AbstractSignal.awaitUntil(long) > @bci=28, line=307 (Compiled frame) > - org.apache.cassandra.utils.concurrent.SimpleCondition.await(long, > java.util.concurrent.TimeUnit) @bci=76, line=63 (Compiled frame) > - org.apache.cassandra.service.ReadCallback.await(long, > java.util.concurrent.TimeUnit) @bci=25, line=92 (Compiled frame) > - > org.apache.cassandra.service.AbstractReadExecutor$SpeculatingReadExecutor.maybeTryAdditionalReplicas() > @bci=39, line=281 (Compiled frame) > - org.apache.cassandra.service.StorageProxy.fetchRows(java.util.List, > org.apache.cassandra.db.ConsistencyLevel) @bci=175, line=1338 (Compiled frame) > - org.apache.cassandra.service.StorageProxy.readRegular(java.util.List, > org.apache.cassandra.db.ConsistencyLevel) @bci=9, line=1274 (Compiled frame) > - org.apache.cassandra.service.StorageProxy.read(java.util.List, > org.apache.cassandra.db.ConsistencyLevel, > 
org.apache.cassandra.service.ClientState) @bci=57, line=1199 (Compiled frame) > - > org.apache.cassandra.cql3.statements.SelectStatement.execute(org.apache.cassandra.service.pager.Pageable, > org.apache.cassandra.cql3.QueryOptions, int, long, > org.apache.cassandra.service.QueryState) @bci=35, line=272 (Compiled frame) > - > org.apache.cassandra.cql3.statements.SelectStatement.execute(org.apache.cassandra.service.QueryState, > org.apache.cassandra.cql3.QueryOptions) @bci=105, line=224 (Compiled frame) > - org.apache.cassandra.auth.Auth.selectUser(java.lang.String) @bci=27, > line=265 (Compiled frame) > - org.apache.cassandra.auth.Auth.isExistingUser(java.lang.String) @bci=1, > line=86 (Compiled frame) > - > org.apache.cassandra.service.ClientState.login(org.apache.cassandra.auth.AuthenticatedUser) > @bci=11, line=206 (Compiled frame) > - > org.apache.cassandra.transport.messages.AuthResponse.execute(org.apache.cassandra.service.QueryState) > @bci=58, line=82 (Compiled frame) > - > org.apache.cassandra.transport.Message$Dispatcher.channelRead0(io.netty.channel.ChannelHandlerContext, > org.apache.cassandra.transport.Message$Request) @bci=75, line=439 (Compiled > frame) > - > org.apache.cassandra.transport.Message$Dispatcher.channelRead0(io.netty.channel.ChannelHandlerContext, > java.lang.Object) @bci=6, line=335 (Compiled frame) > - > io.netty.channel.SimpleChannelInboundHandler.channelRead(io.netty.channel.ChannelHandlerContext, > java.lang.Object) @bci=17, line=105 (Compiled frame) > - > io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(java.lang.Object) > @bci=9, line=333 (Compiled frame) > - > io.netty.channel.AbstractChannelHandlerContext.access$700(io.netty.channel.AbstractChannelHandlerContext, > java.lang.Object) @bci=2, line=32 (Compiled frame) > - io.netty.channel.AbstractChannelHandlerContext$8.run() @bci=8, line=324 > (Compiled frame) > - java.util.concurrent.Executors$RunnableAdapter.call() @bci=4, line=511 > (Compiled frame) > - > 
org.apache.cassandra.concurrent.AbstractTracingAwareExecutorService$FutureTask.run() > @bci=5, line=164 (Compiled frame) > - org.apache.cassandra.concurrent.SEPWorker.run() @bci=87, line=105 > (Interpreted frame) > - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame) > {code} > In a cluster with many connected clients (potentially thousands), a > reconnection flood (for example, restarting all at once) is likely to trigger > this bug. However, it is unlikely to be seen in normal operation. -- This message was sent by Atlassian JIRA (v6.3.4#6332)