Hi all,
I suspect that I've run into a bug (or two).
On Cassandra 4.1.1, when `cdc_enabled` in the cassandra.yaml file is set
to `false` on at least one node in the cluster, and then the `ALTER
TABLE ... WITH cdc=...` statement is run against that node, the cluster
will end up in the schema disagreement state. At this stage, a rolling
restart will bring the schema back in sync, but the changes made to the
`cdc` table property will be lost.
On Cassandra 4.1.6, the same procedure doesn't cause visible schema
disagreement in the `nodetool describecluster` command's output, but the
`ALTER TABLE` statement only has a cosmetic effect on the node it is run on.
The node with `cdc_enabled` set to `false` will show the `cdc` table
property has changed, but this does not affect its behaviour in any way.
At the same time, other nodes do not see that table property change at
all. This is perhaps even worse than on 4.1.1, because the alter table
statement is silently failing.
A shell script for reproducing the above described behaviours, and the
output on both 4.1.1 and 4.1.6 are attached.
(as a good security practice, please always read and understand the full
script you downloaded from untrusted sources before attempting to run it)
So, are these bugs? Or is this some kind of documented behaviour that
I failed to find the documentation for?
Cheers,
Bowen
#!/bin/sh
# Abort on the first failing command or on use of an unset variable.
set -eu

# Cassandra image tag to test. Defaults to 4.1.6; pass another version
# (e.g. 4.1.1) as the first argument to test that one instead.
ver="${1:-4.1.6}"
# wait_ready CONTAINER
# Block until the Cassandra node running in the docker container CONTAINER
# reports mode NORMAL and answers a CQL query. Polls once per second and
# never times out.
wait_ready() {
  # wait for the node to fully start and join the cluster
  until [ "$(docker exec "$1" nodetool netstats 2>/dev/null |
    grep '^Mode:' | cut -d' ' -f2)" = 'NORMAL' ]; do
    sleep 1
  done
  # wait for the DB to be ready for queries; both redirections must stay on
  # the same command, otherwise a lone "2>&1" becomes the last command of
  # the condition list and makes the loop exit immediately
  until docker exec "$1" cqlsh -e "select key from system.local;" \
    >/dev/null 2>&1; do
    sleep 1
  done
}
# print_schema
# Print the "Schema versions" section of `nodetool describecluster` as seen
# from container c1, followed by the line(s) mentioning `cdc` in the
# definition of ks1.tbl1 as reported by each of c1, c2 and c3.
print_schema() {
  echo "nodetool describecluster:"
  # Select from "Schema versions:" through "Stats for all nodes:", then drop
  # the final selected line. The sed script must be on one line: a newline
  # inside the address regex makes sed reject the script.
  docker exec c1 nodetool describecluster |
    sed -n '/Schema versions:/,/Stats for all nodes:/p' |
    sed '$d'
  echo "table cdc property on each node:"
  for node in c1 c2 c3; do
    echo "node: $node"
    docker exec "$node" cqlsh -e "desc table ks1.tbl1;" | grep cdc
  done
}
echo "version: $ver"

# Start a three-node cluster (c1 is the seed) on a dedicated docker network.
# Nodes are started one at a time; each must be ready before the next starts.
docker network create cassnet >/dev/null
for node in c1 c2 c3; do
  echo "starting node: $node"
  docker run --network cassnet --name "$node" \
    -e MAX_HEAP_SIZE=1G -e HEAP_NEWSIZE=200M -e CASSANDRA_SEEDS=c1 \
    -d "cassandra:$ver" >/dev/null
  wait_ready "$node"
done

echo "creating keyspace and table"
# create tables
docker exec c1 cqlsh -e "create keyspace ks1 WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter1': '3'};"
docker exec c1 cqlsh -e "create table ks1.tbl1 (id int primary key);"

# print the schema, they should be consistent
print_schema

# enable cdc on one of the nodes (c3 only; c1 and c2 keep their defaults)
echo "enabling cdc on one node"
docker exec c3 sed -i 's/^cdc_enabled:.*$/cdc_enabled: true/' \
  /etc/cassandra/cassandra.yaml
docker restart c3 >/dev/null
wait_ready c3

# enable cdc on the table, via c1 whose cassandra.yaml was not changed
# on 4.1.1: will show timeout and schema disagreement warnings
# on 4.1.6: no error
# "|| :" keeps the script going under `set -e` when cqlsh reports the timeout
echo "alter table"
docker exec c1 cqlsh -e "alter table ks1.tbl1 WITH cdc=true;" || :

# print the schema, the disagreement/inconsistency will show up
print_schema

# rolling restart
echo "rolling restart"
for node in c1 c2 c3; do
  echo "node: $node"
  docker restart "$node" >/dev/null
  wait_ready "$node"
done

# print the schema again, the disagreement/inconsistency is gone, but the
# effect of the ALTER TABLE statement is also voided
print_schema

# clean up
echo "clean up"
docker stop c1 c2 c3 >/dev/null
docker rm c1 c2 c3 >/dev/null
docker network rm cassnet >/dev/null
version: 4.1.1
starting node: c1
starting node: c2
starting node: c3
creating keyspace and table
nodetool describecluster:
Schema versions:
32699367-6dda-3c41-b63e-26f770adb14d: [172.19.0.3, 172.19.0.2,
172.19.0.4]
table cdc property on each node:
node: c1
AND cdc = false
node: c2
AND cdc = false
node: c3
AND cdc = false
enabling cdc on one node
alter table
<stdin>:1:OperationTimedOut: errors={'Connection defunct by heartbeat': 'Client
request timeout. See Session.execute[_async](timeout)'},
last_host=127.0.0.1:9042
<stdin>:1:Warning: schema version mismatch detected; check the schema versions
of your nodes in system.local and system.peers.
nodetool describecluster:
Schema versions:
53789740-2180-3161-a22d-fb2c6a4d32a7: [172.19.0.2]
32699367-6dda-3c41-b63e-26f770adb14d: [172.19.0.3, 172.19.0.4]
table cdc property on each node:
node: c1
AND cdc = true
node: c2
AND cdc = false
node: c3
AND cdc = false
rolling restart
node: c1
node: c2
node: c3
nodetool describecluster:
Schema versions:
53789740-2180-3161-a22d-fb2c6a4d32a7: [172.19.0.3, 172.19.0.2,
172.19.0.4]
table cdc property on each node:
node: c1
AND cdc = false
node: c2
AND cdc = false
node: c3
AND cdc = false
cleanup
version: 4.1.6
starting node: c1
starting node: c2
starting node: c3
creating keyspace and table
nodetool describecluster:
Schema versions:
b595fb53-19d3-3d45-9856-0ac042fdd05b: [172.19.0.3, 172.19.0.2,
172.19.0.4]
table cdc property on each node:
node: c1
AND cdc = false
node: c2
AND cdc = false
node: c3
AND cdc = false
enabling cdc on one node
alter table
nodetool describecluster:
Schema versions:
8691b0f6-965e-31d4-89cf-b10d61a38215: [172.19.0.3, 172.19.0.2,
172.19.0.4]
table cdc property on each node:
node: c1
AND cdc = true
node: c2
AND cdc = false
node: c3
AND cdc = false
rolling restart
node: c1
node: c2
node: c3
nodetool describecluster:
Schema versions:
8691b0f6-965e-31d4-89cf-b10d61a38215: [172.19.0.3, 172.19.0.2,
172.19.0.4]
table cdc property on each node:
node: c1
AND cdc = false
node: c2
AND cdc = false
node: c3
AND cdc = false
clean up