This is an automated email from the ASF dual-hosted git repository.
jchovatia pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git
The following commit(s) were added to refs/heads/trunk by this push:
new 965a39166c Stop repair scheduler if two major versions are detected
965a39166c is described below
commit 965a39166cf22f020065578407ca6c585b4a5132
Author: Himanshu Jindal <[email protected]>
AuthorDate: Tue Aug 12 15:37:15 2025 -0700
Stop repair scheduler if two major versions are detected
patch by Himanshu Jindal; reviewed by Jaydeepkumar Chovatia, Andy Tolbert
for CASSANDRA-20048
---
CHANGES.txt | 1 +
conf/cassandra.yaml | 3 ++
conf/cassandra_latest.yaml | 3 ++
.../pages/managing/operating/auto_repair.adoc | 4 ++
.../cassandra/repair/autorepair/AutoRepair.java | 5 +++
.../repair/autorepair/AutoRepairConfig.java | 17 ++++++++
.../repair/autorepair/AutoRepairUtils.java | 15 +++++++
.../cassandra/service/AutoRepairService.java | 7 +++
.../cassandra/service/AutoRepairServiceMBean.java | 2 +
src/java/org/apache/cassandra/tools/NodeProbe.java | 5 +++
.../tools/nodetool/SetAutoRepairConfig.java | 5 ++-
.../test/log/ClusterMetadataTestHelper.java | 29 +++++++++++--
.../repair/autorepair/AutoRepairUtilsTest.java | 50 ++++++++++++++++++++++
.../tools/nodetool/SetAutoRepairConfigTest.java | 3 +-
14 files changed, 143 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a804fe506b..8cbd1e173b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
5.1
+ * Stop repair scheduler if two major versions are detected (CASSANDRA-20048)
* Optimize audit logic for batch operations especially when audit is not
enabled for DML (CASSANDRA-20885)
* Implement nodetool history (CASSANDRA-20851)
* Expose StorageService.dropPreparedStatements via JMX (CASSANDRA-20870)
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index acb0429476..cfa1b7ff9e 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -2782,6 +2782,9 @@ storage_compatibility_mode: NONE
# # The scheduler needs to adjust its order when nodes leave the ring.
Deleted hosts are tracked in metadata
# # for a specified duration to ensure they are indeed removed before
adjustments are made to the schedule.
# history_clear_delete_hosts_buffer_interval: 2h
+# # By default, repair is disabled when mixed major versions are detected in the cluster,
+# # which happens while a major version upgrade is in progress. Set this flag to true to allow repair anyway.
+# mixed_major_version_repair_enabled: false
# # NOTE: Each of the below settings can be overridden per repair type under
repair_type_overrides
# global_settings:
# # If true, attempts to group tables in the same keyspace into one
repair; otherwise, each table is repaired
diff --git a/conf/cassandra_latest.yaml b/conf/cassandra_latest.yaml
index c987a0ce42..f426048431 100644
--- a/conf/cassandra_latest.yaml
+++ b/conf/cassandra_latest.yaml
@@ -2467,6 +2467,9 @@ storage_compatibility_mode: NONE
# # The scheduler needs to adjust its order when nodes leave the ring.
Deleted hosts are tracked in metadata
# # for a specified duration to ensure they are indeed removed before
adjustments are made to the schedule.
# history_clear_delete_hosts_buffer_interval: 2h
+# # By default, repair is disabled when mixed major versions are detected in the cluster,
+# # which happens while a major version upgrade is in progress. Set this flag to true to allow repair anyway.
+# mixed_major_version_repair_enabled: false
# # NOTE: Each of the below settings can be overridden per repair type under
repair_type_overrides
# global_settings:
# # If true, attempts to group tables in the same keyspace into one
repair; otherwise, each table is repaired
diff --git a/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
b/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
index bd9dad5aa7..e989c49d2a 100644
--- a/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
+++ b/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
@@ -167,6 +167,10 @@ is time to schedule repairs.
| history_clear_delete_hosts_buffer_interval | 2h | The scheduler needs to
adjust its order when nodes leave the ring.
Deleted hosts are tracked in metadata for a specified duration to ensure they
are indeed removed before adjustments
are made to the schedule.
+| mixed_major_version_repair_enabled | false | Enable/Disable running repairs
on the cluster when there are mixed
+major versions detected, which usually occurs when the cluster is being
upgraded. Repairs between nodes of
+different major versions are not tested, so enabling this may lead to
data compatibility issues.
+It is strongly discouraged to set this to true without doing extensive testing
beforehand.
|===
diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
b/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
index 10d18fae63..031b6352bf 100644
--- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
+++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
@@ -165,6 +165,11 @@ public class AutoRepair
logger.debug("Auto-repair is disabled for repair type {}",
repairType);
return;
}
+ if (!config.isMixedMajorVersionRepairEnabled() &&
AutoRepairUtils.hasMultipleLiveMajorVersions())
+ {
+ logger.info("Auto-repair is disabled when nodes in the cluster
have different major versions");
+ return;
+ }
AutoRepairService.instance.checkCanRun(repairType);
AutoRepairState repairState = repairStates.get(repairType);
try
diff --git
a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
index 045e6d21a8..7285af7948 100644
--- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
+++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
@@ -59,6 +59,8 @@ public class AutoRepairConfig implements Serializable
// Minimum duration for the execution of a single repair task. This
prevents the scheduler from overwhelming
// the node by scheduling too many repair tasks in a short period of time.
public volatile DurationSpec.LongSecondsBound repair_task_min_duration =
new DurationSpec.LongSecondsBound("5s");
+ // by default repair is disabled if there are mixed major versions
detected, but you can enable it using this flag
+ public volatile boolean mixed_major_version_repair_enabled = false;
// global_settings overides Options.defaultOptions for all repair types
public volatile Options global_settings;
@@ -149,6 +151,11 @@ public class AutoRepairConfig implements Serializable
this.enabled = enabled;
}
+ public boolean isMixedMajorVersionRepairEnabled()
+ {
+ return mixed_major_version_repair_enabled;
+ }
+
public DurationSpec.IntSecondsBound
getAutoRepairHistoryClearDeleteHostsBufferInterval()
{
return history_clear_delete_hosts_buffer_interval;
@@ -366,6 +373,16 @@ public class AutoRepairConfig implements Serializable
getOptions(repairType).repair_retry_backoff = new
DurationSpec.LongSecondsBound(interval);
}
+ public boolean getMixedMajorVersionRepairEnabled()
+ {
+ return this.mixed_major_version_repair_enabled;
+ }
+
+ public void setMixedMajorVersionRepairEnabled(boolean enabled)
+ {
+ this.mixed_major_version_repair_enabled = enabled;
+ }
+
@VisibleForTesting
static IAutoRepairTokenRangeSplitter
newAutoRepairTokenRangeSplitter(RepairType repairType, ParameterizedClass
parameterizedClass) throws ConfigurationException
{
diff --git
a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
index 6da487e5e0..d0da66c6ff 100644
--- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
+++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
@@ -425,6 +425,21 @@ public class AutoRepairUtils
return null;
}
+ /**
+ * Checks whether the cluster directory reports more than one Cassandra major version.
+ * Both bounds are read from a single ClusterMetadata.current() result rather than from separate calls.
+ *
+ * @return {@code true} if the major versions of the directory's clusterMinVersion and clusterMaxVersion
+ *         differ, {@code false} if they are equal
+ */
+ public static boolean hasMultipleLiveMajorVersions()
+ {
+     ClusterMetadata metadata = ClusterMetadata.current();
+     int maxMajorVersion = metadata.directory.clusterMaxVersion.cassandraVersion.major;
+     int minMajorVersion = metadata.directory.clusterMinVersion.cassandraVersion.major;
+     return maxMajorVersion != minMajorVersion;
+ }
+
@VisibleForTesting
protected static TreeSet<UUID> getHostIdsInCurrentRing(RepairType
repairType, Collection<NodeAddresses> allNodesInRing)
{
diff --git a/src/java/org/apache/cassandra/service/AutoRepairService.java
b/src/java/org/apache/cassandra/service/AutoRepairService.java
index db1b29c389..463629f058 100644
--- a/src/java/org/apache/cassandra/service/AutoRepairService.java
+++ b/src/java/org/apache/cassandra/service/AutoRepairService.java
@@ -102,6 +102,7 @@ public class AutoRepairService implements
AutoRepairServiceMBean
appendConfig(sb, "repair_check_interval",
config.getRepairCheckInterval());
appendConfig(sb, "repair_task_min_duration",
config.getRepairTaskMinDuration());
appendConfig(sb, "history_clear_delete_hosts_buffer_interval",
config.getAutoRepairHistoryClearDeleteHostsBufferInterval());
+ appendConfig(sb, "mixed_major_version_repair_enabled",
config.getMixedMajorVersionRepairEnabled());
for (RepairType repairType : RepairType.values())
{
sb.append(formatRepairTypeConfig(repairType, config));
@@ -271,6 +272,12 @@ public class AutoRepairService implements
AutoRepairServiceMBean
config.setRepairRetryBackoff(RepairType.parse(repairType), interval);
}
+ @Override
+ public void setMixedMajorVersionRepairEnabled(boolean enabled)
+ {
+ config.setMixedMajorVersionRepairEnabled(enabled);
+ }
+
private String formatRepairTypeConfig(RepairType repairType,
AutoRepairConfig config)
{
StringBuilder sb = new StringBuilder();
diff --git a/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
b/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
index 181c6008f5..e4d554dd98 100644
--- a/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
+++ b/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
@@ -74,4 +74,6 @@ public interface AutoRepairServiceMBean
public void setAutoRepairMaxRetriesCount(String repairType, int retries);
public void setAutoRepairRetryBackoff(String repairType, String interval);
+
+ public void setMixedMajorVersionRepairEnabled(boolean enabled);
}
diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java
b/src/java/org/apache/cassandra/tools/NodeProbe.java
index dcab5048fa..b59655134a 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java
@@ -2677,6 +2677,11 @@ public class NodeProbe implements AutoCloseable
{
return grProxy;
}
+
+ public void setMixedMajorVersionRepairEnabled(boolean enabled)
+ {
+ autoRepairProxy.setMixedMajorVersionRepairEnabled(enabled);
+ }
}
class ColumnFamilyStoreMBeanIterator implements Iterator<Map.Entry<String,
ColumnFamilyStoreMBean>>
diff --git
a/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
b/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
index e3343d83e3..32b0c6ea4f 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
@@ -53,7 +53,7 @@ public class SetAutoRepairConfig extends AbstractCommand
"|allow_parallel_replica_repair|allow_parallel_repair_across_schedules" +
"|materialized_view_repair_enabled|repair_max_retries" +
"|repair_retry_backoff|repair_session_timeout|min_repair_task_duration" +
-
"|repair_by_keyspace|token_range_splitter.<property>]" })
+
"|repair_by_keyspace|mixed_major_version_repair_enabled|token_range_splitter.<property>]"
})
public String autorepairParamType;
@Parameters(index = "1", description = "Autorepair param value", arity =
"0..1")
@@ -97,6 +97,9 @@ public class SetAutoRepairConfig extends AbstractCommand
case "min_repair_task_duration":
probe.setAutoRepairMinRepairTaskDuration(paramVal);
return;
+ case "mixed_major_version_repair_enabled":
+
probe.setMixedMajorVersionRepairEnabled(Boolean.parseBoolean(paramVal));
+ return;
default:
// proceed to options that require --repair-type option
break;
diff --git
a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
index b606a94fc4..0acd6895b8 100644
---
a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
+++
b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
@@ -797,10 +797,15 @@ public class ClusterMetadataTestHelper
}
public static void addEndpoint(int i)
+ {
+ addEndpoint(i, NodeVersion.CURRENT);
+ }
+
+ public static void addEndpoint(int i, NodeVersion nodeVersion)
{
try
{
- addEndpoint(InetAddressAndPort.getByName("127.0.0." + i), new
Murmur3Partitioner.LongToken(i));
+ addEndpoint(InetAddressAndPort.getByName("127.0.0." + i), new
Murmur3Partitioner.LongToken(i), nodeVersion);
}
catch (UnknownHostException e)
{
@@ -810,7 +815,12 @@ public class ClusterMetadataTestHelper
public static void addEndpoint(InetAddressAndPort endpoint, Token t)
{
- addEndpoint(endpoint, t, "dc1", "rack1");
+ addEndpoint(endpoint, t, NodeVersion.CURRENT);
+ }
+
+ public static void addEndpoint(InetAddressAndPort endpoint, Token t,
NodeVersion nodeVersion)
+ {
+ addEndpoint(endpoint, t, "dc1", "rack1", nodeVersion);
}
public static void addEndpoint(InetAddressAndPort endpoint,
Collection<Token> tokens)
@@ -830,15 +840,26 @@ public class ClusterMetadataTestHelper
public static void addEndpoint(InetAddressAndPort endpoint, Token t,
String dc, String rack)
{
- addEndpoint(endpoint, Collections.singleton(t), dc, rack);
+ addEndpoint(endpoint, Collections.singleton(t), dc, rack,
NodeVersion.CURRENT);
+ }
+
+ public static void addEndpoint(InetAddressAndPort endpoint, Token t,
String dc, String rack, NodeVersion nodeVersion)
+ {
+ addEndpoint(endpoint, Collections.singleton(t), dc, rack, nodeVersion);
}
public static void addEndpoint(InetAddressAndPort endpoint,
Collection<Token> t, String dc, String rack)
+ {
+ addEndpoint(endpoint, t, dc, rack, NodeVersion.CURRENT);
+ }
+
+ public static void addEndpoint(InetAddressAndPort endpoint,
Collection<Token> t, String dc, String rack,
+ NodeVersion nodeVersion)
{
try
{
Location l = new Location(dc, rack);
- commit(new Register(addr(endpoint), l, NodeVersion.CURRENT));
+ commit(new Register(addr(endpoint), l, nodeVersion));
if (endpoint.equals(FBUtilities.getBroadcastAddressAndPort()))
RegistrationStatus.instance.onRegistration();
lazyJoin(endpoint, new HashSet<>(t)).prepareJoin()
diff --git
a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
index d9723ea193..57974f6c8c 100644
--- a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
+++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
@@ -24,6 +24,8 @@ import java.util.TreeSet;
import java.util.UUID;
import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper;
import org.apache.cassandra.schema.SystemDistributedKeyspace;
import org.junit.Before;
import org.junit.BeforeClass;
@@ -44,6 +46,8 @@ import org.apache.cassandra.cql3.QueryProcessor;
import org.apache.cassandra.schema.SchemaConstants;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.tcm.membership.NodeAddresses;
+import org.apache.cassandra.tcm.membership.NodeVersion;
+import org.apache.cassandra.utils.CassandraVersion;
import org.apache.cassandra.utils.FBUtilities;
import static org.apache.cassandra.Util.setAutoRepairEnabled;
@@ -232,6 +236,52 @@ public class AutoRepairUtilsTest extends CQLTester
assertTrue(hosts.contains(hostId));
}
+ @Test
+ public void testHasMultipleLiveMajorVersionsWithSingleNode()
+ {
+ boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+ assertFalse(result);
+ }
+
+ @Test
+ public void
testHasMultipleLiveMajorVersionsWithMultipleNodesOfSameVersion()
+ {
+ ClusterMetadataTestHelper.addEndpoint(2);
+ // A second node registered with NodeVersion.CURRENT has just been added
+ // With every node on the same major version, this should return false
+ boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+ assertFalse(result);
+ }
+
+ @Test
+ public void
testHasMultipleLiveMajorVersionsWithMultipleNodesOfSameMajorVersionDifferentMinorVersions()
+ {
+ // add a second node with the current cassandra major version, but
a different minor version
+ CassandraVersion differentCassandraVersion = new CassandraVersion(
+ String.format("%d.%d",
+ NodeVersion.CURRENT.cassandraVersion.major,
+ NodeVersion.CURRENT.cassandraVersion.minor+1));
+ ClusterMetadataTestHelper.addEndpoint(2, new NodeVersion(
+ differentCassandraVersion,
+ NodeVersion.CURRENT_METADATA_VERSION));
+ // With the same major version but different minor versions, this
function should still return false
+ boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+ assertFalse(result);
+ }
+
+ @Test
+ public void
testHasMultipleLiveMajorVersionsWithMultipleNodesOfDifferentMajorVersions()
+ {
+ // add a second node whose cassandra major version is one lower than the current one
+ CassandraVersion differentCassandraVersion = new CassandraVersion(
+ String.format("%d.%d", NodeVersion.CURRENT.cassandraVersion.major
- 1, 0));
+ ClusterMetadataTestHelper.addEndpoint(2, new
NodeVersion(differentCassandraVersion,
+
NodeVersion.CURRENT_METADATA_VERSION));
+ // With different major versions, we should see this function return
true
+ boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+ assertTrue(result);
+ }
+
@Test
public void testGetHostWithLongestUnrepairTime()
{
diff --git
a/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
b/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
index 4ea9516e8e..640d952655 100644
--- a/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
+++ b/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
@@ -268,7 +268,8 @@ public class SetAutoRepairConfigTest
forEachRepairType("ignore_dcs", "dc1,dc2", (type) -> verify(probe,
times(1)).setAutoRepairIgnoreDCs(type.name(), ImmutableSet.of("dc1", "dc2"))),
forEachRepairType("token_range_splitter.max_bytes_per_schedule",
"500GiB", (type) -> verify(probe,
times(1)).setAutoRepairTokenRangeSplitterParameter(type.name(),
"max_bytes_per_schedule", "500GiB")),
forEachRepairType("repair_max_retries", "3", (type) ->
verify(probe, times(1)).setAutoRepairMaxRetriesCount(type.name(), 3)),
- forEachRepairType("repair_retry_backoff", "60s", (type) ->
verify(probe, times(1)).setAutoRepairRetryBackoff(type.name(), "60s"))
+ forEachRepairType("repair_retry_backoff", "60s", (type) ->
verify(probe, times(1)).setAutoRepairRetryBackoff(type.name(), "60s")),
+ forEachRepairType("mixed_major_version_repair_enabled", "false",
(type) -> verify(probe, times(1)).setMixedMajorVersionRepairEnabled(false))
).flatMap(Function.identity()).collect(Collectors.toList());
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]