This is an automated email from the ASF dual-hosted git repository.

jchovatia pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 965a39166c Stop repair scheduler if two major versions are detected
965a39166c is described below

commit 965a39166cf22f020065578407ca6c585b4a5132
Author: Himanshu Jindal <[email protected]>
AuthorDate: Tue Aug 12 15:37:15 2025 -0700

    Stop repair scheduler if two major versions are detected
    
    patch by Himanshu Jindal; reviewed by Jaydeepkumar Chovatia, Andy Tolbert 
for CASSANDRA-20048
---
 CHANGES.txt                                        |  1 +
 conf/cassandra.yaml                                |  3 ++
 conf/cassandra_latest.yaml                         |  3 ++
 .../pages/managing/operating/auto_repair.adoc      |  4 ++
 .../cassandra/repair/autorepair/AutoRepair.java    |  5 +++
 .../repair/autorepair/AutoRepairConfig.java        | 17 ++++++++
 .../repair/autorepair/AutoRepairUtils.java         | 15 +++++++
 .../cassandra/service/AutoRepairService.java       |  7 +++
 .../cassandra/service/AutoRepairServiceMBean.java  |  2 +
 src/java/org/apache/cassandra/tools/NodeProbe.java |  5 +++
 .../tools/nodetool/SetAutoRepairConfig.java        |  5 ++-
 .../test/log/ClusterMetadataTestHelper.java        | 29 +++++++++++--
 .../repair/autorepair/AutoRepairUtilsTest.java     | 50 ++++++++++++++++++++++
 .../tools/nodetool/SetAutoRepairConfigTest.java    |  3 +-
 14 files changed, 143 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index a804fe506b..8cbd1e173b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 5.1
+ * Stop repair scheduler if two major versions are detected (CASSANDRA-20048)
  * Optimize audit logic for batch operations especially when audit is not 
enabled for DML (CASSANDRA-20885)
  * Implement nodetool history (CASSANDRA-20851)
  * Expose StorageService.dropPreparedStatements via JMX (CASSANDRA-20870)
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index acb0429476..cfa1b7ff9e 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -2782,6 +2782,9 @@ storage_compatibility_mode: NONE
 #   # The scheduler needs to adjust its order when nodes leave the ring. 
Deleted hosts are tracked in metadata
 #   # for a specified duration to ensure they are indeed removed before 
adjustments are made to the schedule.
 #   history_clear_delete_hosts_buffer_interval: 2h
+#   # By default repair is disabled if there are mixed major versions detected 
- which would happen
+#   # if a major version upgrade is being performed on the cluster, but a user 
can enable it using this flag
+#   mixed_major_version_repair_enabled: false
 #   # NOTE: Each of the below settings can be overridden per repair type under 
repair_type_overrides
 #   global_settings:
 #     # If true, attempts to group tables in the same keyspace into one 
repair; otherwise, each table is repaired
diff --git a/conf/cassandra_latest.yaml b/conf/cassandra_latest.yaml
index c987a0ce42..f426048431 100644
--- a/conf/cassandra_latest.yaml
+++ b/conf/cassandra_latest.yaml
@@ -2467,6 +2467,9 @@ storage_compatibility_mode: NONE
 #   # The scheduler needs to adjust its order when nodes leave the ring. 
Deleted hosts are tracked in metadata
 #   # for a specified duration to ensure they are indeed removed before 
adjustments are made to the schedule.
 #   history_clear_delete_hosts_buffer_interval: 2h
+#   # By default repair is disabled if there are mixed major versions detected 
- which would happen
+#   # if a major version upgrade is being performed on the cluster, but a user 
can enable it using this flag
+#   mixed_major_version_repair_enabled: false
 #   # NOTE: Each of the below settings can be overridden per repair type under 
repair_type_overrides
 #   global_settings:
 #     # If true, attempts to group tables in the same keyspace into one 
repair; otherwise, each table is repaired
diff --git a/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc 
b/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
index bd9dad5aa7..e989c49d2a 100644
--- a/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
+++ b/doc/modules/cassandra/pages/managing/operating/auto_repair.adoc
@@ -167,6 +167,10 @@ is time to schedule repairs.
 | history_clear_delete_hosts_buffer_interval | 2h | The scheduler needs to 
adjust its order when nodes leave the ring.
 Deleted hosts are tracked in metadata for a specified duration to ensure they 
are indeed removed before adjustments
 are made to the schedule.
+| mixed_major_version_repair_enabled | false | Enable/Disable running repairs 
on the cluster when there are mixed
+major versions detected, which usually occurs when the cluster is being 
upgraded. Repair between nodes of
+different major versions is not tested, so enabling this may lead to 
data compatibility issues.
+It is strongly discouraged to set this to true without doing extensive testing 
beforehand.
 |===
 
 
diff --git a/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java 
b/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
index 10d18fae63..031b6352bf 100644
--- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
+++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepair.java
@@ -165,6 +165,11 @@ public class AutoRepair
             logger.debug("Auto-repair is disabled for repair type {}", 
repairType);
             return;
         }
+        if (!config.isMixedMajorVersionRepairEnabled() && 
AutoRepairUtils.hasMultipleLiveMajorVersions())
+        {
+            logger.info("Auto-repair is disabled when nodes in the cluster 
have different major versions");
+            return;
+        }
         AutoRepairService.instance.checkCanRun(repairType);
         AutoRepairState repairState = repairStates.get(repairType);
         try
diff --git 
a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java 
b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
index 045e6d21a8..7285af7948 100644
--- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
+++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairConfig.java
@@ -59,6 +59,8 @@ public class AutoRepairConfig implements Serializable
     // Minimum duration for the execution of a single repair task. This 
prevents the scheduler from overwhelming
     // the node by scheduling too many repair tasks in a short period of time.
     public volatile DurationSpec.LongSecondsBound repair_task_min_duration = 
new DurationSpec.LongSecondsBound("5s");
+    // by default repair is disabled if there are mixed major versions 
detected, but you can enable it using this flag
+    public volatile boolean mixed_major_version_repair_enabled = false;
 
     // global_settings overides Options.defaultOptions for all repair types
     public volatile Options global_settings;
@@ -149,6 +151,11 @@ public class AutoRepairConfig implements Serializable
         this.enabled = enabled;
     }
 
+    public boolean isMixedMajorVersionRepairEnabled()
+    {
+        return mixed_major_version_repair_enabled;
+    }
+
     public DurationSpec.IntSecondsBound 
getAutoRepairHistoryClearDeleteHostsBufferInterval()
     {
         return history_clear_delete_hosts_buffer_interval;
@@ -366,6 +373,16 @@ public class AutoRepairConfig implements Serializable
         getOptions(repairType).repair_retry_backoff = new 
DurationSpec.LongSecondsBound(interval);
     }
 
+    public boolean getMixedMajorVersionRepairEnabled()
+    {
+        return this.mixed_major_version_repair_enabled;
+    }
+
+    public void setMixedMajorVersionRepairEnabled(boolean enabled)
+    {
+        this.mixed_major_version_repair_enabled = enabled;
+    }
+
     @VisibleForTesting
     static IAutoRepairTokenRangeSplitter 
newAutoRepairTokenRangeSplitter(RepairType repairType, ParameterizedClass 
parameterizedClass) throws ConfigurationException
     {
diff --git 
a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java 
b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
index 6da487e5e0..d0da66c6ff 100644
--- a/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
+++ b/src/java/org/apache/cassandra/repair/autorepair/AutoRepairUtils.java
@@ -425,6 +425,21 @@ public class AutoRepairUtils
         return null;
     }
 
+    /**
+     * Checks whether the cluster has multiple major versions
+     * @return
+     *  true if more than one major version is detected
+     *  false if only one major version is detected
+     *
+     */
+    public static boolean hasMultipleLiveMajorVersions()
+    {
+        ClusterMetadata metadata = ClusterMetadata.current();
+        int maxMajorVersion = 
ClusterMetadata.current().directory.clusterMaxVersion.cassandraVersion.major;
+        int minMajorVersion = 
ClusterMetadata.current().directory.clusterMinVersion.cassandraVersion.major;
+        return maxMajorVersion != minMajorVersion;
+    }
+
     @VisibleForTesting
     protected static TreeSet<UUID> getHostIdsInCurrentRing(RepairType 
repairType, Collection<NodeAddresses> allNodesInRing)
     {
diff --git a/src/java/org/apache/cassandra/service/AutoRepairService.java 
b/src/java/org/apache/cassandra/service/AutoRepairService.java
index db1b29c389..463629f058 100644
--- a/src/java/org/apache/cassandra/service/AutoRepairService.java
+++ b/src/java/org/apache/cassandra/service/AutoRepairService.java
@@ -102,6 +102,7 @@ public class AutoRepairService implements 
AutoRepairServiceMBean
         appendConfig(sb, "repair_check_interval", 
config.getRepairCheckInterval());
         appendConfig(sb, "repair_task_min_duration", 
config.getRepairTaskMinDuration());
         appendConfig(sb, "history_clear_delete_hosts_buffer_interval", 
config.getAutoRepairHistoryClearDeleteHostsBufferInterval());
+        appendConfig(sb, "mixed_major_version_repair_enabled", 
config.getMixedMajorVersionRepairEnabled());
         for (RepairType repairType : RepairType.values())
         {
             sb.append(formatRepairTypeConfig(repairType, config));
@@ -271,6 +272,12 @@ public class AutoRepairService implements 
AutoRepairServiceMBean
         config.setRepairRetryBackoff(RepairType.parse(repairType), interval);
     }
 
+    @Override
+    public void setMixedMajorVersionRepairEnabled(boolean enabled)
+    {
+        config.setMixedMajorVersionRepairEnabled(enabled);
+    }
+
     private String formatRepairTypeConfig(RepairType repairType, 
AutoRepairConfig config)
     {
         StringBuilder sb = new StringBuilder();
diff --git a/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java 
b/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
index 181c6008f5..e4d554dd98 100644
--- a/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
+++ b/src/java/org/apache/cassandra/service/AutoRepairServiceMBean.java
@@ -74,4 +74,6 @@ public interface AutoRepairServiceMBean
     public void setAutoRepairMaxRetriesCount(String repairType, int retries);
 
     public void setAutoRepairRetryBackoff(String repairType, String interval);
+
+    public void setMixedMajorVersionRepairEnabled(boolean enabled);
 }
diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java 
b/src/java/org/apache/cassandra/tools/NodeProbe.java
index dcab5048fa..b59655134a 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java
@@ -2677,6 +2677,11 @@ public class NodeProbe implements AutoCloseable
     {
         return grProxy;
     }
+
+    public void setMixedMajorVersionRepairEnabled(boolean enabled)
+    {
+        autoRepairProxy.setMixedMajorVersionRepairEnabled(enabled);
+    }
 }
 
 class ColumnFamilyStoreMBeanIterator implements Iterator<Map.Entry<String, 
ColumnFamilyStoreMBean>>
diff --git 
a/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java 
b/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
index e3343d83e3..32b0c6ea4f 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/SetAutoRepairConfig.java
@@ -53,7 +53,7 @@ public class SetAutoRepairConfig extends AbstractCommand
                                                           
"|allow_parallel_replica_repair|allow_parallel_repair_across_schedules" +
                                                           
"|materialized_view_repair_enabled|repair_max_retries" +
                                                           
"|repair_retry_backoff|repair_session_timeout|min_repair_task_duration" +
-                                                          
"|repair_by_keyspace|token_range_splitter.<property>]" })
+                                                          
"|repair_by_keyspace|mixed_major_version_repair_enabled|token_range_splitter.<property>]"
 })
     public String autorepairParamType;
 
     @Parameters(index = "1", description = "Autorepair param value", arity = 
"0..1")
@@ -97,6 +97,9 @@ public class SetAutoRepairConfig extends AbstractCommand
             case "min_repair_task_duration":
                 probe.setAutoRepairMinRepairTaskDuration(paramVal);
                 return;
+            case "mixed_major_version_repair_enabled":
+                
probe.setMixedMajorVersionRepairEnabled(Boolean.parseBoolean(paramVal));
+                return;
             default:
                 // proceed to options that require --repair-type option
                 break;
diff --git 
a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
 
b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
index b606a94fc4..0acd6895b8 100644
--- 
a/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
+++ 
b/test/distributed/org/apache/cassandra/distributed/test/log/ClusterMetadataTestHelper.java
@@ -797,10 +797,15 @@ public class ClusterMetadataTestHelper
     }
 
     public static void addEndpoint(int i)
+    {
+        addEndpoint(i, NodeVersion.CURRENT);
+    }
+
+    public static void addEndpoint(int i, NodeVersion nodeVersion)
     {
         try
         {
-            addEndpoint(InetAddressAndPort.getByName("127.0.0." + i), new 
Murmur3Partitioner.LongToken(i));
+            addEndpoint(InetAddressAndPort.getByName("127.0.0." + i), new 
Murmur3Partitioner.LongToken(i), nodeVersion);
         }
         catch (UnknownHostException e)
         {
@@ -810,7 +815,12 @@ public class ClusterMetadataTestHelper
 
     public static void addEndpoint(InetAddressAndPort endpoint, Token t)
     {
-        addEndpoint(endpoint, t, "dc1", "rack1");
+        addEndpoint(endpoint, t, NodeVersion.CURRENT);
+    }
+
+    public static void addEndpoint(InetAddressAndPort endpoint, Token t, 
NodeVersion nodeVersion)
+    {
+        addEndpoint(endpoint, t, "dc1", "rack1", nodeVersion);
     }
 
     public static void addEndpoint(InetAddressAndPort endpoint, 
Collection<Token> tokens)
@@ -830,15 +840,26 @@ public class ClusterMetadataTestHelper
 
     public static void addEndpoint(InetAddressAndPort endpoint, Token t, 
String dc, String rack)
     {
-        addEndpoint(endpoint, Collections.singleton(t), dc, rack);
+        addEndpoint(endpoint, Collections.singleton(t), dc, rack, 
NodeVersion.CURRENT);
+    }
+
+    public static void addEndpoint(InetAddressAndPort endpoint, Token t, 
String dc, String rack, NodeVersion nodeVersion)
+    {
+        addEndpoint(endpoint, Collections.singleton(t), dc, rack, nodeVersion);
     }
 
     public static void addEndpoint(InetAddressAndPort endpoint, 
Collection<Token> t, String dc, String rack)
+    {
+        addEndpoint(endpoint, t, dc, rack, NodeVersion.CURRENT);
+    }
+
+    public static void addEndpoint(InetAddressAndPort endpoint, 
Collection<Token> t, String dc, String rack,
+                                   NodeVersion nodeVersion)
     {
         try
         {
             Location l = new Location(dc, rack);
-            commit(new Register(addr(endpoint), l, NodeVersion.CURRENT));
+            commit(new Register(addr(endpoint), l, nodeVersion));
             if (endpoint.equals(FBUtilities.getBroadcastAddressAndPort()))
                 RegistrationStatus.instance.onRegistration();
             lazyJoin(endpoint, new HashSet<>(t)).prepareJoin()
diff --git 
a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java 
b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
index d9723ea193..57974f6c8c 100644
--- a/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
+++ b/test/unit/org/apache/cassandra/repair/autorepair/AutoRepairUtilsTest.java
@@ -24,6 +24,8 @@ import java.util.TreeSet;
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.distributed.test.log.ClusterMetadataTestHelper;
 import org.apache.cassandra.schema.SystemDistributedKeyspace;
 import org.junit.Before;
 import org.junit.BeforeClass;
@@ -44,6 +46,8 @@ import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.tcm.membership.NodeAddresses;
+import org.apache.cassandra.tcm.membership.NodeVersion;
+import org.apache.cassandra.utils.CassandraVersion;
 import org.apache.cassandra.utils.FBUtilities;
 
 import static org.apache.cassandra.Util.setAutoRepairEnabled;
@@ -232,6 +236,52 @@ public class AutoRepairUtilsTest extends CQLTester
         assertTrue(hosts.contains(hostId));
     }
 
+    @Test
+    public void testHasMultipleLiveMajorVersionsWithSingleNode()
+    {
+        boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+        assertFalse(result);
+    }
+
+    @Test
+    public void 
testHasMultipleLiveMajorVersionsWithMultipleNodesOfSameVersion()
+    {
+        ClusterMetadataTestHelper.addEndpoint(2);
+        // The added node registers with NodeVersion.CURRENT, so no additional
+        // major version is introduced and this should return false
+        boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+        assertFalse(result);
+    }
+
+    @Test
+    public void 
testHasMultipleLiveMajorVersionsWithMultipleNodesOfSameMajorVersionDifferentMinorVersions()
+    {
+        // add two nodes with the current cassandra major version, but 
different minor version
+        CassandraVersion differentCassandraVersion = new CassandraVersion(
+            String.format("%d.%d",
+                          NodeVersion.CURRENT.cassandraVersion.major,
+                          NodeVersion.CURRENT.cassandraVersion.minor+1));
+        ClusterMetadataTestHelper.addEndpoint(2, new NodeVersion(
+             differentCassandraVersion,
+             NodeVersion.CURRENT_METADATA_VERSION));
+        // With the same major version, but different minor versions, we 
should still see this function return false
+        boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+        assertFalse(result);
+    }
+
+    @Test
+    public void 
testHasMultipleLiveMajorVersionsWithMultipleNodesOfDifferentMajorVersions()
+    {
+        // add two nodes with different cassandra major versions
+        CassandraVersion differentCassandraVersion = new CassandraVersion(
+            String.format("%d.%d", NodeVersion.CURRENT.cassandraVersion.major 
- 1, 0));
+        ClusterMetadataTestHelper.addEndpoint(2, new 
NodeVersion(differentCassandraVersion,
+                                                                 
NodeVersion.CURRENT_METADATA_VERSION));
+        // With different major versions, we should see this function return 
true
+        boolean result = AutoRepairUtils.hasMultipleLiveMajorVersions();
+        assertTrue(result);
+    }
+
     @Test
     public void testGetHostWithLongestUnrepairTime()
     {
diff --git 
a/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java 
b/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
index 4ea9516e8e..640d952655 100644
--- a/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
+++ b/test/unit/org/apache/cassandra/tools/nodetool/SetAutoRepairConfigTest.java
@@ -268,7 +268,8 @@ public class SetAutoRepairConfigTest
             forEachRepairType("ignore_dcs", "dc1,dc2", (type) -> verify(probe, 
times(1)).setAutoRepairIgnoreDCs(type.name(), ImmutableSet.of("dc1", "dc2"))),
             forEachRepairType("token_range_splitter.max_bytes_per_schedule", 
"500GiB", (type) -> verify(probe, 
times(1)).setAutoRepairTokenRangeSplitterParameter(type.name(), 
"max_bytes_per_schedule", "500GiB")),
             forEachRepairType("repair_max_retries", "3", (type) -> 
verify(probe, times(1)).setAutoRepairMaxRetriesCount(type.name(), 3)),
-            forEachRepairType("repair_retry_backoff", "60s", (type) -> 
verify(probe, times(1)).setAutoRepairRetryBackoff(type.name(), "60s"))
+            forEachRepairType("repair_retry_backoff", "60s", (type) -> 
verify(probe, times(1)).setAutoRepairRetryBackoff(type.name(), "60s")),
+            forEachRepairType("mixed_major_version_repair_enabled", "false", 
(type) -> verify(probe, times(1)).setMixedMajorVersionRepairEnabled(false))
             ).flatMap(Function.identity()).collect(Collectors.toList());
         }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to