This is an automated email from the ASF dual-hosted git repository.

smiklosovic pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git

commit a2d78be028eec99fe39bb4ad8f98a8f5ec04481d
Author: Stefan Miklosovic <[email protected]>
AuthorDate: Thu Feb 26 10:56:38 2026 +0100

    Replace manual referencing with ColumnFamilyStore.selectAndReference when training a dictionary
    
    patch by Stefan Miklosovic; reviewed by Yifan Cai for CASSANDRA-21188
---
 CHANGES.txt                                        |  1 +
 .../compression/CompressionDictionaryManager.java  | 31 +++++++-----
 .../CompressionDictionaryScheduler.java            | 58 ++++------------------
 .../ICompressionDictionaryScheduler.java           |  8 ++-
 .../db/compression/SSTableChunkSampler.java        |  4 +-
 .../CompressionDictionaryOrphanedTest.java         |  7 ++-
 .../CompressionDictionarySchedulerTest.java        | 16 +++---
 ...CompressionDictionaryTrainingFrequencyTest.java |  6 ++-
 .../db/compression/SSTableChunkSamplerTest.java    | 55 ++++++++++----------
 .../ExportImportListCompressionDictionaryTest.java |  6 ++-
 10 files changed, 88 insertions(+), 104 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 701f7bf02c..ac6f6df701 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 5.1
+ * Replace manual referencing with ColumnFamilyStore.selectAndReference when 
training a dictionary (CASSANDRA-21188)
  * Forbid nodes upgrading to a version which cannot read existing log entries 
(CASSANDRA-21174)
  * Introduce a check for minimum time to pass to train or import a compression 
dictionary from the last one (CASSANDRA-21179)
  * Allow overriding compaction strategy parameters during startup 
(CASSANDRA-21169)
diff --git 
a/src/java/org/apache/cassandra/db/compression/CompressionDictionaryManager.java
 
b/src/java/org/apache/cassandra/db/compression/CompressionDictionaryManager.java
index 1207ed034f..6fc7c3d8cd 100644
--- 
a/src/java/org/apache/cassandra/db/compression/CompressionDictionaryManager.java
+++ 
b/src/java/org/apache/cassandra/db/compression/CompressionDictionaryManager.java
@@ -22,7 +22,6 @@ import java.time.Instant;
 import java.time.temporal.ChronoUnit;
 import java.util.List;
 import java.util.Map;
-import java.util.Set;
 
 import javax.annotation.Nullable;
 import javax.management.openmbean.CompositeData;
@@ -39,7 +38,8 @@ import org.apache.cassandra.config.DurationSpec;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import 
org.apache.cassandra.db.compression.CompressionDictionary.LightweightCompressionDictionary;
 import 
org.apache.cassandra.db.compression.CompressionDictionaryDetailsTabularData.CompressionDictionaryDataObject;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.schema.SystemDistributedKeyspace;
 import org.apache.cassandra.utils.FBUtilities;
@@ -53,6 +53,7 @@ import static 
org.apache.cassandra.io.compress.IDictionaryCompressor.DEFAULT_TRA
 import static 
org.apache.cassandra.io.compress.IDictionaryCompressor.TRAINING_MAX_DICTIONARY_SIZE_PARAMETER_NAME;
 import static 
org.apache.cassandra.io.compress.IDictionaryCompressor.TRAINING_MAX_TOTAL_SAMPLE_SIZE_PARAMETER_NAME;
 import static 
org.apache.cassandra.io.compress.IDictionaryCompressor.TRAINING_MIN_FREQUENCY_PARAMETER_NAME;
+import static 
org.apache.cassandra.schema.SystemDistributedKeyspace.retrieveLightweightLatestCompressionDictionary;
 
 public class CompressionDictionaryManager implements 
CompressionDictionaryManagerMBean,
                                                      
ICompressionDictionaryCache,
@@ -225,32 +226,38 @@ public class CompressionDictionaryManager implements 
CompressionDictionaryManage
         // resolve training config and fail fast when invalid, so we do not 
reach logic which would e.g. flush unnecessarily.
         CompressionDictionaryTrainingConfig trainingConfig = 
createTrainingConfig(parameters);
 
-        LightweightCompressionDictionary dictionary = 
SystemDistributedKeyspace.retrieveLightweightLatestCompressionDictionary(columnFamilyStore.getKeyspaceName(),
-                                                                               
                                                
columnFamilyStore.getTableName(),
-                                                                               
                                                
columnFamilyStore.metadata.id.toLongString());
+        LightweightCompressionDictionary dictionary = 
retrieveLightweightLatestCompressionDictionary(columnFamilyStore.getKeyspaceName(),
+                                                                               
                      columnFamilyStore.getTableName(),
+                                                                               
                      columnFamilyStore.metadata.id.toLongString());
 
         checkTrainingFrequency(dictionary);
 
         // SSTable-based training: sample from existing SSTables
-        Set<SSTableReader> sstables = columnFamilyStore.getLiveSSTables();
-        if (sstables.isEmpty())
+
+        // this is not closed here but in training runnable when finished
+        // also, if view is empty, and we throw just below because of it then
+        // there is nothing to "release" so close is not necessary
+        ColumnFamilyStore.RefViewFragment refViewFragment = 
columnFamilyStore.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
+
+        if (refViewFragment.sstables.isEmpty())
         {
             logger.info("No SSTables available for training in table {}.{}, 
flushing memtable first",
                         keyspaceName, tableName);
             
columnFamilyStore.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
-            sstables = columnFamilyStore.getLiveSSTables();
 
-            if (sstables.isEmpty())
+            refViewFragment = 
columnFamilyStore.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
+
+            if (refViewFragment.sstables.isEmpty())
             {
                 throw new IllegalStateException("No SSTables available for 
training in table " + keyspaceName + '.' + tableName + " after flush");
             }
         }
 
         logger.info("Starting SSTable-based training for {}.{} with {} 
SSTables",
-                    keyspaceName, tableName, sstables.size());
+                    keyspaceName, tableName, refViewFragment.sstables.size());
 
         trainer.start(trainingConfig);
-        scheduler.scheduleSSTableBasedTraining(trainer, sstables, 
trainingConfig, force);
+        scheduler.scheduleSSTableBasedTraining(trainer, refViewFragment, 
trainingConfig, force);
     }
 
     @Override
@@ -330,7 +337,7 @@ public class CompressionDictionaryManager implements 
CompressionDictionaryManage
 
         CompressionDictionary.DictId dictId = new 
CompressionDictionary.DictId(kind, dataObject.dictId);
 
-        LightweightCompressionDictionary latestCompressionDictionary = 
SystemDistributedKeyspace.retrieveLightweightLatestCompressionDictionary(keyspaceName,
 tableName, tableId);
+        LightweightCompressionDictionary latestCompressionDictionary = 
retrieveLightweightLatestCompressionDictionary(keyspaceName, tableName, 
tableId);
         if (latestCompressionDictionary != null)
         {
             if (latestCompressionDictionary.dictId.id > dictId.id)
diff --git 
a/src/java/org/apache/cassandra/db/compression/CompressionDictionaryScheduler.java
 
b/src/java/org/apache/cassandra/db/compression/CompressionDictionaryScheduler.java
index c6f5b6a922..f29dae2400 100644
--- 
a/src/java/org/apache/cassandra/db/compression/CompressionDictionaryScheduler.java
+++ 
b/src/java/org/apache/cassandra/db/compression/CompressionDictionaryScheduler.java
@@ -18,10 +18,6 @@
 
 package org.apache.cassandra.db.compression;
 
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
 import java.util.concurrent.ScheduledFuture;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -33,9 +29,8 @@ import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.schema.SystemDistributedKeyspace;
-import org.apache.cassandra.utils.concurrent.Ref;
 
 /**
  * Manages scheduled tasks for compression dictionary operations.
@@ -89,20 +84,21 @@ public class CompressionDictionaryScheduler implements 
ICompressionDictionarySch
 
     @Override
     public void scheduleSSTableBasedTraining(ICompressionDictionaryTrainer 
trainer,
-                                             Set<SSTableReader> sstables,
+                                             ColumnFamilyStore.RefViewFragment 
refViewFragment,
                                              
CompressionDictionaryTrainingConfig config,
                                              boolean force)
     {
         if (!manualTrainingInProgress.compareAndSet(false, true))
         {
+            refViewFragment.close();
             throw new IllegalStateException("Training already in progress for 
table " + keyspaceName + '.' + tableName);
         }
 
         logger.info("Starting SSTable-based dictionary training for {}.{} from 
{} SSTables",
-                    keyspaceName, tableName, sstables.size());
+                    keyspaceName, tableName, refViewFragment.sstables.size());
 
         // Run the SSTableSamplingTask asynchronously
-        SSTableSamplingTask task = new SSTableSamplingTask(sstables, trainer, 
config, force);
+        SSTableSamplingTask task = new SSTableSamplingTask(refViewFragment, 
trainer, config, force);
         ScheduledExecutors.nonPeriodicTasks.submit(task);
     }
 
@@ -166,41 +162,20 @@ public class CompressionDictionaryScheduler implements 
ICompressionDictionarySch
      */
     private class SSTableSamplingTask implements Runnable
     {
-        private final Set<SSTableReader> sstables;
+        private final ColumnFamilyStore.RefViewFragment refViewFragment;
         private final ICompressionDictionaryTrainer trainer;
         private final CompressionDictionaryTrainingConfig config;
-        private final List<Ref<SSTableReader>> sstableRefs;
         private final boolean force;
 
-        private SSTableSamplingTask(Set<SSTableReader> sstables,
+        private SSTableSamplingTask(ColumnFamilyStore.RefViewFragment 
refViewFragment,
                                     ICompressionDictionaryTrainer trainer,
                                     CompressionDictionaryTrainingConfig config,
                                     boolean force)
         {
+            this.refViewFragment = refViewFragment;
             this.trainer = trainer;
             this.config = config;
             this.force = force;
-
-            // Acquire references to all SSTables to prevent deletion during 
sampling
-            this.sstableRefs = new ArrayList<>();
-            Set<SSTableReader> referencedSSTables = new HashSet<>();
-
-            for (SSTableReader sstable : sstables)
-            {
-                Ref<SSTableReader> ref = sstable.tryRef();
-                if (ref != null)
-                {
-                    sstableRefs.add(ref);
-                    referencedSSTables.add(sstable);
-                }
-                else
-                {
-                    logger.debug("Couldn't acquire reference to SSTable {}. It 
may have been removed.",
-                                 sstable.descriptor);
-                }
-            }
-
-            this.sstables = referencedSSTables;
         }
 
         @Override
@@ -208,18 +183,11 @@ public class CompressionDictionaryScheduler implements 
ICompressionDictionarySch
         {
             try
             {
-                if (sstables.isEmpty())
-                {
-                    logger.warn("No SSTables available for sampling in {}.{}", 
keyspaceName, tableName);
-                    cancelManualTraining();
-                    return;
-                }
-
                 logger.info("Sampling chunks from {} SSTables for {}.{}",
-                            sstables.size(), keyspaceName, tableName);
+                            refViewFragment.sstables.size(), keyspaceName, 
tableName);
 
                 // Sample chunks from SSTables and add to trainer
-                SSTableChunkSampler.sampleFromSSTables(sstables, trainer, 
config);
+                
SSTableChunkSampler.sampleFromSSTables(refViewFragment.sstables, trainer, 
config);
 
                 logger.info("Completed sampling for {}.{}, now training 
dictionary",
                             keyspaceName, tableName);
@@ -247,11 +215,7 @@ public class CompressionDictionaryScheduler implements 
ICompressionDictionarySch
             }
             finally
             {
-                // Release all SSTable references
-                for (Ref<SSTableReader> ref : sstableRefs)
-                {
-                    ref.release();
-                }
+                refViewFragment.close();
             }
         }
     }
diff --git 
a/src/java/org/apache/cassandra/db/compression/ICompressionDictionaryScheduler.java
 
b/src/java/org/apache/cassandra/db/compression/ICompressionDictionaryScheduler.java
index 9a9ac64d8c..2101b538e3 100644
--- 
a/src/java/org/apache/cassandra/db/compression/ICompressionDictionaryScheduler.java
+++ 
b/src/java/org/apache/cassandra/db/compression/ICompressionDictionaryScheduler.java
@@ -18,9 +18,7 @@
 
 package org.apache.cassandra.db.compression;
 
-import java.util.Set;
-
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.db.ColumnFamilyStore.RefViewFragment;
 
 /**
  * Interface for managing scheduled tasks for compression dictionary 
operations.
@@ -41,13 +39,13 @@ public interface ICompressionDictionaryScheduler extends 
AutoCloseable
      * Schedules SSTable-based training that samples from existing SSTables.
      *
      * @param trainer the trainer to use
-     * @param sstables the set of SSTables to sample from
+     * @param refViewFragment the view of SSTables to sample from
      * @param config the training configuration
      * @param force force the dictionary training even if there are not enough 
samples
      * @throws IllegalStateException if training is already in progress
      */
     void scheduleSSTableBasedTraining(ICompressionDictionaryTrainer trainer,
-                                      Set<SSTableReader> sstables,
+                                      RefViewFragment refViewFragment,
                                       CompressionDictionaryTrainingConfig 
config,
                                       boolean force);
 
diff --git 
a/src/java/org/apache/cassandra/db/compression/SSTableChunkSampler.java 
b/src/java/org/apache/cassandra/db/compression/SSTableChunkSampler.java
index d65070062c..50b3798096 100644
--- a/src/java/org/apache/cassandra/db/compression/SSTableChunkSampler.java
+++ b/src/java/org/apache/cassandra/db/compression/SSTableChunkSampler.java
@@ -91,7 +91,7 @@ public class SSTableChunkSampler
      * @param trainer  the trainer to add samples to
      * @param config   the training configuration with sample size limits
      */
-    public static void sampleFromSSTables(Set<SSTableReader> sstables,
+    public static void sampleFromSSTables(List<SSTableReader> sstables,
                                           ICompressionDictionaryTrainer 
trainer,
                                           CompressionDictionaryTrainingConfig 
config) throws IOException
     {
@@ -125,7 +125,7 @@ public class SSTableChunkSampler
     /**
      * Builds SSTableChunkInfo objects for all SSTables and logs statistics.
      */
-    static List<SSTableChunkInfo> buildSSTableInfos(Set<SSTableReader> 
sstables,
+    static List<SSTableChunkInfo> buildSSTableInfos(List<SSTableReader> 
sstables,
                                                     
CompressionDictionaryTrainingConfig config)
     {
         List<SSTableChunkInfo> sstableInfos = new ArrayList<>();
diff --git 
a/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryOrphanedTest.java
 
b/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryOrphanedTest.java
index b42da12554..0a90d8aa82 100644
--- 
a/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryOrphanedTest.java
+++ 
b/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryOrphanedTest.java
@@ -112,7 +112,6 @@ public class CompressionDictionaryOrphanedTest extends 
CQLTester
         assertEquals(2, dicts.size());
         assertEquals(tableId, dicts.get(0).tableId);
         assertEquals(tableId, dicts.get(1).tableId);
-
     }
 
     private void assertOrphaned(String tableId)
@@ -159,6 +158,8 @@ public class CompressionDictionaryOrphanedTest extends 
CQLTester
         .contains(tableName);
     }
 
+    private static int batch = 1;
+
     private void createSSTables()
     {
         for (int file = 0; file < 10; file++)
@@ -166,12 +167,14 @@ public class CompressionDictionaryOrphanedTest extends 
CQLTester
             int batchSize = 1000;
             for (int i = 0; i < batchSize; i++)
             {
-                int index = i + file * batchSize;
+                int index = batch + (i + file * batchSize);
                 executeFormattedQuery(format("INSERT INTO %s.%s (id, data) 
VALUES (?, ?)", keyspace(), tableName),
                                       index, "test data " + index);
             }
 
             flush();
         }
+
+        batch++;
     }
 }
diff --git 
a/test/unit/org/apache/cassandra/db/compression/CompressionDictionarySchedulerTest.java
 
b/test/unit/org/apache/cassandra/db/compression/CompressionDictionarySchedulerTest.java
index e99a318358..b52062fbf6 100644
--- 
a/test/unit/org/apache/cassandra/db/compression/CompressionDictionarySchedulerTest.java
+++ 
b/test/unit/org/apache/cassandra/db/compression/CompressionDictionarySchedulerTest.java
@@ -18,9 +18,6 @@
 
 package org.apache.cassandra.db.compression;
 
-import java.util.HashSet;
-import java.util.Set;
-
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -29,7 +26,8 @@ import org.apache.cassandra.config.DataStorageSpec;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
 
 import static org.apache.cassandra.Util.spinUntilTrue;
 import static 
org.apache.cassandra.io.compress.IDictionaryCompressor.DEFAULT_TRAINING_MAX_DICTIONARY_SIZE_PARAMETER_VALUE;
@@ -66,11 +64,11 @@ public class CompressionDictionarySchedulerTest extends 
CQLTester
 
         try (CompressionDictionaryManager manager = 
cfs.compressionDictionaryManager())
         {
-            Set<SSTableReader> sstables = new HashSet<>();
+            ColumnFamilyStore.RefViewFragment refViewFragment = 
cfs.selectAndReference(View.select(SSTableSet.CANONICAL, (x) -> false));
             CompressionDictionaryTrainingConfig config = 
createSampleAllTrainingConfig(cfs);
 
             // Should not throw, but task will complete quickly with no 
SSTables
-            scheduler.scheduleSSTableBasedTraining(manager.trainer(), 
sstables, config, true);
+            scheduler.scheduleSSTableBasedTraining(manager.trainer(), 
refViewFragment, config, true);
             spinUntilTrue(() -> !scheduler.isManualTrainingRunning());
             assertThat(manager.getCurrent()).isNull();
         }
@@ -89,14 +87,14 @@ public class CompressionDictionarySchedulerTest extends 
CQLTester
         {
             createSSTables();
 
-            Set<SSTableReader> sstables = cfs.getLiveSSTables();
-            assertThat(sstables).isNotEmpty();
+            ColumnFamilyStore.RefViewFragment refViewFragment = 
cfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
+            assertThat(refViewFragment.sstables).isNotEmpty();
 
             CompressionDictionaryTrainingConfig config = 
createSampleAllTrainingConfig(cfs);
             manager.trainer().start(config);
 
             assertThat(manager.getCurrent()).as("There should be no dictionary 
at this step").isNull();
-            scheduler.scheduleSSTableBasedTraining(manager.trainer(), 
sstables, config, true);
+            scheduler.scheduleSSTableBasedTraining(manager.trainer(), 
refViewFragment, config, true);
 
             // Task should be scheduled
             assertThat(scheduler.isManualTrainingRunning()).isTrue();
diff --git 
a/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryTrainingFrequencyTest.java
 
b/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryTrainingFrequencyTest.java
index c7eb86737f..11bb2b17ca 100644
--- 
a/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryTrainingFrequencyTest.java
+++ 
b/test/unit/org/apache/cassandra/db/compression/CompressionDictionaryTrainingFrequencyTest.java
@@ -208,6 +208,8 @@ public class CompressionDictionaryTrainingFrequencyTest 
extends CQLTester
         result.assertOnCleanExit();
     }
 
+    private static int batch = 1;
+
     private void createSSTables()
     {
         for (int file = 0; file < 10; file++)
@@ -215,13 +217,15 @@ public class CompressionDictionaryTrainingFrequencyTest 
extends CQLTester
             int batchSize = 1000;
             for (int i = 0; i < batchSize; i++)
             {
-                int index = i + file * batchSize;
+                int index = batch * (i + file * batchSize);
                 executeFormattedQuery(format("INSERT INTO %s.%s (id, data) 
VALUES (?, ?)", keyspace(), tableName),
                                       index, "test data " + index);
             }
 
             flush();
         }
+
+        batch++;
     }
 
     private Pair<CompressionDictionaryDataObject, File> export() throws 
Throwable
diff --git 
a/test/unit/org/apache/cassandra/db/compression/SSTableChunkSamplerTest.java 
b/test/unit/org/apache/cassandra/db/compression/SSTableChunkSamplerTest.java
index 2fc44853a2..118a0c7274 100644
--- a/test/unit/org/apache/cassandra/db/compression/SSTableChunkSamplerTest.java
+++ b/test/unit/org/apache/cassandra/db/compression/SSTableChunkSamplerTest.java
@@ -27,6 +27,8 @@ import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import 
org.apache.cassandra.db.compression.SSTableChunkSampler.SSTableChunkInfo;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
 import static org.assertj.core.api.Assertions.assertThat;
@@ -118,19 +120,20 @@ public class SSTableChunkSamplerTest extends CQLTester
                                                                                
         .chunkSize(64 * 1024)
                                                                                
         .build();
 
-        Set<SSTableReader> sstables = cfs.getLiveSSTables();
-        assertThat(sstables).hasSizeGreaterThanOrEqualTo(3);
-
-        List<SSTableChunkInfo> sstableInfos = 
SSTableChunkSampler.buildSSTableInfos(sstables, config);
-        long totalChunks = sstableInfos.stream().mapToLong(info -> 
info.chunkCount).sum();
-        long targetChunkCount = 
SSTableChunkSampler.calculateTargetChunkCount(sstableInfos, totalChunks, 
config);
-
-        // Target should be based on maxTotalSampleSize divided by average 
chunk size
-        assertThat(targetChunkCount).isGreaterThan(0);
-        long totalDataSize = sstableInfos.stream().mapToLong(info -> 
info.dataLength).sum();
-        int averageChunkSize = (int) (totalDataSize / totalChunks);
-        long expectedTarget = config.maxTotalSampleSize / averageChunkSize;
-        assertThat(targetChunkCount).isEqualTo(expectedTarget);
+        try (ColumnFamilyStore.RefViewFragment refViewFragment = 
cfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL)))
+        {
+            
assertThat(refViewFragment.sstables).hasSizeGreaterThanOrEqualTo(3);
+            List<SSTableChunkInfo> sstableInfos = 
SSTableChunkSampler.buildSSTableInfos(refViewFragment.sstables, config);
+            long totalChunks = sstableInfos.stream().mapToLong(info -> 
info.chunkCount).sum();
+            long targetChunkCount = 
SSTableChunkSampler.calculateTargetChunkCount(sstableInfos, totalChunks, 
config);
+
+            // Target should be based on maxTotalSampleSize divided by average 
chunk size
+            assertThat(targetChunkCount).isGreaterThan(0);
+            long totalDataSize = sstableInfos.stream().mapToLong(info -> 
info.dataLength).sum();
+            int averageChunkSize = (int) (totalDataSize / totalChunks);
+            long expectedTarget = config.maxTotalSampleSize / averageChunkSize;
+            assertThat(targetChunkCount).isEqualTo(expectedTarget);
+        }
     }
 
     @Test
@@ -186,21 +189,23 @@ public class SSTableChunkSamplerTest extends CQLTester
         }
         flush();
 
-        Set<SSTableReader> sstables = cfs.getLiveSSTables();
-        assertThat(sstables).isNotEmpty();
+        try (ColumnFamilyStore.RefViewFragment refViewFragment = 
cfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL)))
+        {
+            assertThat(refViewFragment.sstables).isNotEmpty();
 
-        CompressionDictionaryTrainingConfig config = 
CompressionDictionaryTrainingConfig.builder()
-                                                                               
         .chunkSize(64 * 1024)
-                                                                               
         .build();
+            CompressionDictionaryTrainingConfig config = 
CompressionDictionaryTrainingConfig.builder()
+                                                                               
             .chunkSize(64 * 1024)
+                                                                               
             .build();
 
-        // Create a mock trainer that is not ready to sample
-        ICompressionDictionaryTrainer trainer = 
mock(ICompressionDictionaryTrainer.class, RETURNS_DEEP_STUBS);
-        
when(trainer.getTrainingState().getStatus()).thenReturn(ICompressionDictionaryTrainer.TrainingStatus.NOT_STARTED);
+            // Create a mock trainer that is not ready to sample
+            ICompressionDictionaryTrainer trainer = 
mock(ICompressionDictionaryTrainer.class, RETURNS_DEEP_STUBS);
+            
when(trainer.getTrainingState().getStatus()).thenReturn(ICompressionDictionaryTrainer.TrainingStatus.NOT_STARTED);
 
-        // Should throw IllegalStateException when trainer is not ready
-        assertThatThrownBy(() -> 
SSTableChunkSampler.sampleFromSSTables(sstables, trainer, config))
-        .isInstanceOf(IllegalStateException.class)
-        .hasMessageContaining("Trainer is not ready to accept samples");
+            // Should throw IllegalStateException when trainer is not ready
+            assertThatThrownBy(() -> 
SSTableChunkSampler.sampleFromSSTables(refViewFragment.sstables, trainer, 
config))
+            .isInstanceOf(IllegalStateException.class)
+            .hasMessageContaining("Trainer is not ready to accept samples");
+        }
     }
 
     @Test
diff --git 
a/test/unit/org/apache/cassandra/tools/nodetool/ExportImportListCompressionDictionaryTest.java
 
b/test/unit/org/apache/cassandra/tools/nodetool/ExportImportListCompressionDictionaryTest.java
index 204cf115e7..6792b3537a 100644
--- 
a/test/unit/org/apache/cassandra/tools/nodetool/ExportImportListCompressionDictionaryTest.java
+++ 
b/test/unit/org/apache/cassandra/tools/nodetool/ExportImportListCompressionDictionaryTest.java
@@ -234,6 +234,8 @@ public class ExportImportListCompressionDictionaryTest 
extends CQLTester
         .contains(table);
     }
 
+    private static int batch = 1;
+
     private void createSSTables()
     {
         for (int file = 0; file < 10; file++)
@@ -241,11 +243,13 @@ public class ExportImportListCompressionDictionaryTest 
extends CQLTester
             int batchSize = 1000;
             for (int i = 0; i < batchSize; i++)
             {
-                int index = i + file * batchSize;
+                int index = batch * (i + file * batchSize);
                 execute("INSERT INTO %s (id, data) VALUES (?, ?)", index, 
"test data " + index);
             }
 
             flush();
         }
+
+        batch++;
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to