Re: [PR] CASSANDRA-21129: Offline TCM dump tool [cassandra]

via GitHub Sun, 15 Feb 2026 06:39:44 -0800


dracarys09 commented on code in PR #4581:
URL: https://github.com/apache/cassandra/pull/4581#discussion_r2809424158



##########
src/java/org/apache/cassandra/tools/TCMDump.java:
##########
@@ -0,0 +1,539 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tools;
+
+import java.io.IOException;
+import java.nio.file.FileVisitResult;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.SimpleFileVisitor;
+import java.nio.file.attribute.BasicFileAttributes;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.io.util.FileOutputStreamPlus;
+import org.apache.cassandra.schema.DistributedMetadataLogKeyspace;
+import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.tcm.ClusterMetadata;
+import org.apache.cassandra.tcm.ClusterMetadataService;
+import org.apache.cassandra.tcm.Epoch;
+import org.apache.cassandra.tcm.MetadataSnapshots;
+import org.apache.cassandra.tcm.log.Entry;
+import org.apache.cassandra.tcm.log.LogReader;
+import org.apache.cassandra.tcm.log.LogState;
+import org.apache.cassandra.tcm.log.SystemKeyspaceStorage;
+import org.apache.cassandra.tcm.membership.NodeVersion;
+import org.apache.cassandra.tcm.serialization.VerboseMetadataSerializer;
+
+import picocli.CommandLine;
+import picocli.CommandLine.Command;
+import picocli.CommandLine.Option;
+
+import static com.google.common.base.Throwables.getStackTraceAsString;
+
+/**
+ * Standalone tool to dump Transactional Cluster Metadata (TCM) from local 
SSTables.
+ * <p>
+ * This is an emergency recovery tool for debugging when a Cassandra instance 
cannot
+ * start due to TCM issues. It reads the local_metadata_log and 
metadata_snapshots
+ * tables from the system keyspace to reconstruct and display the cluster 
metadata state.
+ * <p>
+ * Usage:
+ * <pre>
+ * # Binary dump (default)
+ * tcmdump dump --data-dir /path/to/data
+ *
+ * # toString output for debugging
+ * tcmdump dump --data-dir /path/to/data --to-string
+ *
+ * # Dump log entries
+ * tcmdump dump --data-dir /path/to/data --dump-log --from-epoch 1 --to-epoch 
50
+ *
+ * # Dump distributed log (CMS nodes)
+ * tcmdump dump --data-dir /path/to/data --dump-distributed-log
+ * </pre>
+ */
+@Command(name = "tcmdump",
+mixinStandardHelpOptions = true,
+description = "Dump Transactional Cluster Metadata from local SSTables",
+subcommands = { TCMDump.DumpMetadata.class })
+public class TCMDump implements Runnable
+{
+    private static final Output output = Output.CONSOLE;
+
+    public static void main(String... args)
+    {
+        Util.initDatabaseDescriptor();
+
+        CommandLine cli = new 
CommandLine(TCMDump.class).setExecutionExceptionHandler((ex, cmd, parseResult) 
-> {
+            err(ex);
+            return 2;
+        });
+        int status = cli.execute(args);
+        System.exit(status);
+    }
+
+    protected static void err(Throwable e)
+    {
+        output.err.println("error: " + e.getMessage());
+        output.err.println("-- StackTrace --");
+        output.err.println(getStackTraceAsString(e));
+    }
+
+    @Override
+    public void run()
+    {
+        CommandLine.usage(this, output.out);
+    }
+
+    @Command(name = "dump", description = "Dump cluster metadata from 
SSTables")
+    public static class DumpMetadata implements Runnable
+    {
+        @Option(names = { "-d", "--data-dir" }, description = "Data directory 
containing system keyspace")
+        public String dataDir;
+
+        @Option(names = { "-s", "--sstables" }, description = "Path to SSTable 
directory for metadata tables (can be specified multiple times for log and 
snapshot tables)", arity = "1..*")
+        public List<String> sstables;
+
+        @Option(names = { "-p", "--partitioner" }, description = "Partitioner 
class name",
+        defaultValue = "org.apache.cassandra.dht.Murmur3Partitioner")
+        public String partitioner;
+
+        @Option(names = { "-o", "--output" }, description = "Output file path 
for binary dump (default: temp file)")
+        public String outputFile;
+
+        // Output modes
+        @Option(names = { "--to-string" }, description = "Print 
ClusterMetadata.toString() to stdout")
+        public boolean toStringOutput;
+
+        @Option(names = { "--dump-log" }, description = "Dump log entries 
(toString each entry)")
+        public boolean dumpLog;
+
+        @Option(names = { "--dump-distributed-log" }, description = "Dump 
distributed_metadata_log (for CMS nodes)")
+        public boolean dumpDistributedLog;
+
+        // Filters
+        @Option(names = { "--epoch" }, description = "Show state at specific 
epoch")
+        public Long targetEpoch;
+
+        @Option(names = { "--from-epoch" }, description = "Filter log entries 
from this epoch")
+        public Long fromEpoch;
+
+        @Option(names = { "--to-epoch" }, description = "Filter log entries to 
this epoch")
+        public Long toEpoch;
+
+        // Debug
+        @Option(names = { "-v", "--verbose" }, description = "Verbose output")
+        public boolean verbose;
+
+        @Option(names = { "--debug" }, description = "Show stack traces on 
errors")
+        public boolean debug;
+
+        private Path tempDir;
+
+        /**
+         * Gets the log state from the given storage, detecting and logging 
gaps in epochs.
+         * <p>
+         * It detects gaps in the epoch sequence and logs warnings instead of 
throwing exceptions, allowing the tool to
+         * still output all available epochs.
+         *
+         * @param storage         the storage to read entries from
+         * @param snapshotManager the snapshot manager for base state
+         * @param targetEpoch     optional target epoch to filter to (null for 
all epochs)
+         * @param out             the output to write warnings to
+         * @return the LogState with all available entries
+         */
+        @VisibleForTesting
+        static LogState getLogState(SystemKeyspaceStorage storage,
+                                    MetadataSnapshots snapshotManager,
+                                    Long targetEpoch,
+                                    Output out)
+        {
+            ClusterMetadata base = snapshotManager.getLatestSnapshot();
+            Epoch baseEpoch = base == null ? Epoch.EMPTY : base.epoch;
+            Epoch endEpoch = targetEpoch != null ? Epoch.create(targetEpoch) : 
Epoch.create(Long.MAX_VALUE);
+
+            try
+            {
+                LogReader.EntryHolder entryHolder = 
storage.getEntries(baseEpoch, endEpoch);
+                ImmutableList.Builder<Entry> entries = ImmutableList.builder();
+                Epoch prevEpoch = baseEpoch;
+                List<String> gaps = new ArrayList<>();
+
+                for (Entry e : (Iterable<Entry>) entryHolder::iterator)
+                {
+                    if (!prevEpoch.nextEpoch().is(e.epoch))
+                    {
+                        gaps.add(String.format("Gap detected: expected epoch 
%d but found %d",
+                                               prevEpoch.getEpoch() + 1, 
e.epoch.getEpoch()));
+                    }
+                    prevEpoch = e.epoch;
+                    entries.add(e);
+                }
+
+                if (!gaps.isEmpty())
+                {
+                    out.err.println("WARNING: Found " + gaps.size() + " gap(s) 
in the epoch sequence:");
+                    for (String gap : gaps)
+                    {
+                        out.err.println("  " + gap);
+                    }
+                    out.err.println("Proceeding with available epochs...");
+                }
+
+                ImmutableList<Entry> entryList = entries.build();
+                // If there's a gap between the base state and the first 
entry, we need to pass null
+                // as the base state to avoid the LogState constructor 
invariant check failing
+                ClusterMetadata effectiveBase = base;
+                if (effectiveBase != null && !entryList.isEmpty() && 
!entryList.get(0).epoch.isDirectlyAfter(effectiveBase.epoch))
+                {
+                    out.err.println("WARNING: Gap between snapshot (epoch " + 
effectiveBase.epoch.getEpoch() +
+                                    ") and first log entry (epoch " + 
entryList.get(0).epoch.getEpoch() +
+                                    "). Proceeding without base snapshot.");
+                    effectiveBase = null;
+                }
+
+                return new LogState(effectiveBase, entryList);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException("Failed to read log entries", e);
+            }
+        }
+
+        @Override
+        public void run()
+        {
+            try
+            {
+                // Create temporary directory for SSTable import
+                setupTempDirectory();
+
+                DatabaseDescriptor.setPartitioner(partitioner);
+
+                if (dumpDistributedLog)
+                {
+                    // Set up schema for distributed metadata keyspace
+                    // Use a dummy datacenter name since we're just reading 
SSTables offline
+                    
ClusterMetadataService.empty(Keyspaces.of(SystemKeyspace.metadata(), 
DistributedMetadataLogKeyspace.initialMetadata("dc1")));
+                }
+                else
+                {
+                    // Set up minimal schema for system keyspace only
+                    
ClusterMetadataService.empty(Keyspaces.of(SystemKeyspace.metadata()));
+                }
+                Keyspace.setInitialized();
+
+                if (dumpLog)
+                {
+                    importSSTables();
+                    LogState logState = getLogState();
+                    dumpLogEntries(logState);
+                }
+                else if (dumpDistributedLog)
+                {
+                    importDistributedLogSSTables();
+                    dumpDistributedLogEntries();
+                }
+                else
+                {
+                    importSSTables();
+                    LogState logState = getLogState();
+
+                    if (logState.isEmpty())
+                    {
+                        output.out.println("No metadata available");
+                        return;
+                    }
+
+                    ClusterMetadata metadata = logState.flatten().baseState;
+
+                    if (toStringOutput)
+                    {
+                        output.out.println(metadata.toString());
+                    }
+                    else
+                    {
+                        dumpBinary(metadata);
+                    }
+                }
+            }
+            catch (Exception e)
+            {
+                if (debug)
+                {
+                    e.printStackTrace(output.err);
+                }
+                else
+                {
+                    output.err.println("Error: " + e.getMessage());
+                }
+                System.exit(1);
+            }
+            finally
+            {
+                cleanupTempDirectory();
+            }
+        }
+
+        /**
+         * Creates a temporary directory and configures DatabaseDescriptor to 
use it.
+         * This ensures we don't pollute any existing data directories.
+         */
+        private void setupTempDirectory() throws IOException
+        {
+            tempDir = Files.createTempDirectory("tcmdump");
+            DatabaseDescriptor.getRawConfig().data_file_directories = new 
String[]{ tempDir.resolve("data").toString() };
+            DatabaseDescriptor.getRawConfig().commitlog_directory = 
tempDir.resolve("commitlog").toString();
+            DatabaseDescriptor.getRawConfig().hints_directory = 
tempDir.resolve("hints").toString();
+            DatabaseDescriptor.getRawConfig().saved_caches_directory = 
tempDir.resolve("saved_caches").toString();
+
+            if (verbose)
+            {
+                output.out.println("Using temporary directory: " + tempDir);
+            }
+        }
+
+        /**
+         * Cleans up the temporary directory.
+         */
+        private void cleanupTempDirectory()
+        {
+            if (tempDir != null)
+            {
+                try
+                {
+                    Files.walkFileTree(tempDir, new SimpleFileVisitor<>()
+                    {
+                        @Override
+                        public FileVisitResult visitFile(Path file, 
BasicFileAttributes attrs)
+                        throws IOException
+                        {
+                            Files.deleteIfExists(file);
+                            return FileVisitResult.CONTINUE;
+                        }
+
+                        @Override
+                        public FileVisitResult postVisitDirectory(Path dir, 
IOException exc)
+                        throws IOException
+                        {
+                            Files.deleteIfExists(dir);
+                            return FileVisitResult.CONTINUE;
+                        }
+                    });
+                }
+                catch (IOException e)
+                {
+                    if (verbose)
+                    {
+                        output.err.println("Warning: Failed to fully cleanup 
temp directory: " + tempDir + " (" + e.getMessage() + ")");
+                    }
+                }
+                finally
+                {
+                    // Avoid accidental reuse
+                    tempDir = null;
+                }
+            }
+        }
+
+        /**
+         * Dumps ClusterMetadata to a binary file using 
VerboseMetadataSerializer.
+         * This is the same format used by 
ClusterMetadataService.dumpClusterMetadata().
+         */
+        private void dumpBinary(ClusterMetadata metadata) throws IOException
+        {
+            Path outputPath = outputFile != null ? Path.of(outputFile) : 
Files.createTempFile("clustermetadata", ".dump");
+            try (FileOutputStreamPlus out = new 
FileOutputStreamPlus(outputPath))
+            {
+                
VerboseMetadataSerializer.serialize(ClusterMetadata.serializer, metadata, out, 
NodeVersion.CURRENT.serializationVersion());

Review Comment:
   Done. Though I think there is one caveat.  Serializing to an older version 
may lose data if the metadata contains features added in newer versions. For 
example, if metadata has Accord data (V7+) and you serialize to V6, the Accord 
data would be lost or cause errors. 
   
   I've added a comment mentioning this explicitly.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] CASSANDRA-21129: Offline TCM dump tool [cassandra]

Reply via email to