This is an automated email from the ASF dual-hosted git repository. dcapwell pushed a commit to branch cassandra-3.0 in repository https://gitbox.apache.org/repos/asf/cassandra.git
commit 4b0c0817fa7b8e12a8e6cf96a9a2b67f36b449e8 Merge: dc725bc b905397 Author: David Capwell <[email protected]> AuthorDate: Tue Jul 28 17:56:49 2020 -0700 Merge branch 'cassandra-2.2' into cassandra-3.0 CHANGES.txt | 1 + src/java/org/apache/cassandra/db/LegacyLayout.java | 2 + .../org/apache/cassandra/utils/ByteBufferUtil.java | 2 + .../upgrade/MigrateDropColumnsTest.java | 106 +++++++++++++++++++++ 4 files changed, 111 insertions(+) diff --cc CHANGES.txt index a42de51,439ef5d..d22ba43 --- a/CHANGES.txt +++ b/CHANGES.txt @@@ -1,7 -1,4 +1,8 @@@ -2.2.18 +3.0.22: ++ * 3.x fails to start if commit log has range tombstones from a column which is also deleted (CASSANDRA-15970) + * Forbid altering UDTs used in partition keys (CASSANDRA-15933) + * Fix empty/null json string representation (CASSANDRA-15896) +Merged from 2.2: * Fix CQL parsing of collections when the column type is reversed (CASSANDRA-15814) diff --cc src/java/org/apache/cassandra/db/LegacyLayout.java index 8492de5,0000000..e58603a mode 100644,000000..100644 --- a/src/java/org/apache/cassandra/db/LegacyLayout.java +++ b/src/java/org/apache/cassandra/db/LegacyLayout.java @@@ -1,2793 -1,0 +1,2795 @@@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db; + +import java.io.DataInput; +import java.io.IOException; +import java.io.IOError; +import java.nio.ByteBuffer; +import java.security.MessageDigest; +import java.util.*; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +import org.apache.cassandra.cql3.ColumnIdentifier; +import org.apache.cassandra.cql3.SuperColumnCompatibility; +import org.apache.cassandra.utils.AbstractIterator; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; +import com.google.common.collect.PeekingIterator; + +import org.apache.cassandra.config.CFMetaData; +import org.apache.cassandra.config.ColumnDefinition; +import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.filter.DataLimits; +import org.apache.cassandra.db.rows.*; +import org.apache.cassandra.db.partitions.*; +import org.apache.cassandra.db.context.CounterContext; +import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.io.util.DataInputPlus; +import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.utils.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import static com.google.common.collect.Iterables.all; +import static org.apache.cassandra.utils.ByteBufferUtil.bytes; + +/** + * Functions to deal with the old format. + */ +public abstract class LegacyLayout +{ + private static final Logger logger = LoggerFactory.getLogger(LegacyLayout.class); + private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1L, TimeUnit.MINUTES); + + public final static int MAX_CELL_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT; + + public final static int STATIC_PREFIX = 0xFFFF; + + public final static int DELETION_MASK = 0x01; + public final static int EXPIRATION_MASK = 0x02; + public final static int COUNTER_MASK = 0x04; + public final static int COUNTER_UPDATE_MASK = 0x08; + private final static int RANGE_TOMBSTONE_MASK = 0x10; + + // Used in decodeBound if the number of components in the legacy bound is greater than the clustering size, + // indicating a complex column deletion (i.e. a collection tombstone), but the referenced column is either + // not present in the current table metadata, or is not currently a complex column. In that case, we'll + // check the dropped columns for the table which should contain the previous column definition. If that + // previous definition is also not complex (indicating that the column may have been dropped and re-added + // with different types multiple times), we use this fake definition to ensure that the complex deletion + // can be safely processed. This resulting deletion should be filtered out of any row created by a + // CellGrouper by the dropped column check, but this gives us an extra level of confidence as that check + // is timestamp based and so is fallible in the face of clock drift. + private static final ColumnDefinition INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN = + new ColumnDefinition("", + "", + ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance), + SetType.getInstance(UTF8Type.instance, true), + ColumnDefinition.NO_POSITION, + ColumnDefinition.Kind.REGULAR); + + private LegacyLayout() {} + + public static AbstractType<?> makeLegacyComparator(CFMetaData metadata) + { + ClusteringComparator comparator = metadata.comparator; + if (!metadata.isCompound()) + { + assert comparator.size() == 1; + return comparator.subtype(0); + } + + boolean hasCollections = metadata.hasCollectionColumns() || metadata.hasDroppedCollectionColumns(); + List<AbstractType<?>> types = new ArrayList<>(comparator.size() + (metadata.isDense() ? 0 : 1) + (hasCollections ? 1 : 0)); + + types.addAll(comparator.subtypes()); + + if (!metadata.isDense()) + { + types.add(UTF8Type.instance); + + if (hasCollections) + { + Map<ByteBuffer, CollectionType> defined = new HashMap<>(); + + for (CFMetaData.DroppedColumn def : metadata.getDroppedColumns().values()) + if (def.type instanceof CollectionType && def.type.isMultiCell()) + defined.put(bytes(def.name), (CollectionType) def.type); + + for (ColumnDefinition def : metadata.partitionColumns()) + if (def.type instanceof CollectionType && def.type.isMultiCell()) + defined.put(def.name.bytes, (CollectionType) def.type); + + types.add(ColumnToCollectionType.getInstance(defined)); + } + } + return CompositeType.getInstance(types); + } + + public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer cellname) + throws UnknownColumnException + { + assert cellname != null; + if (metadata.isSuper()) + { + assert superColumnName != null; + return decodeForSuperColumn(metadata, new Clustering(superColumnName), cellname); + } + + assert superColumnName == null; + return decodeCellName(metadata, cellname); + } + + private static LegacyCellName decodeForSuperColumn(CFMetaData metadata, Clustering clustering, ByteBuffer subcol) + { + ColumnDefinition def = metadata.getColumnDefinition(subcol); + if (def != null) + { + // it's a statically defined subcolumn + return new LegacyCellName(clustering, def, null); + } + + def = metadata.compactValueColumn(); + assert def != null && def.type instanceof MapType; + return new LegacyCellName(clustering, def, subcol); + } + + public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname) throws UnknownColumnException + { + return decodeCellName(metadata, cellname, false); + } + + public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname, boolean readAllAsDynamic) throws UnknownColumnException + { + Clustering clustering = decodeClustering(metadata, cellname); + + if (metadata.isSuper()) + return decodeForSuperColumn(metadata, clustering, CompositeType.extractComponent(cellname, 1)); + + if (metadata.isDense() || (metadata.isCompactTable() && readAllAsDynamic)) + return new LegacyCellName(clustering, metadata.compactValueColumn(), null); + + ByteBuffer column = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size()) : cellname; + if (column == null) + { + // Tables for composite 2ndary indexes used to be compound but dense, but we've transformed them into regular tables + // (non compact ones) but with no regular column (i.e. we only care about the clustering). So we'll get here + // in that case, and what we want to return is basically a row marker. + if (metadata.partitionColumns().isEmpty()) + return new LegacyCellName(clustering, null, null); + + // Otherwise, we shouldn't get there + throw new IllegalArgumentException("No column name component found in cell name"); + } + + // Row marker, this is ok + if (!column.hasRemaining()) + return new LegacyCellName(clustering, null, null); + + ColumnDefinition def = metadata.getColumnDefinition(column); + + if (metadata.isCompactTable()) + { + if (def == null || def.isPrimaryKeyColumn()) + // If it's a compact table, it means the column is in fact a "dynamic" one + return new LegacyCellName(new Clustering(column), metadata.compactValueColumn(), null); + } + else if (def == null) + { + throw new UnknownColumnException(metadata, column); + } + + ByteBuffer collectionElement = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size() + 1) : null; + if (collectionElement != null && def.type instanceof CollectionType) + { + ((CollectionType)def.type).nameComparator().validateIfFixedSize(collectionElement); + } + + // Note that because static compact columns are translated to static defs in the new world order, we need to force a static + // clustering if the definition is static (as it might not be in this case). + return new LegacyCellName(def.isStatic() ? Clustering.STATIC_CLUSTERING : clustering, def, collectionElement); + } + + public static LegacyBound decodeSliceBound(CFMetaData metadata, ByteBuffer bound, boolean isStart) + { + return decodeBound(metadata, bound, isStart, false); + } + + public static LegacyBound decodeTombstoneBound(CFMetaData metadata, ByteBuffer bound, boolean isStart) + { + return decodeBound(metadata, bound, isStart, true); + } + + private static LegacyBound decodeBound(CFMetaData metadata, ByteBuffer bound, boolean isStart, boolean isDeletion) + { + if (!bound.hasRemaining()) + return isStart ? LegacyBound.BOTTOM : LegacyBound.TOP; + + if (!metadata.isCompound()) + { + // The non compound case is a lot easier, in that there is no EOC nor collection to worry about, so dealing + // with that first. + metadata.comparator.subtype(0).validateIfFixedSize(bound); + return new LegacyBound(isStart ? Slice.Bound.inclusiveStartOf(bound) : Slice.Bound.inclusiveEndOf(bound), false, null); + } + + int clusteringSize = metadata.comparator.size(); + + boolean isStatic = metadata.isCompound() && CompositeType.isStaticName(bound); + List<ByteBuffer> components = CompositeType.splitName(bound); + byte eoc = CompositeType.lastEOC(bound); + for (int i=0; i<Math.min(clusteringSize, components.size()); i++) + { + metadata.comparator.subtype(i).validateIfFixedSize(components.get(i)); + } + + // if the bound we have decoded is static, 2.2 format requires there to be N empty clusterings + assert !isStatic || + (components.size() >= clusteringSize + && all(components.subList(0, clusteringSize), ByteBufferUtil.EMPTY_BYTE_BUFFER::equals)); + + ColumnDefinition collectionName = null; + if (components.size() > clusteringSize) + { + // For a deletion, there can be more components than the clustering size only in the case this is the + // bound of a collection range tombstone. In such a case, there is exactly one more component, and that + // component is the name of the collection being deleted, since we do not support collection range deletions. + // If the bound is not part of a deletion, it is from slice query filter. The column name may be: + // - a valid, non-collection column; in this case we expect a single extra component + // - an empty buffer, representing a row marker; in this case we also expect a single extra empty component + // - a valid collection column and the first part of a cell path; in this case we expect exactly two extra components + // In any of these slice cases, these items are unnecessary for the bound we construct, + // so we can simply remove them, after corroborating we have encountered one of these scenario. + assert !metadata.isCompactTable() : toDebugHex(components); + + // In all cases, the element straight after the clusterings should contain the name of a column. + if (components.size() > clusteringSize + 1) + { + // we accept bounds from paging state that occur inside a complex column - in this case, we expect + // two excess components, the first of which is a column name, the second a key into the collection + if (isDeletion) + throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": deletion can have at most one extra component"); + + if (clusteringSize + 2 != components.size()) + throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": complex slices require exactly two extra components"); + + // decode simply to verify that we have (or may have had) a complex column; we assume the collection key is valid + decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic); + components.remove(clusteringSize + 1); + } + else if (isDeletion) + { + collectionName = decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic); + } + else if (components.get(clusteringSize).hasRemaining()) + { + decodeBoundVerifySimpleColumn(metadata, components, clusteringSize, isStatic); + } + components.remove(clusteringSize); + } + + boolean isInclusive; + if (isStart) + { + isInclusive = eoc <= 0; + } + else + { + isInclusive = eoc >= 0; + + // for an end bound, if we only have a prefix of all the components and the final EOC is zero, + // then it should only match up to the prefix but no further, that is, it is an inclusive bound + // of the exact prefix but an exclusive bound of anything beyond it, so adding an empty + // composite value ensures this behavior, see CASSANDRA-12423 for more details + if (eoc == 0 && components.size() < clusteringSize) + { + components.add(ByteBufferUtil.EMPTY_BYTE_BUFFER); + isInclusive = false; + } + } + + Slice.Bound.Kind boundKind = Slice.Bound.boundKind(isStart, isInclusive); + Slice.Bound sb = Slice.Bound.create(boundKind, components.toArray(new ByteBuffer[components.size()])); + return new LegacyBound(sb, isStatic, collectionName); + } + + // finds the simple column definition associated with components.get(clusteringSize) + // if no such columns exists, or ever existed, we throw an exception; if we do not know, we return a dummy column definition + private static ColumnDefinition decodeBoundLookupComplexColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic) + { + ByteBuffer columnNameBytes = components.get(clusteringSize); + ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes); + if (columnName == null || !columnName.isComplex()) + { + columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic); + // if no record of the column having ever existed is found, something is badly wrong + if (columnName == null) + throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected complex column at position " + clusteringSize); + + // if we do have a record of dropping this column but it wasn't previously complex, use a fake + // column definition for safety (see the comment on the constant declaration for details) + if (!columnName.isComplex()) + columnName = INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN; + } + + return columnName; + } + + // finds the simple column definition associated with components.get(clusteringSize) + // if no such columns exists, and definitely never existed, we throw an exception + private static void decodeBoundVerifySimpleColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic) + { + ByteBuffer columnNameBytes = components.get(clusteringSize); + ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes); + if (columnName == null || !columnName.isSimple()) + { + columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic); + // if no record of the column having ever existed is found, something is badly wrong + if (columnName == null) + throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected simple column at position " + clusteringSize); + } + } + + private static String toDebugHex(Collection<ByteBuffer> buffers) + { + return buffers.stream().map(ByteBufferUtil::bytesToHex).collect(Collectors.joining()); + } + + public static ByteBuffer encodeBound(CFMetaData metadata, Slice.Bound bound, boolean isStart) + { + if (bound == Slice.Bound.BOTTOM || bound == Slice.Bound.TOP || metadata.comparator.size() == 0) + return ByteBufferUtil.EMPTY_BYTE_BUFFER; + + ClusteringPrefix clustering = bound.clustering(); + + if (!metadata.isCompound()) + { + assert clustering.size() == 1; + return clustering.get(0); + } + + CompositeType ctype = CompositeType.getInstance(metadata.comparator.subtypes()); + CompositeType.Builder builder = ctype.builder(); + for (int i = 0; i < clustering.size(); i++) + builder.add(clustering.get(i)); + + if (isStart) + return bound.isInclusive() ? builder.build() : builder.buildAsEndOfRange(); + else + return bound.isInclusive() ? builder.buildAsEndOfRange() : builder.build(); + } + + public static ByteBuffer encodeCellName(CFMetaData metadata, ClusteringPrefix clustering, ByteBuffer columnName, ByteBuffer collectionElement) + { + boolean isStatic = clustering == Clustering.STATIC_CLUSTERING; + + if (!metadata.isCompound()) + { + if (isStatic) + return columnName; + + assert clustering.size() == 1 : "Expected clustering size to be 1, but was " + clustering.size(); + return clustering.get(0); + } + + // We use comparator.size() rather than clustering.size() because of static clusterings + int clusteringSize = metadata.comparator.size(); + int size = clusteringSize + (metadata.isDense() ? 0 : 1) + (collectionElement == null ? 0 : 1); + if (metadata.isSuper()) + size = clusteringSize + 1; + ByteBuffer[] values = new ByteBuffer[size]; + for (int i = 0; i < clusteringSize; i++) + { + if (isStatic) + { + values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER; + continue; + } + + ByteBuffer v = clustering.get(i); + // we can have null (only for dense compound tables for backward compatibility reasons) but that + // means we're done and should stop there as far as building the composite is concerned. + if (v == null) + return CompositeType.build(Arrays.copyOfRange(values, 0, i)); + + values[i] = v; + } + + if (metadata.isSuper()) + { + // We need to set the "column" (in thrift terms) name, i.e. the value corresponding to the subcomparator. + // What it is depends if this a cell for a declared "static" column or a "dynamic" column part of the + // super-column internal map. + assert columnName != null; // This should never be null for supercolumns, see decodeForSuperColumn() above + values[clusteringSize] = columnName.equals(SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN) + ? collectionElement + : columnName; + } + else + { + if (!metadata.isDense()) + values[clusteringSize] = columnName; + if (collectionElement != null) + values[clusteringSize + 1] = collectionElement; + } + + return CompositeType.build(isStatic, values); + } + + public static Clustering decodeClustering(CFMetaData metadata, ByteBuffer value) + { + int csize = metadata.comparator.size(); + if (csize == 0) + return Clustering.EMPTY; + + if (metadata.isCompound() && CompositeType.isStaticName(value)) + return Clustering.STATIC_CLUSTERING; + + List<ByteBuffer> components = metadata.isCompound() + ? CompositeType.splitName(value) + : Collections.singletonList(value); + + for (int i=0; i<Math.min(csize, components.size()); i++) + { + AbstractType<?> type = metadata.comparator.subtype(i); + type.validateIfFixedSize(components.get(i)); + } + return new Clustering(components.subList(0, Math.min(csize, components.size())).toArray(new ByteBuffer[csize])); + } + + public static ByteBuffer encodeClustering(CFMetaData metadata, ClusteringPrefix clustering) + { + if (clustering.size() == 0) + return ByteBufferUtil.EMPTY_BYTE_BUFFER; + + if (!metadata.isCompound()) + { + assert clustering.size() == 1; + return clustering.get(0); + } + + ByteBuffer[] values = new ByteBuffer[clustering.size()]; + for (int i = 0; i < clustering.size(); i++) + values[i] = clustering.get(i); + return CompositeType.build(values); + } + + /** + * The maximum number of cells to include per partition when converting to the old format. + * <p> + * We already apply the limit during the actual query, but for queries that counts cells and not rows (thrift queries + * and distinct queries as far as old nodes are concerned), we may still include a little bit more than requested + * because {@link DataLimits} always include full rows. So if the limit ends in the middle of a queried row, the + * full row will be part of our result. This would confuse old nodes however so we make sure to truncate it to + * what's expected before writting it on the wire. + * + * @param command the read commmand for which to determine the maximum cells per partition. This can be {@code null} + * in which case {@code Integer.MAX_VALUE} is returned. + * @return the maximum number of cells per partition that should be enforced according to the read command if + * post-query limitation are in order (see above). This will be {@code Integer.MAX_VALUE} if no such limits are + * necessary. + */ + private static int maxLiveCellsPerPartition(ReadCommand command) + { + if (command == null) + return Integer.MAX_VALUE; + + DataLimits limits = command.limits(); + + // There is 2 types of DISTINCT queries: those that includes only the partition key, and those that include static columns. + // On old nodes, the latter expects the first row in term of CQL count, which is what we already have and there is no additional + // limit to apply. The former however expect only one cell per partition and rely on it (See CASSANDRA-10762). + if (limits.isDistinct()) + return command.columnFilter().fetchedColumns().statics.isEmpty() ? 1 : Integer.MAX_VALUE; + + switch (limits.kind()) + { + case THRIFT_LIMIT: + case SUPER_COLUMN_COUNTING_LIMIT: + return limits.perPartitionCount(); + default: + return Integer.MAX_VALUE; + } + } + + // For serializing to old wire format + public static LegacyUnfilteredPartition fromUnfilteredRowIterator(ReadCommand command, UnfilteredRowIterator iterator) + { + // we need to extract the range tombstone so materialize the partition. Since this is + // used for the on-wire format, this is not worst than it used to be. + final ImmutableBTreePartition partition = ImmutableBTreePartition.create(iterator); + DeletionInfo info = partition.deletionInfo(); + Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> pair = fromRowIterator(partition.metadata(), partition.iterator(), partition.staticRow()); + + LegacyLayout.LegacyRangeTombstoneList rtl = pair.left; + + // Processing the cell iterator results in the LegacyRangeTombstoneList being populated, so we do this + // before we use the LegacyRangeTombstoneList at all + List<LegacyLayout.LegacyCell> cells = Lists.newArrayList(pair.right); + + int maxCellsPerPartition = maxLiveCellsPerPartition(command); + cells = maybeTrimLiveCells(cells, maxCellsPerPartition, command); + + // The LegacyRangeTombstoneList already has range tombstones for the single-row deletions and complex + // deletions. Go through our normal range tombstones and add then to the LegacyRTL so that the range + // tombstones all get merged and sorted properly. + if (info.hasRanges()) + { + Iterator<RangeTombstone> rangeTombstoneIterator = info.rangeIterator(false); + while (rangeTombstoneIterator.hasNext()) + { + RangeTombstone rt = rangeTombstoneIterator.next(); + Slice slice = rt.deletedSlice(); + LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(slice.start(), false, null); + LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(slice.end(), false, null); + rtl.add(start, end, rt.deletionTime().markedForDeleteAt(), rt.deletionTime().localDeletionTime()); + } + } + + return new LegacyUnfilteredPartition(info.getPartitionDeletion(), rtl, cells); + } + + private static List<LegacyCell> maybeTrimLiveCells(List<LegacyCell> cells, int maxLiveCells, ReadCommand command) + { + if (null == command || maxLiveCells >= cells.size()) + return cells; + + int nowInSec = command.nowInSec(); + int live = 0; + int dead = 0; + + for (int i = 0; i < cells.size() && live < maxLiveCells; i++) + { + if (cells.get(i).isLive(nowInSec)) + live++; + else + dead++; + } + + return cells.subList(0, live + dead); + } + + public static void serializeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, DataOutputPlus out, int version) throws IOException + { + assert version < MessagingService.VERSION_30; + + out.writeBoolean(true); + + LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition); + + UUIDSerializer.serializer.serialize(partition.metadata().cfId, out, version); + DeletionTime.serializer.serialize(legacyPartition.partitionDeletion, out); + + legacyPartition.rangeTombstones.serialize(out, partition.metadata()); + + // begin cell serialization + out.writeInt(legacyPartition.cells.size()); + for (LegacyLayout.LegacyCell cell : legacyPartition.cells) + { + ByteBufferUtil.writeWithShortLength(cell.name.encode(partition.metadata()), out); + out.writeByte(cell.serializationFlags()); + if (cell.isExpiring()) + { + out.writeInt(cell.ttl); + out.writeInt(cell.localDeletionTime); + } + else if (cell.isTombstone()) + { + out.writeLong(cell.timestamp); + out.writeInt(TypeSizes.sizeof(cell.localDeletionTime)); + out.writeInt(cell.localDeletionTime); + continue; + } + else if (cell.isCounterUpdate()) + { + out.writeLong(cell.timestamp); + long count = CounterContext.instance().getUpdateCount(cell.value); + ByteBufferUtil.writeWithLength(ByteBufferUtil.bytes(count), out); + continue; + } + else if (cell.isCounter()) + { + out.writeLong(Long.MIN_VALUE); // timestampOfLastDelete (not used, and MIN_VALUE is the default) + } + + out.writeLong(cell.timestamp); + ByteBufferUtil.writeWithLength(cell.value, out); + } + } + + // For the old wire format + // Note: this can return null if an empty partition is serialized! + public static UnfilteredRowIterator deserializeLegacyPartition(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException + { + assert version < MessagingService.VERSION_30; + + // This is only used in mutation, and mutation have never allowed "null" column families + boolean present = in.readBoolean(); + if (!present) + return null; + + CFMetaData metadata = CFMetaData.serializer.deserialize(in, version); + LegacyDeletionInfo info = LegacyDeletionInfo.deserialize(metadata, in); + int size = in.readInt(); + Iterator<LegacyCell> cells = deserializeCells(metadata, in, flag, size); + SerializationHelper helper = new SerializationHelper(metadata, version, flag); + return onWireCellstoUnfilteredRowIterator(metadata, metadata.partitioner.decorateKey(key), info, cells, false, helper); + } + + // For the old wire format + public static long serializedSizeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, int version) + { + assert version < MessagingService.VERSION_30; + + if (partition.isEmpty()) + return TypeSizes.sizeof(false); + + long size = TypeSizes.sizeof(true); + + LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition); + + size += UUIDSerializer.serializer.serializedSize(partition.metadata().cfId, version); + size += DeletionTime.serializer.serializedSize(legacyPartition.partitionDeletion); + size += legacyPartition.rangeTombstones.serializedSize(partition.metadata()); + + // begin cell serialization + size += TypeSizes.sizeof(legacyPartition.cells.size()); + for (LegacyLayout.LegacyCell cell : legacyPartition.cells) + { + size += ByteBufferUtil.serializedSizeWithShortLength(cell.name.encode(partition.metadata())); + size += 1; // serialization flags + if (cell.isExpiring()) + { + size += TypeSizes.sizeof(cell.ttl); + size += TypeSizes.sizeof(cell.localDeletionTime); + } + else if (cell.isTombstone()) + { + size += TypeSizes.sizeof(cell.timestamp); + // localDeletionTime replaces cell.value as the body + size += TypeSizes.sizeof(TypeSizes.sizeof(cell.localDeletionTime)); + size += TypeSizes.sizeof(cell.localDeletionTime); + continue; + } + else if (cell.isCounterUpdate()) + { + size += TypeSizes.sizeof(cell.timestamp); + long count = CounterContext.instance().getUpdateCount(cell.value); + size += ByteBufferUtil.serializedSizeWithLength(ByteBufferUtil.bytes(count)); + continue; + } + else if (cell.isCounter()) + { + size += TypeSizes.sizeof(Long.MIN_VALUE); // timestampOfLastDelete + } + + size += TypeSizes.sizeof(cell.timestamp); + size += ByteBufferUtil.serializedSizeWithLength(cell.value); + } + + return size; + } + + // For thrift sake + public static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata, + DecoratedKey key, + LegacyDeletionInfo delInfo, + Iterator<LegacyCell> cells) + { + SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL); + return toUnfilteredRowIterator(metadata, key, delInfo, cells, false, helper); + } + + // For deserializing old wire format + public static UnfilteredRowIterator onWireCellstoUnfilteredRowIterator(CFMetaData metadata, + DecoratedKey key, + LegacyDeletionInfo delInfo, + Iterator<LegacyCell> cells, + boolean reversed, + SerializationHelper helper) + { + + // If the table is a static compact, the "column_metadata" are now internally encoded as + // static. This has already been recognized by decodeCellName, but it means the cells + // provided are not in the expected order (the "static" cells are not necessarily at the front). + // So sort them to make sure toUnfilteredRowIterator works as expected. + // Further, if the query is reversed, then the on-wire format still has cells in non-reversed + // order, but we need to have them reverse in the final UnfilteredRowIterator. So reverse them. + if (metadata.isStaticCompactTable() || reversed) + { + List<LegacyCell> l = new ArrayList<>(); + Iterators.addAll(l, cells); + Collections.sort(l, legacyCellComparator(metadata, reversed)); + cells = l.iterator(); + } + + return toUnfilteredRowIterator(metadata, key, delInfo, cells, reversed, helper); + } + + private static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata, + DecoratedKey key, + LegacyDeletionInfo delInfo, + Iterator<LegacyCell> cells, + boolean reversed, + SerializationHelper helper) + { + // A reducer that basically does nothing, we know the 2 merged iterators can't have conflicting atoms (since we merge cells with range tombstones). + MergeIterator.Reducer<LegacyAtom, LegacyAtom> reducer = new MergeIterator.Reducer<LegacyAtom, LegacyAtom>() + { + private LegacyAtom atom; + + public void reduce(int idx, LegacyAtom current) + { + // We're merging cell with range tombstones, so we should always only have a single atom to reduce. + assert atom == null; + atom = current; + } + + protected LegacyAtom getReduced() + { + return atom; + } + + protected void onKeyChange() + { + atom = null; + } + }; + List<Iterator<LegacyAtom>> iterators = Arrays.asList(asLegacyAtomIterator(cells), asLegacyAtomIterator(delInfo.inRowRangeTombstones())); + PeekingIterator<LegacyAtom> atoms = Iterators.peekingIterator(MergeIterator.get(iterators, legacyAtomComparator(metadata), reducer)); + + // Check if we have some static + Row staticRow = atoms.hasNext() && atoms.peek().isStatic() + ? getNextRow(CellGrouper.staticGrouper(metadata, helper), atoms) + : Rows.EMPTY_STATIC_ROW; + + Iterator<Row> rows = convertToRows(new CellGrouper(metadata, helper), atoms); + Iterator<RangeTombstone> ranges = delInfo.deletionInfo.rangeIterator(reversed); + return new RowAndDeletionMergeIterator(metadata, + key, + delInfo.deletionInfo.getPartitionDeletion(), + ColumnFilter.all(metadata), + staticRow, + reversed, + EncodingStats.NO_STATS, + rows, + ranges, + true); + } + + public static Row extractStaticColumns(CFMetaData metadata, DataInputPlus in, Columns statics) throws IOException + { + assert !statics.isEmpty(); + assert metadata.isCompactTable(); + + if (metadata.isSuper()) + // TODO: there is in practice nothing to do here, but we need to handle the column_metadata for super columns somewhere else + throw new UnsupportedOperationException(); + + Set<ByteBuffer> columnsToFetch = new HashSet<>(statics.size()); + for (ColumnDefinition column : statics) + columnsToFetch.add(column.name.bytes); + + Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds()); + builder.newRow(Clustering.STATIC_CLUSTERING); + + boolean foundOne = false; + LegacyAtom atom; + while ((atom = readLegacyAtomSkippingUnknownColumn(metadata,in)) != null) + { + if (atom.isCell()) + { + LegacyCell cell = atom.asCell(); + if (!columnsToFetch.contains(cell.name.encode(metadata))) + continue; + + foundOne = true; + cell.name.column.type.validateIfFixedSize(cell.value); + builder.addCell(new BufferCell(cell.name.column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, null)); + } + else + { + LegacyRangeTombstone tombstone = atom.asRangeTombstone(); + // TODO: we need to track tombstones and potentially ignore cells that are + // shadowed (or even better, replace them by tombstones). + throw new UnsupportedOperationException(); + } + } + + return foundOne ? builder.build() : Rows.EMPTY_STATIC_ROW; + } + + private static LegacyAtom readLegacyAtomSkippingUnknownColumn(CFMetaData metadata, DataInputPlus in) + throws IOException + { + while (true) + { + try + { + return readLegacyAtom(metadata, in, false); + } + catch (UnknownColumnException e) + { + // Simply skip, as the method name implies. + } + } + + } + + private static Row getNextRow(CellGrouper grouper, PeekingIterator<? extends LegacyAtom> cells) + { + if (!cells.hasNext()) + return null; + + grouper.reset(); + while (cells.hasNext() && grouper.addAtom(cells.peek())) + { + // We've added the cell already in the grouper, so just skip it + cells.next(); + } + return grouper.getRow(); + } + + @SuppressWarnings("unchecked") + private static Iterator<LegacyAtom> asLegacyAtomIterator(Iterator<? extends LegacyAtom> iter) + { + return (Iterator<LegacyAtom>)iter; + } + + private static Iterator<Row> convertToRows(final CellGrouper grouper, final PeekingIterator<LegacyAtom> atoms) + { + return new AbstractIterator<Row>() + { + protected Row computeNext() + { + if (!atoms.hasNext()) + return endOfData(); + + return getNextRow(grouper, atoms); + } + }; + } + + public static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final RowIterator iterator) + { + return fromRowIterator(iterator.metadata(), iterator, iterator.staticRow()); + } + + private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final CFMetaData metadata, final Iterator<Row> iterator, final Row staticRow) + { + LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10); + Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>() + { + private Iterator<LegacyCell> currentRow = initializeRow(); + + private Iterator<LegacyCell> initializeRow() + { + if (staticRow == null || staticRow.isEmpty()) + return Collections.<LegacyLayout.LegacyCell>emptyIterator(); + + Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, staticRow); + deletions.addAll(row.left); + return row.right; + } + + protected LegacyCell computeNext() + { + while (true) + { + if (currentRow.hasNext()) + return currentRow.next(); + + if (!iterator.hasNext()) + return endOfData(); + + Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, iterator.next()); + deletions.addAll(row.left); + currentRow = row.right; + } + } + }; + + return Pair.create(deletions, cells); + } + + private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRow(final CFMetaData metadata, final Row row) + { + // convert any complex deletions or row deletion into normal range tombstones so that we can build and send a proper RangeTombstoneList + // to legacy nodes + LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10); + + if (!row.deletion().isLive()) + { + Clustering clustering = row.clustering(); + Slice.Bound startBound = Slice.Bound.inclusiveStartOf(clustering); + Slice.Bound endBound = Slice.Bound.inclusiveEndOf(clustering); + + LegacyBound start = new LegacyLayout.LegacyBound(startBound, false, null); + LegacyBound end = new LegacyLayout.LegacyBound(endBound, false, null); + + deletions.add(start, end, row.deletion().time().markedForDeleteAt(), row.deletion().time().localDeletionTime()); + } + + for (ColumnData cd : row) + { + ColumnDefinition col = cd.column(); + if (col.isSimple()) + continue; + + DeletionTime delTime = ((ComplexColumnData)cd).complexDeletion(); + if (!delTime.isLive()) + { + Clustering clustering = row.clustering(); + boolean isStatic = clustering == Clustering.STATIC_CLUSTERING; + assert isStatic == col.isStatic(); + + Slice.Bound startBound = isStatic + ? LegacyDeletionInfo.staticBound(metadata, true) + : Slice.Bound.inclusiveStartOf(clustering); + Slice.Bound endBound = isStatic + ? LegacyDeletionInfo.staticBound(metadata, false) + : Slice.Bound.inclusiveEndOf(clustering); + + LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(startBound, isStatic, col); + LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(endBound, isStatic, col); + + deletions.add(start, end, delTime.markedForDeleteAt(), delTime.localDeletionTime()); + } + } + + Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>() + { + private final Iterator<Cell> cells = row.cellsInLegacyOrder(metadata, false).iterator(); + // we don't have (and shouldn't have) row markers for compact tables. + private boolean hasReturnedRowMarker = metadata.isCompactTable(); + + protected LegacyCell computeNext() + { + if (!hasReturnedRowMarker) + { + hasReturnedRowMarker = true; + + // don't include a row marker if there's no timestamp on the primary key; this is the 3.0+ equivalent + // of a row marker + if (!row.primaryKeyLivenessInfo().isEmpty()) + { + LegacyCellName cellName = new LegacyCellName(row.clustering(), null, null); + LivenessInfo info = row.primaryKeyLivenessInfo(); + return new LegacyCell(info.isExpiring() ? LegacyCell.Kind.EXPIRING : LegacyCell.Kind.REGULAR, cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER, info.timestamp(), info.localExpirationTime(), info.ttl()); + } + } + + if (!cells.hasNext()) + return endOfData(); + + return makeLegacyCell(row.clustering(), cells.next()); + } + }; + return Pair.create(deletions, cells); + } + + private static LegacyCell makeLegacyCell(Clustering clustering, Cell cell) + { + LegacyCell.Kind kind; + if (cell.isCounterCell()) + kind = LegacyCell.Kind.COUNTER; + else if (cell.isTombstone()) + kind = LegacyCell.Kind.DELETED; + else if (cell.isExpiring()) + kind = LegacyCell.Kind.EXPIRING; + else + kind = LegacyCell.Kind.REGULAR; + + CellPath path = cell.path(); + assert path == null || path.size() == 1; + LegacyCellName name = new LegacyCellName(clustering, cell.column(), path == null ? null : path.get(0)); + return new LegacyCell(kind, name, cell.value(), cell.timestamp(), cell.localDeletionTime(), cell.ttl()); + } + + public static RowIterator toRowIterator(final CFMetaData metadata, + final DecoratedKey key, + final Iterator<LegacyCell> cells, + final int nowInSec) + { + SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL); + return UnfilteredRowIterators.filter(toUnfilteredRowIterator(metadata, key, LegacyDeletionInfo.live(), cells, false, helper), nowInSec); + } + + public static Comparator<LegacyCell> legacyCellComparator(CFMetaData metadata) + { + return legacyCellComparator(metadata, false); + } + + public static Comparator<LegacyCell> legacyCellComparator(final CFMetaData metadata, final boolean reversed) + { + final Comparator<LegacyCellName> cellNameComparator = legacyCellNameComparator(metadata, reversed); + return new Comparator<LegacyCell>() + { + public int compare(LegacyCell cell1, LegacyCell cell2) + { + LegacyCellName c1 = cell1.name; + LegacyCellName c2 = cell2.name; + + int c = cellNameComparator.compare(c1, c2); + if (c != 0) + return c; + + // The actual sorting when the cellname is equal doesn't matter, we just want to make + // sure the cells are not considered equal. + if (cell1.timestamp != cell2.timestamp) + return cell1.timestamp < cell2.timestamp ? -1 : 1; + + if (cell1.localDeletionTime != cell2.localDeletionTime) + return cell1.localDeletionTime < cell2.localDeletionTime ? -1 : 1; + + return cell1.value.compareTo(cell2.value); + } + }; + } + + // Note that this doesn't exactly compare cells as they were pre-3.0 because within a row they sort columns like + // in 3.0, that is, with simple columns before complex columns. In other words, this comparator makes sure cells + // are in the proper order to convert them to actual 3.0 rows. + public static Comparator<LegacyCellName> legacyCellNameComparator(final CFMetaData metadata, final boolean reversed) + { + return new Comparator<LegacyCellName>() + { + public int compare(LegacyCellName c1, LegacyCellName c2) + { + // Compare clustering first + if (c1.clustering == Clustering.STATIC_CLUSTERING) + { + if (c2.clustering != Clustering.STATIC_CLUSTERING) + return -1; + } + else if (c2.clustering == Clustering.STATIC_CLUSTERING) + { + return 1; + } + else + { + int c = metadata.comparator.compare(c1.clustering, c2.clustering); + if (c != 0) + return reversed ? -c : c; + } + + // Note that when reversed, we only care about the clustering being reversed, so it's ok + // not to take reversed into account below. + + // Then check the column name + if (c1.column != c2.column) + { + // A null for the column means it's a row marker + if (c1.column == null) + return -1; + if (c2.column == null) + return 1; + + assert c1.column.isRegular() || c1.column.isStatic(); + assert c2.column.isRegular() || c2.column.isStatic(); + int cmp = c1.column.compareTo(c2.column); + if (cmp != 0) + return cmp; + } + + assert (c1.collectionElement == null) == (c2.collectionElement == null); + + if (c1.collectionElement != null) + { + AbstractType<?> colCmp = ((CollectionType)c1.column.type).nameComparator(); + return colCmp.compare(c1.collectionElement, c2.collectionElement); + } + return 0; + } + }; + } + + private static boolean equalValues(ClusteringPrefix c1, ClusteringPrefix c2, ClusteringComparator comparator) + { + assert c1.size() == c2.size(); + for (int i = 0; i < c1.size(); i++) + { + if (comparator.compareComponent(i, c1.get(i), c2.get(i)) != 0) + return false; + } + return true; + } + + static Comparator<LegacyAtom> legacyAtomComparator(CFMetaData metadata) + { + return (o1, o2) -> + { + // First we want to compare by clustering, but we have to be careful with range tombstone, because + // we can have collection deletion and we want those to sort properly just before the column they + // delete, not before the whole row. + // We also want to special case static so they sort before any non-static. Note in particular that + // this special casing is important in the case of one of the Atom being Slice.Bound.BOTTOM: we want + // it to sort after the static as we deal with static first in toUnfilteredAtomIterator and having + // Slice.Bound.BOTTOM first would mess that up (note that static deletion is handled through a specific + // static tombstone, see LegacyDeletionInfo.add()). + if (o1.isStatic() != o2.isStatic()) + return o1.isStatic() ? -1 : 1; + + ClusteringPrefix c1 = o1.clustering(); + ClusteringPrefix c2 = o2.clustering(); + + int clusteringComparison; + if (c1.size() != c2.size() || (o1.isCell() == o2.isCell()) || !equalValues(c1, c2, metadata.comparator)) + { + clusteringComparison = metadata.comparator.compare(c1, c2); + } + else + { + // one is a cell and one is a range tombstone, and both have the same prefix size (that is, the + // range tombstone is either a row deletion or a collection deletion). + LegacyRangeTombstone rt = o1.isCell() ? o2.asRangeTombstone() : o1.asRangeTombstone(); + clusteringComparison = rt.isCollectionTombstone() + ? 0 + : metadata.comparator.compare(c1, c2); + } + + // Note that if both are range tombstones and have the same clustering, then they are equal. + if (clusteringComparison != 0) + return clusteringComparison; + + if (o1.isCell()) + { + LegacyCell cell1 = o1.asCell(); + if (o2.isCell()) + { + LegacyCell cell2 = o2.asCell(); + // Check for row marker cells + if (cell1.name.column == null) + return cell2.name.column == null ? 0 : -1; + return cell2.name.column == null ? 1 : cell1.name.column.compareTo(cell2.name.column); + } + + LegacyRangeTombstone rt2 = o2.asRangeTombstone(); + assert rt2.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality + if (cell1.name.column == null) + return -1; + int cmp = cell1.name.column.compareTo(rt2.start.collectionName); + // If both are for the same column, then the RT should come first + return cmp == 0 ? 1 : cmp; + } + else + { + assert o2.isCell(); + LegacyCell cell2 = o2.asCell(); + + LegacyRangeTombstone rt1 = o1.asRangeTombstone(); + assert rt1.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality + + if (cell2.name.column == null) + return 1; + + int cmp = rt1.start.collectionName.compareTo(cell2.name.column); + // If both are for the same column, then the RT should come first + return cmp == 0 ? -1 : cmp; + } + }; + } + + public static LegacyAtom readLegacyAtom(CFMetaData metadata, DataInputPlus in, boolean readAllAsDynamic) + throws IOException, UnknownColumnException + { + ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in); + if (!cellname.hasRemaining()) + return null; // END_OF_ROW + + try + { + int b = in.readUnsignedByte(); + return (b & RANGE_TOMBSTONE_MASK) != 0 + ? readLegacyRangeTombstoneBody(metadata, in, cellname) + : readLegacyCellBody(metadata, in, cellname, b, SerializationHelper.Flag.LOCAL, readAllAsDynamic); + } + catch (UnknownColumnException e) + { + // We legitimately can get here in 2 cases: + // 1) for system tables, because we've unceremoniously removed columns (without registering them as dropped) + // 2) for dropped columns. + // In any other case, there is a mismatch between the schema and the data, and we complain loudly in + // that case. Note that if we are in a legit case of an unknown column, we want to simply skip that cell, + // but we don't do this here and re-throw the exception because the calling code sometimes has to know + // about this happening. This does mean code calling this method should handle this case properly. + if (!metadata.ksName.equals(SystemKeyspace.NAME) && metadata.getDroppedColumnDefinition(e.columnName) == null) + throw new IllegalStateException(String.format("Got cell for unknown column %s in sstable of %s.%s: " + + "This suggest a problem with the schema which doesn't list " + + "this column. Even if that column was dropped, it should have " + + "been listed as such", metadata.ksName, metadata.cfName, UTF8Type.instance.compose(e.columnName)), e); + + throw e; + } + } + + public static LegacyCell readLegacyCell(CFMetaData metadata, DataInput in, SerializationHelper.Flag flag) throws IOException, UnknownColumnException + { + ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in); + int b = in.readUnsignedByte(); + return readLegacyCellBody(metadata, in, cellname, b, flag, false); + } + + public static LegacyCell readLegacyCellBody(CFMetaData metadata, DataInput in, ByteBuffer cellname, int mask, SerializationHelper.Flag flag, boolean readAllAsDynamic) + throws IOException, UnknownColumnException + { + // Note that we want to call decodeCellName only after we've deserialized other parts, since it can throw + // and we want to throw only after having deserialized the full cell. + if ((mask & COUNTER_MASK) != 0) + { + in.readLong(); // timestampOfLastDelete: this has been unused for a long time so we ignore it + long ts = in.readLong(); + ByteBuffer value = ByteBufferUtil.readWithLength(in); + if (flag == SerializationHelper.Flag.FROM_REMOTE || (flag == SerializationHelper.Flag.LOCAL && CounterContext.instance().shouldClearLocal(value))) + value = CounterContext.instance().clearAllLocal(value); + return new LegacyCell(LegacyCell.Kind.COUNTER, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL); + } + else if ((mask & EXPIRATION_MASK) != 0) + { + int ttl = in.readInt(); + int expiration = in.readInt(); + long ts = in.readLong(); + ByteBuffer value = ByteBufferUtil.readWithLength(in); + return new LegacyCell(LegacyCell.Kind.EXPIRING, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, expiration, ttl); + } + else + { + long ts = in.readLong(); + ByteBuffer value = ByteBufferUtil.readWithLength(in); + LegacyCellName name = decodeCellName(metadata, cellname, readAllAsDynamic); + return (mask & COUNTER_UPDATE_MASK) != 0 + ? new LegacyCell(LegacyCell.Kind.COUNTER, name, CounterContext.instance().createUpdate(ByteBufferUtil.toLong(value)), ts, Cell.NO_DELETION_TIME, Cell.NO_TTL) + : ((mask & DELETION_MASK) == 0 + ? new LegacyCell(LegacyCell.Kind.REGULAR, name, value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL) + : new LegacyCell(LegacyCell.Kind.DELETED, name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ts, ByteBufferUtil.toInt(value), Cell.NO_TTL)); + } + } + + public static LegacyRangeTombstone readLegacyRangeTombstoneBody(CFMetaData metadata, DataInputPlus in, ByteBuffer boundname) throws IOException + { + LegacyBound min = decodeTombstoneBound(metadata, boundname, true); + LegacyBound max = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); + DeletionTime dt = DeletionTime.serializer.deserialize(in); + return new LegacyRangeTombstone(min, max, dt); + } + + public static Iterator<LegacyCell> deserializeCells(final CFMetaData metadata, + final DataInput in, + final SerializationHelper.Flag flag, + final int size) + { + return new AbstractIterator<LegacyCell>() + { + private int i = 0; + + protected LegacyCell computeNext() + { + if (i >= size) + return endOfData(); + + ++i; + try + { + return readLegacyCell(metadata, in, flag); + } + catch (UnknownColumnException e) + { + // We can get there if we read a cell for a dropped column, and if that is the case, + // then simply ignore the cell is fine. But also not that we ignore if it's the + // system keyspace because for those table we actually remove columns without registering + // them in the dropped columns + if (metadata.ksName.equals(SystemKeyspace.NAME) || metadata.getDroppedColumnDefinition(e.columnName) != null) + return computeNext(); + else + throw new IOError(e); + } + catch (IOException e) + { + throw new IOError(e); + } + } + }; + } + + public static class CellGrouper + { + /** + * The fake TTL used for expired rows that have been compacted. + */ + private static final int FAKE_TTL = 1; + + public final CFMetaData metadata; + private final boolean isStatic; + private final SerializationHelper helper; + private final Row.Builder builder; + private Clustering clustering; + + private LegacyRangeTombstone rowDeletion; + private LegacyRangeTombstone collectionDeletion; + + /** + * Used to track if we need to add pk liveness info (row marker) when removing invalid legacy cells. + * + * In 2.1 these invalid cells existed but were not queryable, in this case specifically because they + * represented values for clustering key columns that were written as data cells. + * + * However, the presence (or not) of such cells on an otherwise empty CQL row (or partition) would decide + * if an empty result row were returned for the CQL row (or partition). To maintain this behaviour we + * insert a row marker containing the liveness info of these invalid cells iff we have no other data + * on the row. + * + * See also CASSANDRA-15365 + */ + private boolean hasValidCells = false; + private LivenessInfo invalidLivenessInfo = null; + + public CellGrouper(CFMetaData metadata, SerializationHelper helper) + { + this(metadata, helper, false); + } + + private CellGrouper(CFMetaData metadata, SerializationHelper helper, boolean isStatic) + { + this.metadata = metadata; + this.isStatic = isStatic; + this.helper = helper; + // We cannot use a sorted builder because we don't have exactly the same ordering in 3.0 and pre-3.0. More precisely, within a row, we + // store all simple columns before the complex ones in 3.0, which we use to sort everything sorted by the column name before. Note however + // that the unsorted builder won't have to reconcile cells, so the exact value we pass for nowInSec doesn't matter. + this.builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds()); + } + + public static CellGrouper staticGrouper(CFMetaData metadata, SerializationHelper helper) + { + return new CellGrouper(metadata, helper, true); + } + + public void reset() + { + this.clustering = null; + this.rowDeletion = null; + this.collectionDeletion = null; + this.invalidLivenessInfo = null; + this.hasValidCells = false; + } + + /** + * Try adding the provided atom to the currently grouped row. + * + * @param atom the new atom to try to add. This <b>must</b> be a "row" atom, that is either a cell or a legacy + * range tombstone that covers only one row (row deletion) or a subset of it (collection + * deletion). Meaning that legacy range tombstone covering multiple rows (that should be handled as + * legit range tombstone in the new storage engine) should be handled separately. Atoms should also + * be provided in proper clustering order. + * @return {@code true} if the provided atom has been "consumed" by this grouper (this does _not_ mean the + * atom has been "used" by the grouper as the grouper will skip some shadowed atoms for instance, just + * that {@link #getRow()} shouldn't be called just yet if there is more atom in the atom iterator we're + * grouping). {@code false} otherwise, that is if the row currently built by this grouper is done + * _without_ the provided atom being "consumed" (and so {@link #getRow()} should be called and the + * grouper resetted, after which the provided atom should be provided again). + */ + public boolean addAtom(LegacyAtom atom) + { + assert atom.isRowAtom(metadata) : "Unexpected non in-row legacy range tombstone " + atom; + return atom.isCell() + ? addCell(atom.asCell()) + : addRangeTombstone(atom.asRangeTombstone()); + } + + private boolean addCell(LegacyCell cell) + { + if (clustering == null) + { + clustering = cell.name.clustering; + assert !isStatic || clustering == Clustering.STATIC_CLUSTERING; + builder.newRow(clustering); + } + else if (!clustering.equals(cell.name.clustering)) + { + return false; + } + + // Ignore shadowed cells + if (rowDeletion != null && rowDeletion.deletionTime.deletes(cell.timestamp)) + return true; + + ColumnDefinition column = cell.name.column; + if (column == null) + { + // It's the row marker + assert !cell.value.hasRemaining(); + // In 2.1, the row marker expired cell might have been converted into a deleted one by compaction. + // If we do not set the primary key liveness info for this row and it does not contains any regular columns + // the row will be empty. To avoid that, we reuse the localDeletionTime but use a fake TTL. + // The only time in 2.x that we actually delete a row marker is in 2i tables, so in that case we do + // want to actually propagate the row deletion. (CASSANDRA-13320) + if (!cell.isTombstone()) + builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, cell.ttl, cell.localDeletionTime)); + else if (metadata.isIndex()) + builder.addRowDeletion(Row.Deletion.regular(new DeletionTime(cell.timestamp, cell.localDeletionTime))); + else + builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, FAKE_TTL, cell.localDeletionTime)); + hasValidCells = true; + } + else if (column.isPrimaryKeyColumn() && metadata.isCQLTable()) + { + // SSTables generated offline and side-loaded may include invalid cells which have the column name + // of a primary key column. So that we don't fail when encountering these cells, we treat them the + // same way as 2.1 did, namely we include their clusterings in the new CQL row, but drop the invalid + // column part of the cell + noSpamLogger.warn("Illegal cell name for CQL3 table {}.{}. {} is defined as a primary key column", + metadata.ksName, metadata.cfName, column.name); + + if (invalidLivenessInfo != null) + { + // when we have several invalid cells we follow the logic in LivenessInfo#supersedes when picking the PKLI to keep: + LivenessInfo newInvalidLiveness = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime); + if (newInvalidLiveness.supersedes(invalidLivenessInfo)) + invalidLivenessInfo = newInvalidLiveness; + } + else + { + invalidLivenessInfo = LivenessInfo.create(cell.timestamp, cell.isTombstone() ? FAKE_TTL : cell.ttl, cell.localDeletionTime); + } + return true; + } + else + { + if (collectionDeletion != null && collectionDeletion.start.collectionName.name.equals(column.name) && collectionDeletion.deletionTime.deletes(cell.timestamp)) + return true; + + if (helper.includes(column)) + { + hasValidCells = true; + CellPath path = null; + if (column.isComplex()) + { + // Recalling startOfComplexColumn for every cell is a big inefficient, but it's ok in practice + // and it's simpler. And since 1) this only matter for super column selection in thrift in + // practice and 2) is only used during upgrade, it's probably worth keeping things simple. + helper.startOfComplexColumn(column); + path = cell.name.collectionElement == null ? null : CellPath.create(cell.name.collectionElement); + if (!helper.includes(path)) + return true; + } + column.type.validateIfFixedSize(cell.value); + Cell c = new BufferCell(column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, path); + if (!helper.isDropped(c, column.isComplex())) + builder.addCell(c); + if (column.isComplex()) + { + helper.endOfComplexColumn(); + } + } + } + return true; + } + + private boolean addRangeTombstone(LegacyRangeTombstone tombstone) + { + if (tombstone.isRowDeletion(metadata)) + { + return addRowTombstone(tombstone); + } + else + { + // The isRowAtom() assertion back in addAtom would have already triggered otherwise, but spelling it + // out nonetheless. + assert tombstone.isCollectionTombstone(); + return addCollectionTombstone(tombstone); + } + } + + private boolean addRowTombstone(LegacyRangeTombstone tombstone) + { + if (clustering != null) + { + // If we're already in the row, there might be a chance that there were two range tombstones + // written, as 2.x storage format does not guarantee just one range tombstone, unlike 3.x. + // We have to make sure that clustering matches, which would mean that tombstone is for the + // same row. + if (clustering.equals(tombstone.start.getAsClustering(metadata))) + { + // If the tombstone superceeds the previous delete, we discard the previous one. + // This assumes that we are building the row from a sane source (ie, this row deletion + // does not delete anything already added to the builder). See CASSANDRA-15789 for details + if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime)) + { + builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime)); + rowDeletion = tombstone; + hasValidCells = true; + } + return true; + } + + // different clustering -> new row + return false; + } + + clustering = tombstone.start.getAsClustering(metadata); + builder.newRow(clustering); + builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime)); + rowDeletion = tombstone; + hasValidCells = true; + + return true; + } + + private boolean addCollectionTombstone(LegacyRangeTombstone tombstone) + { + // If the collection tombstone is not included in the query (which technically would only apply to thrift + // queries since CQL one "fetch" everything), we can skip it (so return), but we're problably still within + // the current row so we return `true`. Technically, it is possible that tombstone belongs to another row + // that the row currently grouped, but as we ignore it, returning `true` is ok in that case too. + if (!helper.includes(tombstone.start.collectionName)) + return true; // see CASSANDRA-13109 + + // The helper needs to be informed about the current complex column identifier before + // it can perform the comparison between the recorded drop time and the RT deletion time. + // If the RT has been superceded by a drop, we still return true as we don't want the + // grouper to terminate yet. + helper.startOfComplexColumn(tombstone.start.collectionName); + if (helper.isDroppedComplexDeletion(tombstone.deletionTime)) + return true; + + if (clustering == null) + { + clustering = tombstone.start.getAsClustering(metadata); + builder.newRow(clustering); + } + else if (!clustering.equals(tombstone.start.getAsClustering(metadata))) + { + return false; + } + + builder.addComplexDeletion(tombstone.start.collectionName, tombstone.deletionTime); + if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime)) + collectionDeletion = tombstone; + hasValidCells = true; + + return true; + } + + /** + * Whether the provided range tombstone starts strictly after the current row of the cell grouper (if no row is + * currently started, this return false). + */ + public boolean startsAfterCurrentRow(LegacyRangeTombstone rangeTombstone) + { + return clustering != null && metadata.comparator.compare(rangeTombstone.start.bound, clustering) > 0; + } + + /** + * The clustering of the current row of the cell grouper, or {@code null} if no row is currently started. + */ + public Clustering currentRowClustering() + { + return clustering; + } + + /** + * Generates the row currently grouped by this grouper and reset it for the following row. + * <p> + * Note that the only correct way to call this is when either all the atom we're trying to group has been + * consumed, or when {@link #addAtom(LegacyAtom)} returns {@code false}. + * + * @return the current row that has been grouped, or {@code null} in the rare case where all the atoms + * "consumed" by {@link #addAtom(LegacyAtom)} for this row were skipped (we skip atoms under a few conditions). + */ + public Row getRow() + { ++ if (clustering == null) ++ return null; + if (!hasValidCells && invalidLivenessInfo != null) + builder.addPrimaryKeyLivenessInfo(invalidLivenessInfo); + return builder.build(); + } + } + + public static class LegacyUnfilteredPartition + { + public final DeletionTime partitionDeletion; + public final LegacyRangeTombstoneList rangeTombstones; + public final List<LegacyCell> cells; + + private LegacyUnfilteredPartition(DeletionTime partitionDeletion, LegacyRangeTombstoneList rangeTombstones, List<LegacyCell> cells) + { + this.partitionDeletion = partitionDeletion; + this.rangeTombstones = rangeTombstones; + this.cells = cells; + } + + public void digest(CFMetaData metadata, MessageDigest digest) + { + for (LegacyCell cell : cells) + { + digest.update(cell.name.encode(metadata).duplicate()); + + if (cell.isCounter()) + CounterContext.instance().updateDigest(digest, cell.value); + else + digest.update(cell.value.duplicate()); + + FBUtilities.updateWithLong(digest, cell.timestamp); + FBUtilities.updateWithByte(digest, cell.serializationFlags()); + + if (cell.isExpiring()) + FBUtilities.updateWithInt(digest, cell.ttl); + + if (cell.isCounter()) + { + // Counters used to have the timestampOfLastDelete field, which we stopped using long ago and has been hard-coded + // to Long.MIN_VALUE but was still taken into account in 2.2 counter digests (to maintain backward compatibility + // in the first place). + FBUtilities.updateWithLong(digest, Long.MIN_VALUE); + } + } + + if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE) + digest.update(ByteBufferUtil.bytes(partitionDeletion.markedForDeleteAt())); + + if (!rangeTombstones.isEmpty()) + rangeTombstones.updateDigest(digest); + } + } + + public static class LegacyCellName + { + public final Clustering clustering; + public final ColumnDefinition column; + public final ByteBuffer collectionElement; + + @VisibleForTesting + public LegacyCellName(Clustering clustering, ColumnDefinition column, ByteBuffer collectionElement) + { + this.clustering = clustering; + this.column = column; + this.collectionElement = collectionElement; + } + + public static LegacyCellName create(Clustering clustering, ColumnDefinition column) + { + return new LegacyCellName(clustering, column, null); + } + + public ByteBuffer encode(CFMetaData metadata) + { + return encodeCellName(metadata, clustering, column == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : column.name.bytes, collectionElement); + } + + public ByteBuffer superColumnSubName() + { + assert collectionElement != null; + return collectionElement; + } + + public ByteBuffer superColumnName() + { + return clustering.get(0); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < clustering.size(); i++) + sb.append(i > 0 ? ":" : "").append(clustering.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(clustering.get(i))); + return String.format("Cellname(clustering=%s, column=%s, collElt=%s)", sb.toString(), column == null ? "null" : column.name, collectionElement == null ? "null" : ByteBufferUtil.bytesToHex(collectionElement)); + } + } + + public static class LegacyBound + { + public static final LegacyBound BOTTOM = new LegacyBound(Slice.Bound.BOTTOM, false, null); + public static final LegacyBound TOP = new LegacyBound(Slice.Bound.TOP, false, null); + + public final Slice.Bound bound; + public final boolean isStatic; + public final ColumnDefinition collectionName; + + public LegacyBound(Slice.Bound bound, boolean isStatic, ColumnDefinition collectionName) + { + this.bound = bound; + this.isStatic = isStatic; + this.collectionName = collectionName; + } + + public Clustering getAsClustering(CFMetaData metadata) + { + if (isStatic) + return Clustering.STATIC_CLUSTERING; + + assert bound.size() == metadata.comparator.size(); + ByteBuffer[] values = new ByteBuffer[bound.size()]; + for (int i = 0; i < bound.size(); i++) + values[i] = bound.get(i); + return new Clustering(values); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(); + sb.append(bound.kind()).append('('); + for (int i = 0; i < bound.size(); i++) + sb.append(i > 0 ? ":" : "").append(bound.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(bound.get(i))); + sb.append(')'); + return String.format("Bound(%s, collection=%s)", sb.toString(), collectionName == null ? "null" : collectionName.name); + } + } + + public interface LegacyAtom + { + public boolean isCell(); + + // note that for static atoms, LegacyCell and LegacyRangeTombstone behave differently here: + // - LegacyCell returns the modern Clustering.STATIC_CLUSTERING + // - LegacyRangeTombstone returns the 2.2 bound (i.e. N empty ByteBuffer, where N is number of clusterings) + // in LegacyDeletionInfo.add(), we split any LRT with a static bound out into the inRowRangeTombstones collection + // these are merged with regular row cells, in the CellGrouper, and their clustering is obtained via start.bound.getAsClustering + // (also, it should be impossibly to issue raw static row deletions anyway) + public ClusteringPrefix clustering(); + public boolean isStatic(); + + public LegacyCell asCell(); + public LegacyRangeTombstone asRangeTombstone(); + + /** + * Whether the atom is one that becomes part of a {@link Row} in the new storage engine, meaning it is either + * as cell or a legacy range tombstone that covers a single row, or parts of one. + */ + public boolean isRowAtom(CFMetaData metadata); + } + + /** + * A legacy cell. + * <p> + * This is used as a temporary object to facilitate dealing with the legacy format, this + * is not meant to be optimal. + */ + public static class LegacyCell implements LegacyAtom + { + private final static int DELETION_MASK = 0x01; + private final static int EXPIRATION_MASK = 0x02; + private final static int COUNTER_MASK = 0x04; + private final static int COUNTER_UPDATE_MASK = 0x08; + private final static int RANGE_TOMBSTONE_MASK = 0x10; + + public enum Kind { REGULAR, EXPIRING, DELETED, COUNTER } + + public final Kind kind; + + public final LegacyCellName name; + public final ByteBuffer value; + + public final long timestamp; + public final int localDeletionTime; + public final int ttl; + + @VisibleForTesting + public LegacyCell(Kind kind, LegacyCellName name, ByteBuffer value, long timestamp, int localDeletionTime, int ttl) + { + this.kind = kind; + this.name = name; + this.value = value; + this.timestamp = timestamp; + this.localDeletionTime = localDeletionTime; + this.ttl = ttl; + } + + public static LegacyCell regular(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp) + throws UnknownColumnException + { + return new LegacyCell(Kind.REGULAR, decodeCellName(metadata, superColumnName, name), value, timestamp, Cell.NO_DELETION_TIME, Cell.NO_TTL); + } + + public static LegacyCell expiring(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp, int ttl, int nowInSec) + throws UnknownColumnException + { + /* + * CASSANDRA-14092: Max expiration date capping is maybe performed here, expiration overflow policy application + * is done at {@link org.apache.cassandra.thrift.ThriftValidation#validateTtl(CFMetaData, Column)} + */ + return new LegacyCell(Kind.EXPIRING, decodeCellName(metadata, superColumnName, name), value, timestamp, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl), ttl); + } + + public static LegacyCell tombstone(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long timestamp, int nowInSec) + throws UnknownColumnException + { + return new LegacyCell(Kind.DELETED, decodeCellName(metadata, superColumnName, name), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, nowInSec, LivenessInfo.NO_TTL); + } + + public static LegacyCell counterUpdate(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long value) + throws UnknownColumnException + { + // See UpdateParameters.addCounter() for more details on this + ByteBuffer counterValue = CounterContext.instance().createUpdate(value); + return counter(decodeCellName(metadata, superColumnName, name), counterValue); + } + + public static LegacyCell counter(LegacyCellName name, ByteBuffer value) + { + return new LegacyCell(Kind.COUNTER, name, value, FBUtilities.timestampMicros(), Cell.NO_DELETION_TIME, Cell.NO_TTL); + } + + public byte serializationFlags() + { + if (isExpiring()) + return EXPIRATION_MASK; + if (isTombstone()) + return DELETION_MASK; + if (isCounterUpdate()) + return COUNTER_UPDATE_MASK; + if (isCounter()) + return COUNTER_MASK; + return 0; + } + + public boolean isCounterUpdate() + { + // See UpdateParameters.addCounter() for more details on this + return isCounter() && CounterContext.instance().isUpdate(value); + } + + public ClusteringPrefix clustering() + { + return name.clustering; + } + + public boolean isStatic() + { + return name.clustering == Clustering.STATIC_CLUSTERING; + } + + public boolean isCell() + { + return true; + } + + public LegacyCell asCell() + { + return this; + } + + public LegacyRangeTombstone asRangeTombstone() + { + throw new UnsupportedOperationException(); + } + + @Override + public boolean isRowAtom(CFMetaData metaData) + { + return true; + } + + public boolean isCounter() + { + return kind == Kind.COUNTER; + } + + public boolean isExpiring() + { + return kind == Kind.EXPIRING; + } + + public boolean isTombstone() + { + return kind == Kind.DELETED; + } + + public boolean isLive(int nowInSec) + { + if (isTombstone()) + return false; + + return !isExpiring() || nowInSec < localDeletionTime; + } + + @Override + public String toString() + { + return String.format("LegacyCell(%s, name=%s, v=%s, ts=%s, ldt=%s, ttl=%s)", kind, name, ByteBufferUtil.bytesToHex(value), timestamp, localDeletionTime, ttl); + } + } + + /** + * A legacy range tombstone. + * <p> + * This is used as a temporary object to facilitate dealing with the legacy format, this + * is not meant to be optimal. + */ + public static class LegacyRangeTombstone implements LegacyAtom + { + public final LegacyBound start; + public final LegacyBound stop; + public final DeletionTime deletionTime; + + public LegacyRangeTombstone(LegacyBound start, LegacyBound stop, DeletionTime deletionTime) + { + // Because of the way RangeTombstoneList work, we can have a tombstone where only one of + // the bound has a collectionName. That happens if we have a big tombstone A (spanning one + // or multiple rows) and a collection tombstone B. In that case, RangeTombstoneList will + // split this into 3 RTs: the first one from the beginning of A to the beginning of B, + // then B, then a third one from the end of B to the end of A. To make this simpler, if + // we detect that case we transform the 1st and 3rd tombstone so they don't end in the middle + // of a row (which is still correct). + if ((start.collectionName == null) != (stop.collectionName == null)) + { + if (start.collectionName == null) + stop = new LegacyBound(Slice.Bound.inclusiveEndOf(stop.bound.values), stop.isStatic, null); + else + start = new LegacyBound(Slice.Bound.inclusiveStartOf(start.bound.values), start.isStatic, null); + } + else if (!Objects.equals(start.collectionName, stop.collectionName)) + { + // We're in the similar but slightly more complex case where on top of the big tombstone + // A, we have 2 (or more) collection tombstones B and C within A. So we also end up with + // a tombstone that goes between the end of B and the start of C. + start = new LegacyBound(start.bound, start.isStatic, null); + stop = new LegacyBound(stop.bound, stop.isStatic, null); + } + + this.start = start; + this.stop = stop; + this.deletionTime = deletionTime; + } + + /** @see LegacyAtom#clustering for static inconsistencies explained */ + public ClusteringPrefix clustering() + { + return start.bound; + } + + public LegacyRangeTombstone withNewStart(LegacyBound newStart) + { + return new LegacyRangeTombstone(newStart, stop, deletionTime); + } + + public LegacyRangeTombstone withNewStart(Slice.Bound newStart) + { + return withNewStart(new LegacyBound(newStart, start.isStatic, null)); + } + + public LegacyRangeTombstone withNewEnd(LegacyBound newStop) + { + return new LegacyRangeTombstone(start, newStop, deletionTime); + } + + public LegacyRangeTombstone withNewEnd(Slice.Bound newEnd) + { + return withNewEnd(new LegacyBound(newEnd, stop.isStatic, null)); + } + + public boolean isCell() + { + return false; + } + + public boolean isStatic() + { + return start.isStatic || stop.isStatic; + } + + public LegacyCell asCell() + { + throw new UnsupportedOperationException(); + } + + public LegacyRangeTombstone asRangeTombstone() + { + return this; + } + + @Override + public boolean isRowAtom(CFMetaData metadata) + { + return isCollectionTombstone() || isRowDeletion(metadata); + } + + public boolean isCollectionTombstone() + { + return start.collectionName != null; + } + + public boolean isRowDeletion(CFMetaData metadata) + { + if (start.collectionName != null + || stop.collectionName != null + || start.bound.size() != metadata.comparator.size() + || stop.bound.size() != metadata.comparator.size()) + return false; + + for (int i = 0; i < start.bound.size(); i++) + if (!Objects.equals(start.bound.get(i), stop.bound.get(i))) + return false; + return true; + } + + @Override + public String toString() + { + return String.format("RT(%s-%s, %s)", start, stop, deletionTime); + } + } + + public static class LegacyDeletionInfo + { + public final MutableDeletionInfo deletionInfo; + public final List<LegacyRangeTombstone> inRowTombstones = new ArrayList<>(); + + private LegacyDeletionInfo(MutableDeletionInfo deletionInfo) + { + this.deletionInfo = deletionInfo; + } + + public static LegacyDeletionInfo live() + { + return new LegacyDeletionInfo(MutableDeletionInfo.live()); + } + + public void add(DeletionTime topLevel) + { + deletionInfo.add(topLevel); + } + + private static Slice.Bound staticBound(CFMetaData metadata, boolean isStart) + { + // In pre-3.0 nodes, static row started by a clustering with all empty values so we + // preserve that here. Note that in practice, it doesn't really matter since the rest + // of the code will ignore the bound for RT that have their static flag set. + ByteBuffer[] values = new ByteBuffer[metadata.comparator.size()]; + for (int i = 0; i < values.length; i++) + values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER; + return isStart + ? Slice.Bound.inclusiveStartOf(values) + : Slice.Bound.inclusiveEndOf(values); + } + + public void add(CFMetaData metadata, LegacyRangeTombstone tombstone) + { + if (metadata.hasStaticColumns()) + { + /* + * For table having static columns we have to deal with the following cases: + * 1. the end of the tombstone is static (in which case either the start is static or is BOTTOM, which is the same + * for our consideration). This mean that either the range only delete the static row, or that it's a collection + * tombstone of a static collection. In both case, we just add the tombstone to the inRowTombstones. + * 2. only the start is static. There is then 2 subcase: either the start is inclusive, and that mean we include the + * static row and more (so we add an inRowTombstone for the static and deal with the rest normally). Or the start + * is exclusive, and that means we explicitely exclude the static (in which case we can just add the tombstone + * as if it started at BOTTOM). + * 3. none of the bound are static but the start is BOTTOM. This means we intended to delete the static row so we + * need to add it to the inRowTombstones (and otherwise handle the range normally). + */ + if (tombstone.stop.isStatic) + { + // If the start is BOTTOM, we replace it by the beginning of the starting row so as to not confuse the + // RangeTombstone.isRowDeletion() method + if (tombstone.start == LegacyBound.BOTTOM) + tombstone = tombstone.withNewStart(new LegacyBound(staticBound(metadata, true), true, null)); + inRowTombstones.add(tombstone); + return; + } + + if (tombstone.start.isStatic) + { + if (tombstone.start.bound.isInclusive()) + inRowTombstones.add(tombstone.withNewEnd(new LegacyBound(staticBound(metadata, false), true, null))); + + tombstone = tombstone.withNewStart(LegacyBound.BOTTOM); + } + else if (tombstone.start == LegacyBound.BOTTOM) + { + inRowTombstones.add(new LegacyRangeTombstone(new LegacyBound(staticBound(metadata, true), true, null), + new LegacyBound(staticBound(metadata, false), true, null), + tombstone.deletionTime)); + } + } + + if (tombstone.isCollectionTombstone() || tombstone.isRowDeletion(metadata)) + inRowTombstones.add(tombstone); + else + add(metadata, new RangeTombstone(Slice.make(tombstone.start.bound, tombstone.stop.bound), tombstone.deletionTime)); + } + + public void add(CFMetaData metadata, RangeTombstone tombstone) + { + deletionInfo.add(tombstone, metadata.comparator); + } + + public Iterator<LegacyRangeTombstone> inRowRangeTombstones() + { + return inRowTombstones.iterator(); + } + + public static LegacyDeletionInfo deserialize(CFMetaData metadata, DataInputPlus in) throws IOException + { + DeletionTime topLevel = DeletionTime.serializer.deserialize(in); + + int rangeCount = in.readInt(); + if (rangeCount == 0) + return new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); + + LegacyDeletionInfo delInfo = new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); + for (int i = 0; i < rangeCount; i++) + { + LegacyBound start = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), true); + LegacyBound end = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); + int delTime = in.readInt(); + long markedAt = in.readLong(); + + delInfo.add(metadata, new LegacyRangeTombstone(start, end, new DeletionTime(markedAt, delTime))); + } + return delInfo; + } + } + + /** + * A helper class for LegacyRangeTombstoneList. This replaces the Comparator<Composite> that RTL used before 3.0. + */ + private static class LegacyBoundComparator implements Comparator<LegacyBound> + { + ClusteringComparator clusteringComparator; + + public LegacyBoundComparator(ClusteringComparator clusteringComparator) + { + this.clusteringComparator = clusteringComparator; + } + + public int compare(LegacyBound a, LegacyBound b) + { + // In the legacy sorting, BOTTOM comes before anything else + if (a == LegacyBound.BOTTOM) + return b == LegacyBound.BOTTOM ? 0 : -1; + if (b == LegacyBound.BOTTOM) + return 1; + + // Excluding BOTTOM, statics are always before anything else. + if (a.isStatic != b.isStatic) + return a.isStatic ? -1 : 1; + + // We have to be careful with bound comparison because of collections. Namely, if the 2 bounds represent the + // same prefix, then we should take the collectionName into account before taking the bounds kind + // (ClusteringPrefix.Kind). This means we can't really call ClusteringComparator.compare() directly. + // For instance, if + // a is (bound=INCL_START_BOUND('x'), collectionName='d') + // b is (bound=INCL_END_BOUND('x'), collectionName='c') + // Ten b < a since the element 'c' of collection 'x' comes before element 'd', but calling + // clusteringComparator.compare(a.bound, b.bound) returns -1. + // See CASSANDRA-13125 for details. + int sa = a.bound.size(); + int sb = b.bound.size(); + for (int i = 0; i < Math.min(sa, sb); i++) + { + int cmp = clusteringComparator.compareComponent(i, a.bound.get(i), b.bound.get(i)); + if (cmp != 0) + return cmp; + } + + if (sa != sb) + return sa < sb ? a.bound.kind().comparedToClustering : -b.bound.kind().comparedToClustering; + + // Both bound represent the same prefix, compare the collection names + // If one has a collection name and the other doesn't, the other comes before as it points to the beginning of the row. + if ((a.collectionName == null) != (b.collectionName == null)) + return a.collectionName == null ? -1 : 1; + + // If they both have a collection, compare that first + if (a.collectionName != null) + { + int cmp = UTF8Type.instance.compare(a.collectionName.name.bytes, b.collectionName.name.bytes); + if (cmp != 0) + return cmp; + } + + // Lastly, if everything so far is equal, compare their clustering kind + return ClusteringPrefix.Kind.compare(a.bound.kind(), b.bound.kind()); + } + } + + /** + * Almost an entire copy of RangeTombstoneList from C* 2.1. The main difference is that LegacyBoundComparator + * is used in place of Comparator<Composite> (because Composite doesn't exist any more). + * + * This class is needed to allow us to convert single-row deletions and complex deletions into range tombstones + * and properly merge them into the normal set of range tombstones. + */ + public static class LegacyRangeTombstoneList + { + private final LegacyBoundComparator comparator; + + // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could + // use a List for starts and ends, but having arrays everywhere is almost simpler. + LegacyBound[] starts; + LegacyBound[] ends; + private long[] markedAts; + private int[] delTimes; + + private int size; + + private LegacyRangeTombstoneList(LegacyBoundComparator comparator, LegacyBound[] starts, LegacyBound[] ends, long[] markedAts, int[] delTimes, int size) + { + assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length; + this.comparator = comparator; + this.starts = starts; + this.ends = ends; + this.markedAts = markedAts; + this.delTimes = delTimes; + this.size = size; + } + + public LegacyRangeTombstoneList(LegacyBoundComparator comparator, int capacity) + { + this(comparator, new LegacyBound[capacity], new LegacyBound[capacity], new long[capacity], new int[capacity], 0); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(); + sb.append('['); + for (int i = 0; i < size; i++) + { + if (i > 0) + sb.append(','); + sb.append('(').append(starts[i]).append(", ").append(ends[i]).append(')'); + } + return sb.append(']').toString(); + } + + public boolean isEmpty() + { + return size == 0; + } + + public int size() + { + return size; + } + + /** + * Adds a new range tombstone. + * + * This method will be faster if the new tombstone sort after all the currently existing ones (this is a common use case), + * but it doesn't assume it. + */ + public void add(LegacyBound start, LegacyBound end, long markedAt, int delTime) + { + if (isEmpty()) + { + addInternal(0, start, end, markedAt, delTime); + return; + } + + int c = comparator.compare(ends[size-1], start); + + // Fast path if we add in sorted order + if (c <= 0) + { + addInternal(size, start, end, markedAt, delTime); + } + else + { + // Note: insertFrom expect i to be the insertion point in term of interval ends + int pos = Arrays.binarySearch(ends, 0, size, start, comparator); + insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime); + } + } + + /* + * Inserts a new element starting at index i. This method assumes that: + * ends[i-1] <= start <= ends[i] + * + * A RangeTombstoneList is a list of range [s_0, e_0]...[s_n, e_n] such that: + * - s_i <= e_i + * - e_i <= s_i+1 + * - if s_i == e_i and e_i == s_i+1 then s_i+1 < e_i+1 + * Basically, range are non overlapping except for their bound and in order. And while + * we allow ranges with the same value for the start and end, we don't allow repeating + * such range (so we can't have [0, 0][0, 0] even though it would respect the first 2 + * conditions). + * + */ + + /** + * Adds all the range tombstones of {@code tombstones} to this RangeTombstoneList. + */ + public void addAll(LegacyRangeTombstoneList tombstones) + { + if (tombstones.isEmpty()) + return; + + if (isEmpty()) + { + copyArrays(tombstones, this); + return; + } + + /* + * We basically have 2 techniques we can use here: either we repeatedly call add() on tombstones values, + * or we do a merge of both (sorted) lists. If this lists is bigger enough than the one we add, then + * calling add() will be faster, otherwise it's merging that will be faster. + * + * Let's note that during memtables updates, it might not be uncommon that a new update has only a few range + * tombstones, while the CF we're adding it to (the one in the memtable) has many. In that case, using add() is + * likely going to be faster. + * + * In other cases however, like when diffing responses from multiple nodes, the tombstone lists we "merge" will + * be likely sized, so using add() might be a bit inefficient. + * + * Roughly speaking (this ignore the fact that updating an element is not exactly constant but that's not a big + * deal), if n is the size of this list and m is tombstones size, merging is O(n+m) while using add() is O(m*log(n)). + * + * But let's not crank up a logarithm computation for that. Long story short, merging will be a bad choice only + * if this list size is lot bigger that the other one, so let's keep it simple. + */ + if (size > 10 * tombstones.size) + { + for (int i = 0; i < tombstones.size; i++) + add(tombstones.starts[i], tombstones.ends[i], tombstones.markedAts[i], tombstones.delTimes[i]); + } + else + { + int i = 0; + int j = 0; + while (i < size && j < tombstones.size) + { + if (comparator.compare(tombstones.starts[j], ends[i]) <= 0) + { + insertFrom(i, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); + j++; + } + else + { + i++; + } + } + // Addds the remaining ones from tombstones if any (note that addInternal will increment size if relevant). + for (; j < tombstones.size; j++) + addInternal(size, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); + } + } + + private static void copyArrays(LegacyRangeTombstoneList src, LegacyRangeTombstoneList dst) + { + dst.grow(src.size); + System.arraycopy(src.starts, 0, dst.starts, 0, src.size); + System.arraycopy(src.ends, 0, dst.ends, 0, src.size); + System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size); + System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size); + dst.size = src.size; + } + + private void insertFrom(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) + { + while (i < size) + { + assert i == 0 || comparator.compare(ends[i-1], start) <= 0; + + int c = comparator.compare(start, ends[i]); + assert c <= 0; + if (c == 0) + { + // If start == ends[i], then we can insert from the next one (basically the new element + // really start at the next element), except for the case where starts[i] == ends[i]. + // In this latter case, if we were to move to next element, we could end up with ...[x, x][x, x]... + if (comparator.compare(starts[i], ends[i]) == 0) + { + // The current element cover a single value which is equal to the start of the inserted + // element. If the inserted element overwrites the current one, just remove the current + // (it's included in what we insert) and proceed with the insert. + if (markedAt > markedAts[i]) + { + removeInternal(i); + continue; + } + + // Otherwise (the current singleton interval override the new one), we want to leave the + // current element and move to the next, unless start == end since that means the new element + // is in fact fully covered by the current one (so we're done) + if (comparator.compare(start, end) == 0) + return; + } + i++; + continue; + } + + // Do we overwrite the current element? + if (markedAt > markedAts[i]) + { + // We do overwrite. + + // First deal with what might come before the newly added one. + if (comparator.compare(starts[i], start) < 0) + { + addInternal(i, starts[i], start, markedAts[i], delTimes[i]); + i++; + // We don't need to do the following line, but in spirit that's what we want to do + // setInternal(i, start, ends[i], markedAts, delTime]) + } + + // now, start <= starts[i] + + // Does the new element stops before/at the current one, + int endCmp = comparator.compare(end, starts[i]); + if (endCmp <= 0) + { + // Here start <= starts[i] and end <= starts[i] + // This means the current element is before the current one. However, one special + // case is if end == starts[i] and starts[i] == ends[i]. In that case, + // the new element entirely overwrite the current one and we can just overwrite + if (endCmp == 0 && comparator.compare(starts[i], ends[i]) == 0) + setInternal(i, start, end, markedAt, delTime); + else + addInternal(i, start, end, markedAt, delTime); + return; + } + + // Do we overwrite the current element fully? + int cmp = comparator.compare(ends[i], end); + if (cmp <= 0) + { + // We do overwrite fully: + // update the current element until it's end and continue + // on with the next element (with the new inserted start == current end). + + // If we're on the last element, we can optimize + if (i == size-1) + { + setInternal(i, start, end, markedAt, delTime); + return; + } + + setInternal(i, start, ends[i], markedAt, delTime); + if (cmp == 0) + return; + + start = ends[i]; + i++; + } + else + { + // We don't ovewrite fully. Insert the new interval, and then update the now next + // one to reflect the not overwritten parts. We're then done. + addInternal(i, start, end, markedAt, delTime); + i++; + setInternal(i, end, ends[i], markedAts[i], delTimes[i]); + return; + } + } + else + { + // we don't overwrite the current element + + // If the new interval starts before the current one, insert that new interval + if (comparator.compare(start, starts[i]) < 0) + { + // If we stop before the start of the current element, just insert the new + // interval and we're done; otherwise insert until the beginning of the + // current element + if (comparator.compare(end, starts[i]) <= 0) + { + addInternal(i, start, end, markedAt, delTime); + return; + } + addInternal(i, start, starts[i], markedAt, delTime); + i++; + } + + // After that, we're overwritten on the current element but might have + // some residual parts after ... + + // ... unless we don't extend beyond it. + if (comparator.compare(end, ends[i]) <= 0) + return; + + start = ends[i]; + i++; + } + } + + // If we got there, then just insert the remainder at the end + addInternal(i, start, end, markedAt, delTime); + } + + private int capacity() + { + return starts.length; + } + + private void addInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) + { + assert i >= 0; + + if (size == capacity()) + growToFree(i); + else if (i < size) + moveElements(i); + + setInternal(i, start, end, markedAt, delTime); + size++; + } + + private void removeInternal(int i) + { + assert i >= 0; + + System.arraycopy(starts, i+1, starts, i, size - i - 1); + System.arraycopy(ends, i+1, ends, i, size - i - 1); + System.arraycopy(markedAts, i+1, markedAts, i, size - i - 1); + System.arraycopy(delTimes, i+1, delTimes, i, size - i - 1); + + --size; + starts[size] = null; + ends[size] = null; + } + + /* + * Grow the arrays, leaving index i "free" in the process. + */ + private void growToFree(int i) + { + int newLength = (capacity() * 3) / 2 + 1; + grow(i, newLength); + } + + /* + * Grow the arrays to match newLength capacity. + */ + private void grow(int newLength) + { + if (capacity() < newLength) + grow(-1, newLength); + } + + private void grow(int i, int newLength) + { + starts = grow(starts, size, newLength, i); + ends = grow(ends, size, newLength, i); + markedAts = grow(markedAts, size, newLength, i); + delTimes = grow(delTimes, size, newLength, i); + } + + private static LegacyBound[] grow(LegacyBound[] a, int size, int newLength, int i) + { + if (i < 0 || i >= size) + return Arrays.copyOf(a, newLength); + + LegacyBound[] newA = new LegacyBound[newLength]; + System.arraycopy(a, 0, newA, 0, i); + System.arraycopy(a, i, newA, i+1, size - i); + return newA; + } + + private static long[] grow(long[] a, int size, int newLength, int i) + { + if (i < 0 || i >= size) + return Arrays.copyOf(a, newLength); + + long[] newA = new long[newLength]; + System.arraycopy(a, 0, newA, 0, i); + System.arraycopy(a, i, newA, i+1, size - i); + return newA; + } + + private static int[] grow(int[] a, int size, int newLength, int i) + { + if (i < 0 || i >= size) + return Arrays.copyOf(a, newLength); + + int[] newA = new int[newLength]; + System.arraycopy(a, 0, newA, 0, i); + System.arraycopy(a, i, newA, i+1, size - i); + return newA; + } + + /* + * Move elements so that index i is "free", assuming the arrays have at least one free slot at the end. + */ + private void moveElements(int i) + { + if (i >= size) + return; + + System.arraycopy(starts, i, starts, i+1, size - i); + System.arraycopy(ends, i, ends, i+1, size - i); + System.arraycopy(markedAts, i, markedAts, i+1, size - i); + System.arraycopy(delTimes, i, delTimes, i+1, size - i); + // we set starts[i] to null to indicate the position is now empty, so that we update boundaryHeapSize + // when we set it + starts[i] = null; + } + + private void setInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) + { + starts[i] = start; + ends[i] = end; + markedAts[i] = markedAt; + delTimes[i] = delTime; + } + + public void updateDigest(MessageDigest digest) + { + ByteBuffer longBuffer = ByteBuffer.allocate(8); + for (int i = 0; i < size; i++) + { + for (int j = 0; j < starts[i].bound.size(); j++) + digest.update(starts[i].bound.get(j).duplicate()); + if (starts[i].collectionName != null) + digest.update(starts[i].collectionName.name.bytes.duplicate()); + for (int j = 0; j < ends[i].bound.size(); j++) + digest.update(ends[i].bound.get(j).duplicate()); + if (ends[i].collectionName != null) + digest.update(ends[i].collectionName.name.bytes.duplicate()); + + longBuffer.putLong(0, markedAts[i]); + digest.update(longBuffer.array(), 0, 8); + } + } + + public void serialize(DataOutputPlus out, CFMetaData metadata) throws IOException + { + out.writeInt(size); + if (size == 0) + return; + + if (metadata.isCompound()) + serializeCompound(out, metadata.isDense()); + else + serializeSimple(out); + } + + private void serializeCompound(DataOutputPlus out, boolean isDense) throws IOException + { + List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); + + if (!isDense) + types.add(UTF8Type.instance); + + CompositeType type = CompositeType.getInstance(types); + + for (int i = 0; i < size; i++) + { + LegacyBound start = starts[i]; + LegacyBound end = ends[i]; + + CompositeType.Builder startBuilder = type.builder(start.isStatic); + CompositeType.Builder endBuilder = type.builder(end.isStatic); + for (int j = 0; j < start.bound.clustering().size(); j++) + { + startBuilder.add(start.bound.get(j)); + endBuilder.add(end.bound.get(j)); + } + + if (start.collectionName != null) + startBuilder.add(start.collectionName.name.bytes); + if (end.collectionName != null) + endBuilder.add(end.collectionName.name.bytes); + + ByteBufferUtil.writeWithShortLength(startBuilder.build(), out); + ByteBufferUtil.writeWithShortLength(endBuilder.buildAsEndOfRange(), out); + + out.writeInt(delTimes[i]); + out.writeLong(markedAts[i]); + } + } + + private void serializeSimple(DataOutputPlus out) throws IOException + { + List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); + assert types.size() == 1 : types; + + for (int i = 0; i < size; i++) + { + LegacyBound start = starts[i]; + LegacyBound end = ends[i]; + + ClusteringPrefix startClustering = start.bound.clustering(); + ClusteringPrefix endClustering = end.bound.clustering(); + + assert startClustering.size() == 1; + assert endClustering.size() == 1; + + ByteBufferUtil.writeWithShortLength(startClustering.get(0), out); + ByteBufferUtil.writeWithShortLength(endClustering.get(0), out); + + out.writeInt(delTimes[i]); + out.writeLong(markedAts[i]); + } + } + + public long serializedSize(CFMetaData metadata) + { + long size = 0; + size += TypeSizes.sizeof(this.size); + + if (this.size == 0) + return size; + + if (metadata.isCompound()) + return size + serializedSizeCompound(metadata.isDense()); + else + return size + serializedSizeSimple(); + } + + private long serializedSizeCompound(boolean isDense) + { + long size = 0; + List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); + if (!isDense) + types.add(UTF8Type.instance); + CompositeType type = CompositeType.getInstance(types); + + for (int i = 0; i < this.size; i++) + { + LegacyBound start = starts[i]; + LegacyBound end = ends[i]; + + CompositeType.Builder startBuilder = type.builder(); + CompositeType.Builder endBuilder = type.builder(); + for (int j = 0; j < start.bound.size(); j++) + startBuilder.add(start.bound.get(j)); + for (int j = 0; j < end.bound.size(); j++) + endBuilder.add(end.bound.get(j)); + + if (start.collectionName != null) + startBuilder.add(start.collectionName.name.bytes); + if (end.collectionName != null) + endBuilder.add(end.collectionName.name.bytes); + + size += ByteBufferUtil.serializedSizeWithShortLength(startBuilder.build()); + size += ByteBufferUtil.serializedSizeWithShortLength(endBuilder.buildAsEndOfRange()); + + size += TypeSizes.sizeof(delTimes[i]); + size += TypeSizes.sizeof(markedAts[i]); + } + return size; + } + + private long serializedSizeSimple() + { + long size = 0; + List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); + assert types.size() == 1 : types; + + for (int i = 0; i < this.size; i++) + { + LegacyBound start = starts[i]; + LegacyBound end = ends[i]; + + ClusteringPrefix startClustering = start.bound.clustering(); + ClusteringPrefix endClustering = end.bound.clustering(); + + assert startClustering.size() == 1; + assert endClustering.size() == 1; + + size += ByteBufferUtil.serializedSizeWithShortLength(startClustering.get(0)); + size += ByteBufferUtil.serializedSizeWithShortLength(endClustering.get(0)); + + size += TypeSizes.sizeof(delTimes[i]); + size += TypeSizes.sizeof(markedAts[i]); + } + return size; + } + } +} diff --cc test/distributed/org/apache/cassandra/distributed/upgrade/MigrateDropColumnsTest.java index 0000000,0000000..0af024c new file mode 100644 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MigrateDropColumnsTest.java @@@ -1,0 -1,0 +1,106 @@@ ++package org.apache.cassandra.distributed.upgrade; ++ ++import java.util.Arrays; ++import java.util.Collections; ++ ++import com.google.common.collect.ImmutableMap; ++import com.google.common.collect.ImmutableSet; ++import com.google.common.collect.Sets; ++import org.junit.Assert; ++import org.junit.Test; ++ ++import org.apache.cassandra.db.marshal.CompositeType; ++import org.apache.cassandra.db.marshal.Int32Type; ++import org.apache.cassandra.db.marshal.MapType; ++import org.apache.cassandra.distributed.api.ConsistencyLevel; ++import org.apache.cassandra.distributed.api.Feature; ++import org.apache.cassandra.distributed.api.ICoordinator; ++import org.apache.cassandra.distributed.api.QueryResults; ++import org.apache.cassandra.distributed.api.SimpleQueryResult; ++import org.apache.cassandra.distributed.shared.AssertUtils; ++import org.apache.cassandra.distributed.shared.Versions; ++import org.apache.cassandra.distributed.test.ThriftClientUtils; ++import org.apache.cassandra.thrift.Deletion; ++import org.apache.cassandra.thrift.Mutation; ++import org.apache.cassandra.thrift.SlicePredicate; ++import org.apache.cassandra.thrift.SliceRange; ++import org.apache.cassandra.utils.ByteBufferUtil; ++ ++public class MigrateDropColumnsTest extends UpgradeTestBase ++{ ++ private static final MapType MAP_TYPE = MapType.getInstance(Int32Type.instance, Int32Type.instance, true); ++ ++ @Test ++ public void dropColumns() throws Throwable ++ { ++ new TestCase() ++ .upgrade(Versions.Major.v22, Versions.Major.v30) ++// .upgrade(Versions.Major.v22, Versions.Major.v3X) ++// .upgrade(Versions.Major.v30, Versions.Major.v3X) ++// .upgrade(Versions.Major.v22, Versions.Major.v30, Versions.Major.v3X) ++ .withConfig(c -> c.with(Feature.NATIVE_PROTOCOL)) ++ .setup(cluster -> { ++ cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl(pk int, tables map<int, int>, PRIMARY KEY (pk))")); ++ ++ ICoordinator coordinator = cluster.coordinator(1); ++ ++ // write a RT to pk=0 ++ ThriftClientUtils.thriftClient(cluster.get(1), thrift -> { ++ thrift.set_keyspace(KEYSPACE); ++ ++ Mutation mutation = new Mutation(); ++ Deletion deletion = new Deletion(); ++ SlicePredicate slice = new SlicePredicate(); ++ SliceRange range = new SliceRange(); ++ range.setStart(CompositeType.build(ByteBufferUtil.bytes("tables"))); ++ range.setFinish(CompositeType.build(ByteBufferUtil.bytes("tables"))); ++ slice.setSlice_range(range); ++ deletion.setPredicate(slice); ++ deletion.setTimestamp(System.currentTimeMillis()); ++ mutation.setDeletion(deletion); ++ ++ thrift.batch_mutate(Collections.singletonMap(ByteBufferUtil.bytes(0), ++ Collections.singletonMap("tbl", Arrays.asList(mutation))), ++ org.apache.cassandra.thrift.ConsistencyLevel.ALL); ++ }); ++ ++ // write table to pk=1 ++ // NOTE: because jvm-dtest doesn't support collections in the execute interface (see CASSANDRA-15969) ++ // need to encode to a ByteBuffer first ++ coordinator.execute(withKeyspace("INSERT INTO %s.tbl (pk, tables) VALUES (?, ?)"), ConsistencyLevel.ONE, 1, MAP_TYPE.decompose(ImmutableMap.of(1, 1))); ++ ++ cluster.forEach(inst -> inst.flush(KEYSPACE)); ++ ++ cluster.schemaChange(withKeyspace("ALTER TABLE %s.tbl DROP tables")); ++ }) ++ .runAfterClusterUpgrade(cluster -> { ++ ICoordinator coordinator = cluster.coordinator(1); ++ SimpleQueryResult qr = coordinator.executeWithResult("SELECT column_name " + ++ "FROM system_schema.dropped_columns " + ++ "WHERE keyspace_name=?" + ++ " AND table_name=?;", ++ ConsistencyLevel.ALL, KEYSPACE, "tbl"); ++ Assert.assertEquals(ImmutableSet.of("tables"), Sets.newHashSet(qr.map(r -> r.getString("column_name")))); ++ ++ assertRows(coordinator); ++ ++ // upgradesstables, make sure everything is still working ++ cluster.forEach(n -> n.nodetoolResult("upgradesstables", KEYSPACE).asserts().success()); ++ ++ assertRows(coordinator); ++ }) ++ .run(); ++ } ++ ++ private static void assertRows(ICoordinator coordinator) ++ { ++ // since only a RT was written to this row there is no liveness information, so the row will be skipped ++ AssertUtils.assertRows( ++ coordinator.executeWithResult(withKeyspace("SELECT * FROM %s.tbl WHERE pk=?"), ConsistencyLevel.ALL, 0), ++ QueryResults.empty()); ++ ++ AssertUtils.assertRows( ++ coordinator.executeWithResult(withKeyspace("SELECT * FROM %s.tbl WHERE pk=?"), ConsistencyLevel.ALL, 1), ++ QueryResults.builder().row(1).build()); ++ } ++} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
