Repository: cassandra Updated Branches: refs/heads/cassandra-3.0 322f7e9bb -> 45937def3 refs/heads/cassandra-3.11 4f30dae1d -> 4d3f5a32b refs/heads/trunk 44cffc0b1 -> 5069b2c0f
DESC order reads can fail to return the last Unfiltered in the partition patch by Aleksey Yeschenko; reviewed by Sam Tunnicliffe and Benedict Elliott Smith for CASSANDRA-14766 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/45937def Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/45937def Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/45937def Branch: refs/heads/cassandra-3.0 Commit: 45937def313bbb32024ae890f830e23bcc6ccae5 Parents: 322f7e9 Author: Aleksey Yeshchenko <alek...@apple.com> Authored: Tue Sep 18 13:12:11 2018 +0100 Committer: Aleksey Yeshchenko <alek...@apple.com> Committed: Tue Sep 25 17:02:06 2018 +0100 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../cassandra/db/UnfilteredDeserializer.java | 115 ++++++++++++------- ...bles-legacy_ka_14766-ka-1-CompressionInfo.db | Bin 0 -> 43 bytes .../legacy_tables-legacy_ka_14766-ka-1-Data.db | Bin 0 -> 103 bytes ...gacy_tables-legacy_ka_14766-ka-1-Digest.sha1 | 1 + ...legacy_tables-legacy_ka_14766-ka-1-Filter.db | Bin 0 -> 16 bytes .../legacy_tables-legacy_ka_14766-ka-1-Index.db | Bin 0 -> 134 bytes ...cy_tables-legacy_ka_14766-ka-1-Statistics.db | Bin 0 -> 4450 bytes ...egacy_tables-legacy_ka_14766-ka-1-Summary.db | Bin 0 -> 92 bytes .../legacy_tables-legacy_ka_14766-ka-1-TOC.txt | 8 ++ .../cassandra/io/sstable/LegacySSTableTest.java | 27 ++++- 11 files changed, 112 insertions(+), 40 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 195c97c..43628b2 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 3.0.18 + * DESC order reads can fail to return the last Unfiltered in the partition (CASSANDRA-14766) * Fix corrupted collection deletions for dropped columns in 3.0 <-> 2.{1,2} messages (CASSANDRA-14568) * Fix corrupted static collection deletions in 3.0 <-> 2.{1,2} messages (CASSANDRA-14568) * Handle failures in parallelAllSSTableOperation (cleanup/upgradesstables/etc) (CASSANDRA-14657) http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java b/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java index 0aa5741..62ad76a 100644 --- a/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java +++ b/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java @@ -245,10 +245,14 @@ public abstract class UnfilteredDeserializer // The next Unfiltered to return, computed by hasNext() private Unfiltered next; - // A temporary storage for an unfiltered that isn't returned next but should be looked at just afterwards - private Unfiltered saved; - private boolean isFirst = true; + // Saved position in the input after the next Unfiltered that will be consumed + private long nextConsumedPosition; + + // A temporary storage for an Unfiltered that isn't returned next but should be looked at just afterwards + private Stash stash; + + private boolean couldBeStartOfPartition = true; // The Unfiltered as read from the old format input private final UnfilteredIterator iterator; @@ -258,7 +262,15 @@ public abstract class UnfilteredDeserializer // Tracks the size of the last LegacyAtom read from disk, because this needs to be accounted // for when marking lastConsumedPosition after readNext/skipNext - private long bytesReadForNextAtom; + // Reading/skipping an Unfiltered consumes LegacyAtoms from the underlying legacy atom iterator + // e.g. hasNext() -> iterator.hasNext() -> iterator.readRow() -> atoms.next() + // The stop condition of the loop which groups legacy atoms into rows causes that AtomIterator + // to read in the first atom which doesn't belong in the row. So by that point, our position + // is actually past the end of the next Unfiltered. To compensate, we record the size of + // the last LegacyAtom read and subtract it from the current position when we calculate lastConsumedPosition. + // If we don't, then when reading an indexed block, we can over correct and may think that we've + // exhausted the block before we actually have. + private long bytesReadForNextAtom = 0L; private OldFormatDeserializer(CFMetaData metadata, DataInputPlus in, @@ -313,27 +325,55 @@ public abstract class UnfilteredDeserializer { while (next == null) { - if (saved == null && !iterator.hasNext()) - return false; - - next = saved == null ? iterator.next() : saved; - saved = null; - - // The sstable iterators assume that if there is one, the static row is the first thing this deserializer will return. - // However, in the old format, a range tombstone with an empty start would sort before any static cell. So we should - // detect that case and return the static parts first if necessary. - if (isFirst && iterator.hasNext() && isStatic(iterator.peek())) + if (null != stash) + { + next = stash.unfiltered; + nextConsumedPosition = stash.consumedPosition; + stash = null; + } + else { - saved = next; + if (!iterator.hasNext()) + return false; next = iterator.next(); + nextConsumedPosition = currentPosition() - bytesReadForNextAtom; + } + + /* + * The sstable iterators assume that if there is one, the static row is the first thing this deserializer will return. + * However, in the old format, a range tombstone with an empty start would sort before any static cell. So we should + * detect that case and return the static parts first if necessary. + */ + if (couldBeStartOfPartition && next.isRangeTombstoneMarker() && next.clustering().size() == 0 && iterator.hasNext()) + { + Unfiltered unfiltered = iterator.next(); + long consumedPosition = currentPosition() - bytesReadForNextAtom; + + stash = new Stash(unfiltered, consumedPosition); + + /* + * reorder next and stash (see the comment above that explains why), but retain their positions + * it's ok to do so since consumedPosition value is only used to determine if we have gone past + * the end of the index âblockâ; since the edge case requires that the first value be the âbottomâ + * RT bound (i.e. with no byte buffers), this has a small and well-defined size, and it must be + * the case that both unfiltered are in the same index âblockâ if we began at the beginning of it. + * if we don't do this, however, we risk aborting early and not returning the BOTTOM rt bound, + * if the static row is large enough to cross block boundaries. + */ + if (isStatic(unfiltered)) + { + stash.unfiltered = next; + next = unfiltered; + } } - isFirst = false; + couldBeStartOfPartition = false; // When reading old tables, we sometimes want to skip static data (due to how staticly defined column of compact // tables are handled). if (skipStatic && isStatic(next)) next = null; } + return true; } catch (IOError e) @@ -376,18 +416,17 @@ public abstract class UnfilteredDeserializer throw new IllegalStateException(); Unfiltered toReturn = next; next = null; - lastConsumedPosition = currentPosition() - bytesReadForNextAtom(); + lastConsumedPosition = nextConsumedPosition; return toReturn; } public void skipNext() throws IOException { - if (!hasNext()) - throw new UnsupportedOperationException(); - next = null; - lastConsumedPosition = currentPosition() - bytesReadForNextAtom(); + readNext(); } + // in case we had to reorder an empty RT bound with a static row, this won't be returning the precise unconsumed size, + // that corresponds to the last returned Unfiltered, but use the natural order in the sstable instead public long bytesReadForUnconsumedData() { if (!(in instanceof FileDataInput)) @@ -396,28 +435,26 @@ public abstract class UnfilteredDeserializer return currentPosition() - lastConsumedPosition; } - // Reading/skipping an Unfiltered consumes LegacyAtoms from the underlying legacy atom iterator - // e.g. hasNext() -> iterator.hasNext() -> iterator.readRow() -> atoms.next() - // The stop condition of the loop which groups legacy atoms into rows causes that AtomIterator - // to read in the first atom which doesn't belong in the row. So by that point, our position - // is actually past the end of the next Unfiltered. To compensate, we record the size of - // the last LegacyAtom read and subtract it from the current position when we calculate lastConsumedPosition. - // If we don't, then when reading an indexed block, we can over correct and may think that we've - // exhausted the block before we actually have. - private long bytesReadForNextAtom() - { - // If we've read anything at all then we will have recorded this in bytesReadForNextAtom, - // but being extra careful here just incase this method is called before any reads happen. - return iterator.atoms.next == null ? 0 : bytesReadForNextAtom; - } - public void clearState() { next = null; - saved = null; + stash = null; + couldBeStartOfPartition = true; iterator.clearState(); lastConsumedPosition = currentPosition(); - bytesReadForNextAtom = 0; + bytesReadForNextAtom = 0L; + } + + private static final class Stash + { + private Unfiltered unfiltered; + long consumedPosition; + + private Stash(Unfiltered unfiltered, long consumedPosition) + { + this.unfiltered = unfiltered; + this.consumedPosition = consumedPosition; + } } // Groups atoms from the input into proper Unfiltered. @@ -543,7 +580,7 @@ public abstract class UnfilteredDeserializer // Wraps the input of the deserializer to provide an iterator (and skip shadowed atoms). // Note: this could use guava AbstractIterator except that we want to be able to clear // the internal state of the iterator so it's cleaner to do it ourselves. - private class AtomIterator implements PeekingIterator<LegacyLayout.LegacyAtom> + private static class AtomIterator implements PeekingIterator<LegacyLayout.LegacyAtom> { private final Supplier<LegacyLayout.LegacyAtom> atomReader; private boolean isDone; http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db new file mode 100644 index 0000000..b5b5246 Binary files /dev/null and b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-CompressionInfo.db differ http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db new file mode 100644 index 0000000..18cf478 Binary files /dev/null and b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Data.db differ http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1 ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1 b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1 new file mode 100644 index 0000000..f37a2b3 --- /dev/null +++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Digest.sha1 @@ -0,0 +1 @@ +1576541413 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db new file mode 100644 index 0000000..7a31048 Binary files /dev/null and b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Filter.db differ http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db new file mode 100644 index 0000000..5e4995c Binary files /dev/null and b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Index.db differ http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db new file mode 100644 index 0000000..d4b0526 Binary files /dev/null and b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Statistics.db differ http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db new file mode 100644 index 0000000..38cc933 Binary files /dev/null and b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-Summary.db differ http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt ---------------------------------------------------------------------- diff --git a/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt new file mode 100644 index 0000000..db5ac46 --- /dev/null +++ b/test/data/legacy-sstables/ka/legacy_tables/legacy_ka_14766/legacy_tables-legacy_ka_14766-ka-1-TOC.txt @@ -0,0 +1,8 @@ +Data.db +TOC.txt +Digest.sha1 +Filter.db +Statistics.db +CompressionInfo.db +Summary.db +Index.db http://git-wip-us.apache.org/repos/asf/cassandra/blob/45937def/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java index ede4ab6..f10114b 100644 --- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java @@ -188,7 +188,32 @@ public class LegacySSTableTest } @Test - public void verifyOldSSTables() throws Exception + public void test14766() throws Exception + { + /* + * During upgrades from 2.1 to 3.0, reading from old sstables in reverse order could omit the very last row if the + * last indexed block had only two Unfiltered-s. See CASSANDRA-14766 for details. + * + * The sstable used here has two indexed blocks, with 2 cells/rows of ~500 bytes each, with column index interval of 1kb. + * Without the fix SELECT * returns 4 rows in ASC order, but only 3 rows in DESC order, omitting the last one. + */ + + QueryProcessor.executeInternal("CREATE TABLE legacy_tables.legacy_ka_14766 (pk int, ck int, value text, PRIMARY KEY (pk, ck));"); + loadLegacyTable("legacy_%s_14766%s", "ka", ""); + + UntypedResultSet rs; + + // read all rows in ASC order, expect all 4 to be returned + rs = QueryProcessor.executeInternal("SELECT * FROM legacy_tables.legacy_ka_14766 WHERE pk = 0 ORDER BY ck ASC;"); + Assert.assertEquals(4, rs.size()); + + // read all rows in DESC order, expect all 4 to be returned + rs = QueryProcessor.executeInternal("SELECT * FROM legacy_tables.legacy_ka_14766 WHERE pk = 0 ORDER BY ck DESC;"); + Assert.assertEquals(4, rs.size()); + } + + @Test + public void testVerifyOldSSTables() throws Exception { for (String legacyVersion : legacyVersions) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@cassandra.apache.org For additional commands, e-mail: commits-h...@cassandra.apache.org