Repository: cassandra Updated Branches: refs/heads/trunk e375a9b1b -> 2ca2fffee
Support EQ/PREFIX queries in SASI CONTAINS mode without tokenization patch by jrwest and xedin; reviewed by xedin for CASSANDRA-11434 Project: http://git-wip-us.apache.org/repos/asf/cassandra/repo Commit: http://git-wip-us.apache.org/repos/asf/cassandra/commit/2ca2fffe Tree: http://git-wip-us.apache.org/repos/asf/cassandra/tree/2ca2fffe Diff: http://git-wip-us.apache.org/repos/asf/cassandra/diff/2ca2fffe Branch: refs/heads/trunk Commit: 2ca2fffee80c75e3d1aa38badbefbd3b35851901 Parents: e375a9b Author: Jordan West <jorda...@gmail.com> Authored: Thu Mar 24 23:32:45 2016 -0700 Committer: Pavel Yaskevich <xe...@apache.org> Committed: Sun Apr 3 00:27:54 2016 -0700 ---------------------------------------------------------------------- CHANGES.txt | 1 + .../cassandra/index/sasi/SSTableIndex.java | 11 +++++ .../org/apache/cassandra/index/sasi/Term.java | 24 +++++++++-- .../cassandra/index/sasi/TermIterator.java | 9 ++++ .../cassandra/index/sasi/conf/ColumnIndex.java | 6 ++- .../cassandra/index/sasi/disk/Descriptor.java | 2 +- .../cassandra/index/sasi/disk/OnDiskIndex.java | 23 ++++++++-- .../index/sasi/disk/OnDiskIndexBuilder.java | 36 +++++++++++----- .../index/sasi/memory/TrieMemIndex.java | 1 + .../cassandra/index/sasi/plan/Expression.java | 10 +++++ .../cassandra/index/sasi/sa/IndexedTerm.java | 43 +++++++++++++++++++ .../cassandra/index/sasi/sa/IntegralSA.java | 5 ++- .../cassandra/index/sasi/sa/SuffixSA.java | 41 ++++++++++++------ .../cassandra/index/sasi/sa/TermIterator.java | 2 +- .../index/sasi/utils/CombinedTerm.java | 5 +++ .../index/sasi/utils/CombinedTermIterator.java | 5 ++- .../cassandra/index/sasi/SASIIndexTest.java | 45 ++++++++++++++++---- .../index/sasi/disk/OnDiskIndexTest.java | 9 +++- 18 files changed, 230 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index a01196f..e5ee3d8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 3.6 + * Support EQ/PREFIX queries in SASI CONTAINS mode without tokenization (CASSANDRA-11434) * Support LIKE operator in prepared statements (CASSANDRA-11456) * Add a command to see if a Materialized View has finished building (CASSANDRA-9967) * Log endpoint and port associated with streaming operation (CASSANDRA-8777) http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java b/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java index 7b65232..c67c39c 100644 --- a/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java +++ b/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java @@ -27,6 +27,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.index.sasi.conf.ColumnIndex; import org.apache.cassandra.index.sasi.disk.OnDiskIndex; +import org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder; import org.apache.cassandra.index.sasi.disk.Token; import org.apache.cassandra.index.sasi.plan.Expression; import org.apache.cassandra.index.sasi.utils.RangeIterator; @@ -67,6 +68,16 @@ public class SSTableIndex this.index = new OnDiskIndex(indexFile, validator, new DecoratedKeyFetcher(sstable)); } + public OnDiskIndexBuilder.Mode mode() + { + return index.mode(); + } + + public boolean hasMarkedPartials() + { + return index.hasMarkedPartials(); + } + public ByteBuffer minTerm() { return index.minTerm(); http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/Term.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/Term.java b/src/java/org/apache/cassandra/index/sasi/Term.java index 8e8ceb2..8f42d58 100644 --- a/src/java/org/apache/cassandra/index/sasi/Term.java +++ b/src/java/org/apache/cassandra/index/sasi/Term.java @@ -23,30 +23,41 @@ import org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder.TermSize; import org.apache.cassandra.index.sasi.utils.MappedBuffer; import org.apache.cassandra.db.marshal.AbstractType; +import static org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder.IS_PARTIAL_BIT; + public class Term { protected final MappedBuffer content; protected final TermSize termSize; + private final boolean hasMarkedPartials; - public Term(MappedBuffer content, TermSize size) + public Term(MappedBuffer content, TermSize size, boolean hasMarkedPartials) { this.content = content; this.termSize = size; + this.hasMarkedPartials = hasMarkedPartials; } public ByteBuffer getTerm() { long offset = termSize.isConstant() ? content.position() : content.position() + 2; - int length = termSize.isConstant() ? termSize.size : content.getShort(content.position()); + int length = termSize.isConstant() ? termSize.size : readLength(content.position()); return content.getPageRegion(offset, length); } + public boolean isPartial() + { + return !termSize.isConstant() + && hasMarkedPartials + && (content.getShort(content.position()) & (1 << IS_PARTIAL_BIT)) != 0; + } + public long getDataOffset() { long position = content.position(); - return position + (termSize.isConstant() ? termSize.size : 2 + content.getShort(position)); + return position + (termSize.isConstant() ? termSize.size : 2 + readLength(position)); } public int compareTo(AbstractType<?> comparator, ByteBuffer query) @@ -58,8 +69,13 @@ public class Term { long position = content.position(); int padding = termSize.isConstant() ? 0 : 2; - int len = termSize.isConstant() ? termSize.size : content.getShort(position); + int len = termSize.isConstant() ? termSize.size : readLength(position); return content.comparePageTo(position + padding, checkFully ? len : Math.min(len, query.remaining()), comparator, query); } + + private short readLength(long position) + { + return (short) (content.getShort(position) & ~(1 << IS_PARTIAL_BIT)); + } } http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/TermIterator.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/TermIterator.java b/src/java/org/apache/cassandra/index/sasi/TermIterator.java index d2db88c..5b08a56 100644 --- a/src/java/org/apache/cassandra/index/sasi/TermIterator.java +++ b/src/java/org/apache/cassandra/index/sasi/TermIterator.java @@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder; import org.apache.cassandra.index.sasi.disk.Token; import org.apache.cassandra.index.sasi.plan.Expression; import org.apache.cassandra.index.sasi.utils.RangeUnionIterator; @@ -101,6 +102,14 @@ public class TermIterator extends RangeIterator<Long, Token> for (final SSTableIndex index : perSSTableIndexes) { + if (e.getOp() == Expression.Op.PREFIX && + index.mode() == OnDiskIndexBuilder.Mode.CONTAINS && !index.hasMarkedPartials()) + throw new UnsupportedOperationException(String.format("The index %s has not yet been upgraded " + + "to support prefix queries in CONTAINS mode. " + + "Wait for compaction or rebuild the index.", + index.getPath())); + + if (!index.reference()) { latch.countDown(); http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java b/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java index 89129e7..8ee94d1 100644 --- a/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java +++ b/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java @@ -39,6 +39,7 @@ import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.index.sasi.analyzer.AbstractAnalyzer; import org.apache.cassandra.index.sasi.conf.view.View; +import org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder; import org.apache.cassandra.index.sasi.disk.Token; import org.apache.cassandra.index.sasi.memory.IndexMemtable; import org.apache.cassandra.index.sasi.plan.Expression; @@ -217,8 +218,9 @@ public class ColumnIndex Op operator = Op.valueOf(op); return !(isTokenized && operator == Op.EQ) // EQ is only applicable to non-tokenized indexes - && !(isLiteral() && operator == Op.RANGE) // RANGE only applicable to indexes non-literal indexes - && mode.supports(operator); // for all other cases let's refer to index itself + && !(isTokenized && mode.mode == OnDiskIndexBuilder.Mode.CONTAINS && operator == Op.PREFIX) // PREFIX not supported on tokenized CONTAINS mode indexes + && !(isLiteral() && operator == Op.RANGE) // RANGE only applicable to indexes non-literal indexes + && mode.supports(operator); // for all other cases let's refer to index itself } http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/disk/Descriptor.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/disk/Descriptor.java b/src/java/org/apache/cassandra/index/sasi/disk/Descriptor.java index a719f50..3aa6f14 100644 --- a/src/java/org/apache/cassandra/index/sasi/disk/Descriptor.java +++ b/src/java/org/apache/cassandra/index/sasi/disk/Descriptor.java @@ -18,7 +18,7 @@ package org.apache.cassandra.index.sasi.disk; /** - * Object descriptor for SSTableAttachedSecondaryIndex files. Similar to, and based upon, the sstable descriptor. + * Object descriptor for SASIIndex files. Similar to, and based upon, the sstable descriptor. */ public class Descriptor { http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java b/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java index ef487f1..80092ef 100644 --- a/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java +++ b/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndex.java @@ -104,6 +104,7 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable protected final AbstractType<?> comparator; protected final MappedBuffer indexFile; protected final long indexSize; + protected final boolean hasMarkedPartials; protected final Function<Long, DecoratedKey> keyFetcher; @@ -138,6 +139,7 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable maxKey = ByteBufferUtil.readWithShortLength(backingFile); mode = OnDiskIndexBuilder.Mode.mode(backingFile.readUTF()); + hasMarkedPartials = backingFile.readBoolean(); indexSize = backingFile.length(); indexFile = new MappedBuffer(new ChannelProxy(indexPath, backingFile.getChannel())); @@ -167,6 +169,16 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable } } + public boolean hasMarkedPartials() + { + return hasMarkedPartials; + } + + public OnDiskIndexBuilder.Mode mode() + { + return mode; + } + public ByteBuffer minTerm() { return minTerm; @@ -209,6 +221,9 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable { assert mode.supports(exp.getOp()); + if (exp.getOp() == Expression.Op.PREFIX && mode == OnDiskIndexBuilder.Mode.CONTAINS && !hasMarkedPartials) + throw new UnsupportedOperationException("prefix queries in CONTAINS mode are not supported by this index"); + // optimization in case single term is requested from index // we don't really need to build additional union iterator if (exp.getOp() == Op.EQ) @@ -602,7 +617,7 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable protected PointerTerm cast(MappedBuffer data) { - return new PointerTerm(data, termSize); + return new PointerTerm(data, termSize, hasMarkedPartials); } } @@ -612,7 +627,7 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable protected DataTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, TokenTree perBlockIndex) { - super(content, size); + super(content, size, hasMarkedPartials); this.perBlockIndex = perBlockIndex; } @@ -660,9 +675,9 @@ public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable protected static class PointerTerm extends Term { - public PointerTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size) + public PointerTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, boolean hasMarkedPartials) { - super(content, size); + super(content, size, hasMarkedPartials); } public int getBlock() http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndexBuilder.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndexBuilder.java b/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndexBuilder.java index 888dc63..8acbb05 100644 --- a/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndexBuilder.java +++ b/src/java/org/apache/cassandra/index/sasi/disk/OnDiskIndexBuilder.java @@ -24,6 +24,7 @@ import java.util.*; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.index.sasi.plan.Expression.Op; +import org.apache.cassandra.index.sasi.sa.IndexedTerm; import org.apache.cassandra.index.sasi.sa.IntegralSA; import org.apache.cassandra.index.sasi.sa.SA; import org.apache.cassandra.index.sasi.sa.TermIterator; @@ -51,7 +52,7 @@ public class OnDiskIndexBuilder public enum Mode { PREFIX(EnumSet.of(Op.EQ, Op.MATCH, Op.PREFIX, Op.NOT_EQ, Op.RANGE)), - CONTAINS(EnumSet.of(Op.MATCH, Op.CONTAINS, Op.SUFFIX, Op.NOT_EQ)), + CONTAINS(EnumSet.of(Op.EQ, Op.MATCH, Op.CONTAINS, Op.PREFIX, Op.SUFFIX, Op.NOT_EQ)), SPARSE(EnumSet.of(Op.EQ, Op.NOT_EQ, Op.RANGE)); Set<Op> supportedOps; @@ -128,6 +129,7 @@ public class OnDiskIndexBuilder public static final int BLOCK_SIZE = 4096; public static final int MAX_TERM_SIZE = 1024; public static final int SUPER_BLOCK_SIZE = 64; + public static final int IS_PARTIAL_BIT = 15; private final List<MutableLevel<InMemoryPointerTerm>> levels = new ArrayList<>(); private MutableLevel<InMemoryDataTerm> dataLevel; @@ -138,17 +140,24 @@ public class OnDiskIndexBuilder private final Map<ByteBuffer, TokenTreeBuilder> terms; private final Mode mode; + private final boolean marksPartials; private ByteBuffer minKey, maxKey; private long estimatedBytes; public OnDiskIndexBuilder(AbstractType<?> keyComparator, AbstractType<?> comparator, Mode mode) { + this(keyComparator, comparator, mode, true); + } + + public OnDiskIndexBuilder(AbstractType<?> keyComparator, AbstractType<?> comparator, Mode mode, boolean marksPartials) + { this.keyComparator = keyComparator; this.termComparator = comparator; this.terms = new HashMap<>(); this.termSize = TermSize.sizeOf(comparator); this.mode = mode; + this.marksPartials = marksPartials; } public OnDiskIndexBuilder add(ByteBuffer term, DecoratedKey key, long keyPosition) @@ -269,6 +278,7 @@ public class OnDiskIndexBuilder ByteBufferUtil.writeWithShortLength(range.right, out); out.writeUTF(mode.toString()); + out.writeBoolean(marksPartials); out.skipBytes((int) (BLOCK_SIZE - out.position())); @@ -276,7 +286,7 @@ public class OnDiskIndexBuilder : new MutableLevel<>(out, new MutableDataBlock(termComparator, mode)); while (terms.hasNext()) { - Pair<ByteBuffer, TokenTreeBuilder> term = terms.next(); + Pair<IndexedTerm, TokenTreeBuilder> term = terms.next(); addTerm(new InMemoryDataTerm(term.left, term.right), out); } @@ -333,24 +343,30 @@ public class OnDiskIndexBuilder private class InMemoryTerm { - protected final ByteBuffer term; + protected final IndexedTerm term; - public InMemoryTerm(ByteBuffer term) + public InMemoryTerm(IndexedTerm term) { this.term = term; } public int serializedSize() { - return (termSize.isConstant() ? 0 : 2) + term.remaining(); + return (termSize.isConstant() ? 0 : 2) + term.getBytes().remaining(); } public void serialize(DataOutputPlus out) throws IOException { if (termSize.isConstant()) - out.write(term); + { + out.write(term.getBytes()); + } else - ByteBufferUtil.writeWithShortLength(term, out); + { + out.writeShort(term.getBytes().remaining() | ((marksPartials && term.isPartial() ? 1 : 0) << IS_PARTIAL_BIT)); + out.write(term.getBytes()); + } + } } @@ -358,7 +374,7 @@ public class OnDiskIndexBuilder { protected final int blockCnt; - public InMemoryPointerTerm(ByteBuffer term, int blockCnt) + public InMemoryPointerTerm(IndexedTerm term, int blockCnt) { super(term); this.blockCnt = blockCnt; @@ -380,7 +396,7 @@ public class OnDiskIndexBuilder { private final TokenTreeBuilder keys; - public InMemoryDataTerm(ByteBuffer term, TokenTreeBuilder keys) + public InMemoryDataTerm(IndexedTerm term, TokenTreeBuilder keys) { super(term); this.keys = keys; @@ -577,7 +593,7 @@ public class OnDiskIndexBuilder { if (keys.getTokenCount() > MAX_KEYS_SPARSE) throw new IOException(String.format("Term - '%s' belongs to more than %d keys in %s mode, which is not allowed.", - comparator.getString(term.term), MAX_KEYS_SPARSE, mode.name())); + comparator.getString(term.term.getBytes()), MAX_KEYS_SPARSE, mode.name())); writeTerm(term, keys); } http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/memory/TrieMemIndex.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/memory/TrieMemIndex.java b/src/java/org/apache/cassandra/index/sasi/memory/TrieMemIndex.java index 7c72bb7..ca60ac5 100644 --- a/src/java/org/apache/cassandra/index/sasi/memory/TrieMemIndex.java +++ b/src/java/org/apache/cassandra/index/sasi/memory/TrieMemIndex.java @@ -228,6 +228,7 @@ public class TrieMemIndex extends MemIndex case SUFFIX: return trie.getValuesForKeysEndingWith(value); + case PREFIX: case CONTAINS: return trie.getValuesForKeysContaining(value); http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/plan/Expression.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/plan/Expression.java b/src/java/org/apache/cassandra/index/sasi/plan/Expression.java index f2fd02a..ce420d1 100644 --- a/src/java/org/apache/cassandra/index/sasi/plan/Expression.java +++ b/src/java/org/apache/cassandra/index/sasi/plan/Expression.java @@ -322,6 +322,9 @@ public class Expression if (!hasLower()) return true; + if (nonMatchingPartial(term)) + return false; + int cmp = term.compareTo(validator, lower.value, false); return cmp > 0 || cmp == 0 && lower.inclusive; } @@ -331,6 +334,9 @@ public class Expression if (!hasUpper()) return true; + if (nonMatchingPartial(term)) + return false; + int cmp = term.compareTo(validator, upper.value, false); return cmp < 0 || cmp == 0 && upper.inclusive; } @@ -379,6 +385,10 @@ public class Expression && exclusions.equals(o.exclusions); } + private boolean nonMatchingPartial(OnDiskIndex.DataTerm term) + { + return term.isPartial() && operation == Op.PREFIX; + } public static class Bound { http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/sa/IndexedTerm.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/sa/IndexedTerm.java b/src/java/org/apache/cassandra/index/sasi/sa/IndexedTerm.java new file mode 100644 index 0000000..8e27134 --- /dev/null +++ b/src/java/org/apache/cassandra/index/sasi/sa/IndexedTerm.java @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.index.sasi.sa; + +import java.nio.ByteBuffer; + +public class IndexedTerm +{ + private final ByteBuffer term; + private final boolean isPartial; + + public IndexedTerm(ByteBuffer term, boolean isPartial) + { + this.term = term; + this.isPartial = isPartial; + } + + public ByteBuffer getBytes() + { + return term; + } + + public boolean isPartial() + { + return isPartial; + } +} http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/sa/IntegralSA.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/sa/IntegralSA.java b/src/java/org/apache/cassandra/index/sasi/sa/IntegralSA.java index 8356585..e3d591f 100644 --- a/src/java/org/apache/cassandra/index/sasi/sa/IntegralSA.java +++ b/src/java/org/apache/cassandra/index/sasi/sa/IntegralSA.java @@ -22,6 +22,7 @@ import java.util.Collections; import java.util.Comparator; import java.util.Iterator; +import org.apache.cassandra.index.Index; import org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder; import org.apache.cassandra.index.sasi.disk.TokenTreeBuilder; import org.apache.cassandra.db.marshal.AbstractType; @@ -72,13 +73,13 @@ public class IntegralSA extends SA<ByteBuffer> return terms.get(terms.size() - 1).getTerm(); } - protected Pair<ByteBuffer, TokenTreeBuilder> computeNext() + protected Pair<IndexedTerm, TokenTreeBuilder> computeNext() { if (!termIterator.hasNext()) return endOfData(); Term<ByteBuffer> term = termIterator.next(); - return Pair.create(term.getTerm(), term.getTokens().finish()); + return Pair.create(new IndexedTerm(term.getTerm(), false), term.getTokens().finish()); } } } http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/sa/SuffixSA.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/sa/SuffixSA.java b/src/java/org/apache/cassandra/index/sasi/sa/SuffixSA.java index 592299e..59c50b4 100644 --- a/src/java/org/apache/cassandra/index/sasi/sa/SuffixSA.java +++ b/src/java/org/apache/cassandra/index/sasi/sa/SuffixSA.java @@ -48,10 +48,13 @@ public class SuffixSA extends SA<CharBuffer> private class SASuffixIterator extends TermIterator { + + private static final int COMPLETE_BIT = 31; + private final long[] suffixes; private int current = 0; - private ByteBuffer lastProcessedSuffix; + private IndexedTerm lastProcessedSuffix; private TokenTreeBuilder container; public SASuffixIterator() @@ -61,43 +64,55 @@ public class SuffixSA extends SA<CharBuffer> suffixes = new long[charCount]; long termIndex = -1, currentTermLength = -1; + boolean isComplete = false; for (int i = 0; i < charCount; i++) { if (i >= currentTermLength || currentTermLength == -1) { Term currentTerm = terms.get((int) ++termIndex); currentTermLength = currentTerm.getPosition() + currentTerm.length(); + isComplete = true; } suffixes[i] = (termIndex << 32) | i; + if (isComplete) + suffixes[i] |= (1L << COMPLETE_BIT); + + isComplete = false; } Primitive.sort(suffixes, (a, b) -> { Term aTerm = terms.get((int) (a >>> 32)); Term bTerm = terms.get((int) (b >>> 32)); - return comparator.compare(aTerm.getSuffix(((int) a) - aTerm.getPosition()), - bTerm.getSuffix(((int) b) - bTerm.getPosition())); + return comparator.compare(aTerm.getSuffix(clearCompleteBit(a) - aTerm.getPosition()), + bTerm.getSuffix(clearCompleteBit(b) - bTerm.getPosition())); }); } - private Pair<ByteBuffer, TokenTreeBuilder> suffixAt(int position) + private int clearCompleteBit(long value) + { + return (int) (value & ~(1L << COMPLETE_BIT)); + } + + private Pair<IndexedTerm, TokenTreeBuilder> suffixAt(int position) { long index = suffixes[position]; Term term = terms.get((int) (index >>> 32)); - return Pair.create(term.getSuffix(((int) index) - term.getPosition()), term.getTokens()); + boolean isPartitial = (index & ((long) 1 << 31)) == 0; + return Pair.create(new IndexedTerm(term.getSuffix(clearCompleteBit(index) - term.getPosition()), isPartitial), term.getTokens()); } public ByteBuffer minTerm() { - return suffixAt(0).left; + return suffixAt(0).left.getBytes(); } public ByteBuffer maxTerm() { - return suffixAt(suffixes.length - 1).left; + return suffixAt(suffixes.length - 1).left.getBytes(); } - protected Pair<ByteBuffer, TokenTreeBuilder> computeNext() + protected Pair<IndexedTerm, TokenTreeBuilder> computeNext() { while (true) { @@ -106,27 +121,27 @@ public class SuffixSA extends SA<CharBuffer> if (lastProcessedSuffix == null) return endOfData(); - Pair<ByteBuffer, TokenTreeBuilder> result = finishSuffix(); + Pair<IndexedTerm, TokenTreeBuilder> result = finishSuffix(); lastProcessedSuffix = null; return result; } - Pair<ByteBuffer, TokenTreeBuilder> suffix = suffixAt(current++); + Pair<IndexedTerm, TokenTreeBuilder> suffix = suffixAt(current++); if (lastProcessedSuffix == null) { lastProcessedSuffix = suffix.left; container = new DynamicTokenTreeBuilder(suffix.right); } - else if (comparator.compare(lastProcessedSuffix, suffix.left) == 0) + else if (comparator.compare(lastProcessedSuffix.getBytes(), suffix.left.getBytes()) == 0) { lastProcessedSuffix = suffix.left; container.add(suffix.right); } else { - Pair<ByteBuffer, TokenTreeBuilder> result = finishSuffix(); + Pair<IndexedTerm, TokenTreeBuilder> result = finishSuffix(); lastProcessedSuffix = suffix.left; container = new DynamicTokenTreeBuilder(suffix.right); @@ -136,7 +151,7 @@ public class SuffixSA extends SA<CharBuffer> } } - private Pair<ByteBuffer, TokenTreeBuilder> finishSuffix() + private Pair<IndexedTerm, TokenTreeBuilder> finishSuffix() { return Pair.create(lastProcessedSuffix, container.finish()); } http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/sa/TermIterator.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/sa/TermIterator.java b/src/java/org/apache/cassandra/index/sasi/sa/TermIterator.java index 916aa07..c8572a9 100644 --- a/src/java/org/apache/cassandra/index/sasi/sa/TermIterator.java +++ b/src/java/org/apache/cassandra/index/sasi/sa/TermIterator.java @@ -24,7 +24,7 @@ import org.apache.cassandra.utils.Pair; import com.google.common.collect.AbstractIterator; -public abstract class TermIterator extends AbstractIterator<Pair<ByteBuffer, TokenTreeBuilder>> +public abstract class TermIterator extends AbstractIterator<Pair<IndexedTerm, TokenTreeBuilder>> { public abstract ByteBuffer minTerm(); public abstract ByteBuffer maxTerm(); http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/utils/CombinedTerm.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/utils/CombinedTerm.java b/src/java/org/apache/cassandra/index/sasi/utils/CombinedTerm.java index ba7123a..81e535d 100644 --- a/src/java/org/apache/cassandra/index/sasi/utils/CombinedTerm.java +++ b/src/java/org/apache/cassandra/index/sasi/utils/CombinedTerm.java @@ -41,6 +41,11 @@ public class CombinedTerm implements CombinedValue<DataTerm> return term.getTerm(); } + public boolean isPartial() + { + return term.isPartial(); + } + public RangeIterator<Long, Token> getTokenIterator() { RangeIterator.Builder<Long, Token> union = RangeUnionIterator.builder(); http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/src/java/org/apache/cassandra/index/sasi/utils/CombinedTermIterator.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/cassandra/index/sasi/utils/CombinedTermIterator.java b/src/java/org/apache/cassandra/index/sasi/utils/CombinedTermIterator.java index c29de06..4b004e0 100644 --- a/src/java/org/apache/cassandra/index/sasi/utils/CombinedTermIterator.java +++ b/src/java/org/apache/cassandra/index/sasi/utils/CombinedTermIterator.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import org.apache.cassandra.index.sasi.disk.Descriptor; import org.apache.cassandra.index.sasi.disk.OnDiskIndex; import org.apache.cassandra.index.sasi.disk.TokenTreeBuilder; +import org.apache.cassandra.index.sasi.sa.IndexedTerm; import org.apache.cassandra.index.sasi.sa.TermIterator; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.utils.Pair; @@ -72,7 +73,7 @@ public class CombinedTermIterator extends TermIterator return max; } - protected Pair<ByteBuffer, TokenTreeBuilder> computeNext() + protected Pair<IndexedTerm, TokenTreeBuilder> computeNext() { if (!union.hasNext()) { @@ -81,7 +82,7 @@ public class CombinedTermIterator extends TermIterator else { CombinedTerm term = union.next(); - return Pair.create(term.getTerm(), term.getTokenTreeBuilder()); + return Pair.create(new IndexedTerm(term.getTerm(), term.isPartial()), term.getTokenTreeBuilder()); } } http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java index b3d641c..c44c48c 100644 --- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java @@ -1789,30 +1789,43 @@ public class SASIIndexTest String containsTable = "sasi_like_contains_test"; String prefixTable = "sasi_like_prefix_test"; String analyzedPrefixTable = "sasi_like_analyzed_prefix_test"; + String tokenizedContainsTable = "sasi_like_analyzed_contains_test"; QueryProcessor.executeOnceInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (k int primary key, v text);", KS_NAME, containsTable)); QueryProcessor.executeOnceInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (k int primary key, v text);", KS_NAME, prefixTable)); QueryProcessor.executeOnceInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (k int primary key, v text);", KS_NAME, analyzedPrefixTable)); + QueryProcessor.executeOnceInternal(String.format("CREATE TABLE IF NOT EXISTS %s.%s (k int primary key, v text);", KS_NAME, tokenizedContainsTable)); QueryProcessor.executeOnceInternal(String.format("CREATE CUSTOM INDEX IF NOT EXISTS ON %s.%s(v) " + - "USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = { 'mode' : 'CONTAINS' };", KS_NAME, containsTable)); + "USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = { 'mode' : 'CONTAINS', " + + "'analyzer_class': 'org.apache.cassandra.index.sasi.analyzer.NonTokenizingAnalyzer', " + + "'case_sensitive': 'false' };", + KS_NAME, containsTable)); QueryProcessor.executeOnceInternal(String.format("CREATE CUSTOM INDEX IF NOT EXISTS ON %s.%s(v) " + "USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = { 'mode' : 'PREFIX' };", KS_NAME, prefixTable)); QueryProcessor.executeOnceInternal(String.format("CREATE CUSTOM INDEX IF NOT EXISTS ON %s.%s(v) " + "USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = { 'mode' : 'PREFIX', 'analyzed': 'true' };", KS_NAME, analyzedPrefixTable)); - - testLIKEAndEQSemanticsWithDifferenceKindsOfIndexes(containsTable, prefixTable, analyzedPrefixTable, false); - testLIKEAndEQSemanticsWithDifferenceKindsOfIndexes(containsTable, prefixTable, analyzedPrefixTable, true); + QueryProcessor.executeOnceInternal(String.format("CREATE CUSTOM INDEX IF NOT EXISTS ON %s.%s(v) " + + "USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = " + + "{ 'mode' : 'CONTAINS', 'analyzer_class': 'org.apache.cassandra.index.sasi.analyzer.StandardAnalyzer'," + + "'analyzed': 'true', 'tokenization_enable_stemming': 'true', 'tokenization_normalize_lowercase': 'true', " + + "'tokenization_locale': 'en' };", + KS_NAME, tokenizedContainsTable)); + + testLIKEAndEQSemanticsWithDifferenceKindsOfIndexes(containsTable, prefixTable, analyzedPrefixTable, tokenizedContainsTable, false); + testLIKEAndEQSemanticsWithDifferenceKindsOfIndexes(containsTable, prefixTable, analyzedPrefixTable, tokenizedContainsTable, true); } private void testLIKEAndEQSemanticsWithDifferenceKindsOfIndexes(String containsTable, String prefixTable, String analyzedPrefixTable, + String tokenizedContainsTable, boolean forceFlush) { QueryProcessor.executeOnceInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?);", KS_NAME, containsTable), 0, "Pavel"); QueryProcessor.executeOnceInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?);", KS_NAME, prefixTable), 0, "Jean-Claude"); QueryProcessor.executeOnceInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?);", KS_NAME, analyzedPrefixTable), 0, "Jean-Claude"); + QueryProcessor.executeOnceInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?);", KS_NAME, tokenizedContainsTable), 0, "Pavel"); if (forceFlush) { @@ -1829,30 +1842,46 @@ public class SASIIndexTest Assert.assertNotNull(results); Assert.assertEquals(0, results.size()); + results = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v LIKE 'Pav%%';", KS_NAME, containsTable)); + Assert.assertNotNull(results); + Assert.assertEquals(1, results.size()); + results = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v LIKE 'Pavel';", KS_NAME, containsTable)); Assert.assertNotNull(results); Assert.assertEquals(1, results.size()); + results = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v = 'Pav';", KS_NAME, containsTable)); + Assert.assertNotNull(results); + Assert.assertEquals(0, results.size()); + + results = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v = 'Pavel';", KS_NAME, containsTable)); + Assert.assertNotNull(results); + Assert.assertEquals(1, results.size()); + try { - QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v = 'Pav';", KS_NAME, containsTable)); + QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v = 'Pav';", KS_NAME, tokenizedContainsTable)); Assert.fail(); } catch (InvalidRequestException e) { - // expected since CONTAINS indexes only support LIKE + // expected since CONTAINS + analyzed indexes only support LIKE } try { - QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v LIKE 'Pav%%';", KS_NAME, containsTable)); + QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v LIKE 'Pav%%';", KS_NAME, tokenizedContainsTable)); Assert.fail(); } catch (InvalidRequestException e) { - // expected since CONTAINS indexes only support LIKE '%<term>' and LIKE '%<term>%' + // expected since CONTAINS + analyzed only support LIKE } + QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v LIKE 'Pav%%';", KS_NAME, containsTable)); + Assert.assertNotNull(results); + Assert.assertEquals(1, results.size()); + results = QueryProcessor.executeOnceInternal(String.format("SELECT * FROM %s.%s WHERE v LIKE '%%Pav';", KS_NAME, containsTable)); Assert.assertNotNull(results); Assert.assertEquals(0, results.size()); http://git-wip-us.apache.org/repos/asf/cassandra/blob/2ca2fffe/test/unit/org/apache/cassandra/index/sasi/disk/OnDiskIndexTest.java ---------------------------------------------------------------------- diff --git a/test/unit/org/apache/cassandra/index/sasi/disk/OnDiskIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/disk/OnDiskIndexTest.java index 628bd36..bac23ea 100644 --- a/test/unit/org/apache/cassandra/index/sasi/disk/OnDiskIndexTest.java +++ b/test/unit/org/apache/cassandra/index/sasi/disk/OnDiskIndexTest.java @@ -65,6 +65,7 @@ public class OnDiskIndexTest put(UTF8Type.instance.decompose("foo"), keyBuilder(7L)); put(UTF8Type.instance.decompose("bar"), keyBuilder(9L, 10L)); put(UTF8Type.instance.decompose("michael"), keyBuilder(11L, 12L, 1L)); + put(UTF8Type.instance.decompose("am"), keyBuilder(15L)); }}; OnDiskIndexBuilder builder = new OnDiskIndexBuilder(UTF8Type.instance, UTF8Type.instance, OnDiskIndexBuilder.Mode.CONTAINS); @@ -100,6 +101,7 @@ public class OnDiskIndexTest Assert.assertEquals(convert(7), convert(onDisk.search(expressionFor("oo")))); Assert.assertEquals(convert(7), convert(onDisk.search(expressionFor("o")))); Assert.assertEquals(convert(1, 2, 3, 4, 6), convert(onDisk.search(expressionFor("t")))); + Assert.assertEquals(convert(1, 2, 11, 12), convert(onDisk.search(expressionFor("m", Operator.LIKE_PREFIX)))); Assert.assertEquals(Collections.<DecoratedKey>emptySet(), convert(onDisk.search(expressionFor("hello")))); @@ -851,7 +853,12 @@ public class OnDiskIndexTest private static Expression expressionFor(String term) { - return expressionFor(UTF8Type.instance, UTF8Type.instance.decompose(term)); + return expressionFor(term, Operator.LIKE_CONTAINS); + } + + private static Expression expressionFor(String term, Operator op) + { + return expressionFor(op, UTF8Type.instance, UTF8Type.instance.decompose(term)); } private static void addAll(OnDiskIndexBuilder builder, ByteBuffer term, TokenTreeBuilder tokens)