Taewoo Kim has submitted this change and it was merged. Change subject: Optimized the binary tokenizer - get the total number of tokens Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae Reviewed-on: https://asterix-gerrit.ics.uci.edu/348 Tested-by: Jenkins <[email protected]> Reviewed-by: Young-Seok Kim <[email protected]> ......................................................................
Optimized the binary tokenizer - get the total number of tokens Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae Reviewed-on: https://asterix-gerrit.ics.uci.edu/348 Tested-by: Jenkins <[email protected]> Reviewed-by: Young-Seok Kim <[email protected]> --- M hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java M hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java M hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java M hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java M hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java 5 files changed, 60 insertions(+), 36 deletions(-) Approvals: Young-Seok Kim: Looks good to me, approved Jenkins: Verified diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java index 231adbd..8ac1e3c 100644 --- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java +++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java @@ -31,8 +31,7 @@ import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer; import 
edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken; -public class BinaryTokenizerOperatorNodePushable extends - AbstractUnaryInputUnaryOutputOperatorNodePushable { +public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable { private final IHyracksTaskContext ctx; private final IBinaryTokenizer tokenizer; @@ -48,9 +47,8 @@ private GrowableArray builderData; private FrameTupleAppender appender; - public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx, - RecordDescriptor inputRecDesc, RecordDescriptor outputRecDesc, - IBinaryTokenizer tokenizer, int docField, int[] keyFields, + public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx, RecordDescriptor inputRecDesc, + RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int docField, int[] keyFields, boolean addNumTokensKey, boolean writeKeyFieldsFirst) { this.ctx = ctx; this.tokenizer = tokenizer; @@ -78,26 +76,16 @@ for (int i = 0; i < tupleCount; i++) { short numTokens = 0; - if (addNumTokensKey) { - // Run through the tokens to get the total number of tokens. - tokenizer.reset( - accessor.getBuffer().array(), - accessor.getTupleStartOffset(i) - + accessor.getFieldSlotsLength() - + accessor.getFieldStartOffset(i, docField), - accessor.getFieldLength(i, docField)); - while (tokenizer.hasNext()) { - tokenizer.next(); - numTokens++; - } - } tokenizer.reset( accessor.getBuffer().array(), - accessor.getTupleStartOffset(i) - + accessor.getFieldSlotsLength() - + accessor.getFieldStartOffset(i, docField), - accessor.getFieldLength(i, docField)); + accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength() + + accessor.getFieldStartOffset(i, docField), accessor.getFieldLength(i, docField)); + + if (addNumTokensKey) { + // Get the total number of tokens. + numTokens = tokenizer.getTokensCount(); + } // Write token and data into frame by following the order specified // in the writeKeyFieldsFirst field. 
@@ -151,8 +139,8 @@ } - FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(), - builder.getByteArray(), 0, builder.getSize()); + FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(), builder.getByteArray(), 0, + builder.getSize()); } diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java index af20ad2..5ac4aa4 100644 --- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java +++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java @@ -3,9 +3,9 @@ * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * you may obtain a copy of the License from - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -24,7 +24,10 @@ protected int length; protected int tokenLength; protected int index; + protected int originalIndex; protected int utf8Length; + protected boolean tokenCountCalculated = false; + protected short tokenCount; protected final IntArray tokensStart; protected final IntArray tokensLength; @@ -69,5 +72,10 @@ tokensStart.reset(); tokensLength.reset(); } + + // Needed for calculating the number of tokens + originalIndex = index; + tokenCountCalculated = false; + tokenCount = 0; } } diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java index daf853a..c4a6994 100644 --- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java +++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java @@ -3,9 +3,9 @@ * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * you may obtain a copy of the License from - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -74,4 +74,24 @@ // set token token.reset(data, currentTokenStart, index, tokenLength, tokenCount); } + + @Override + public short getTokensCount() { + if (!tokenCountCalculated) { + tokenCount = 0; + boolean previousCharIsSeparator = true; + while (originalIndex < length) { + if (isSeparator(UTF8StringPointable.charAt(data, originalIndex))) { + previousCharIsSeparator = true; + } else { + if (previousCharIsSeparator) { + tokenCount++; + previousCharIsSeparator = false; + } + } + originalIndex += UTF8StringPointable.charSize(data, originalIndex); + } + } + return tokenCount; + } } \ No newline at end of file diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java index 207df81..206175b 100644 --- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java +++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java @@ -3,9 +3,9 @@ * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * you may obtain a copy of the License from - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,11 +16,14 @@ package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers; public interface IBinaryTokenizer { - public IToken getToken(); + public IToken getToken(); - public boolean hasNext(); + public boolean hasNext(); - public void next(); + public void next(); - public void reset(byte[] data, int start, int length); + public void reset(byte[] data, int start, int length); + + // Get the total number of tokens + public short getTokensCount(); } diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java index b1d722e..d19da58 100644 --- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java +++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java @@ -3,9 +3,9 @@ * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * you may obtain a copy of the License from - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -114,4 +114,9 @@ public void setPrePost(boolean usePrePost) { this.usePrePost = usePrePost; } + + @Override + public short getTokensCount() { + return (short) totalGrams; + } } -- To view, visit https://asterix-gerrit.ics.uci.edu/348 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae Gerrit-PatchSet: 4 Gerrit-Project: hyracks Gerrit-Branch: master Gerrit-Owner: Taewoo Kim <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Taewoo Kim <[email protected]> Gerrit-Reviewer: Young-Seok Kim <[email protected]>
