Taewoo Kim has uploaded a new change for review.
https://asterix-gerrit.ics.uci.edu/348
Change subject: Optimized the binary tokenizer - get the total number of tokens
Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae
......................................................................
Optimized the binary tokenizer - get the total number of tokens
Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae
---
M
hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
M
hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
M
hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
M
hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
M
hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
5 files changed, 64 insertions(+), 36 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/hyracks refs/changes/48/348/1
diff --git
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 231adbd..bdd36ad 100644
---
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -31,8 +31,7 @@
import
edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
-public class BinaryTokenizerOperatorNodePushable extends
- AbstractUnaryInputUnaryOutputOperatorNodePushable {
+public class BinaryTokenizerOperatorNodePushable extends
AbstractUnaryInputUnaryOutputOperatorNodePushable {
private final IHyracksTaskContext ctx;
private final IBinaryTokenizer tokenizer;
@@ -48,9 +47,11 @@
private GrowableArray builderData;
private FrameTupleAppender appender;
- public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx,
- RecordDescriptor inputRecDesc, RecordDescriptor outputRecDesc,
- IBinaryTokenizer tokenizer, int docField, int[] keyFields,
+ private long nextTime = 0;
+ private long calcTime = 0;
+
+ public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx,
RecordDescriptor inputRecDesc,
+ RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int
docField, int[] keyFields,
boolean addNumTokensKey, boolean writeKeyFieldsFirst) {
this.ctx = ctx;
this.tokenizer = tokenizer;
@@ -78,26 +79,16 @@
for (int i = 0; i < tupleCount; i++) {
short numTokens = 0;
- if (addNumTokensKey) {
- // Run through the tokens to get the total number of tokens.
- tokenizer.reset(
- accessor.getBuffer().array(),
- accessor.getTupleStartOffset(i)
- + accessor.getFieldSlotsLength()
- + accessor.getFieldStartOffset(i, docField),
- accessor.getFieldLength(i, docField));
- while (tokenizer.hasNext()) {
- tokenizer.next();
- numTokens++;
- }
- }
tokenizer.reset(
accessor.getBuffer().array(),
- accessor.getTupleStartOffset(i)
- + accessor.getFieldSlotsLength()
- + accessor.getFieldStartOffset(i, docField),
- accessor.getFieldLength(i, docField));
+ accessor.getTupleStartOffset(i) +
accessor.getFieldSlotsLength()
+ + accessor.getFieldStartOffset(i, docField),
accessor.getFieldLength(i, docField));
+
+ if (addNumTokensKey) {
+ // Get the total number of tokens.
+ numTokens = tokenizer.getTokensCount();
+ }
// Write token and data into frame by following the order specified
// in the writeKeyFieldsFirst field.
@@ -151,13 +142,14 @@
}
- FrameUtils.appendToWriter(writer, appender,
builder.getFieldEndOffsets(),
- builder.getByteArray(), 0, builder.getSize());
+ FrameUtils.appendToWriter(writer, appender,
builder.getFieldEndOffsets(), builder.getByteArray(), 0,
+ builder.getSize());
}
}
+ System.out.println("BinaryTokenizerOperatorNodePushable- nextTime:" +
nextTime + " calcTime:" + calcTime);
}
@Override
diff --git
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
index af20ad2..5ac4aa4 100644
---
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
+++
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,10 @@
protected int length;
protected int tokenLength;
protected int index;
+ protected int originalIndex;
protected int utf8Length;
+ protected boolean tokenCountCalculated = false;
+ protected short tokenCount;
protected final IntArray tokensStart;
protected final IntArray tokensLength;
@@ -69,5 +72,10 @@
tokensStart.reset();
tokensLength.reset();
}
+
+ // Needed for calculating the number of tokens
+ originalIndex = index;
+ tokenCountCalculated = false;
+ tokenCount = 0;
}
}
diff --git
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index daf853a..b0bccf8 100644
---
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -74,4 +74,24 @@
// set token
token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
}
+
+    /** Counts the delimiter-separated tokens once per reset(); repeat calls return the cached count. */
+    @Override
+    public short getTokensCount() {
+        if (!tokenCountCalculated) {
+            tokenCount = 0;
+            boolean insideToken = false;
+            while (originalIndex < length) {
+                if (isSeparator(UTF8StringPointable.charAt(data, originalIndex))) {
+                    insideToken = false;
+                } else if (!insideToken) {
+                    tokenCount++; // first character of a new run of non-separators
+                    insideToken = true;
+                }
+                originalIndex += UTF8StringPointable.charSize(data, originalIndex);
+            }
+            tokenCountCalculated = true; // the scan consumed originalIndex; without this a second call would rescan nothing and return 0
+        }
+        return tokenCount;
+    }
}
\ No newline at end of file
diff --git
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
index 207df81..206175b 100644
---
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
+++
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,11 +16,14 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
public interface IBinaryTokenizer {
- public IToken getToken();
+ public IToken getToken();
- public boolean hasNext();
+ public boolean hasNext();
- public void next();
+ public void next();
- public void reset(byte[] data, int start, int length);
+ public void reset(byte[] data, int start, int length);
+
+ // Get the total number of tokens as a short (presumably for the input given to the most recent reset() - confirm with implementations)
+ public short getTokensCount();
}
diff --git
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index b1d722e..d19da58 100644
---
a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++
b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -114,4 +114,9 @@
public void setPrePost(boolean usePrePost) {
this.usePrePost = usePrePost;
}
+
+ @Override
+ public short getTokensCount() {
+ return (short) totalGrams; // NOTE(review): narrowing cast - if totalGrams is wider than short and exceeds Short.MAX_VALUE the value wraps silently; confirm the bound
+ }
}
--
To view, visit https://asterix-gerrit.ics.uci.edu/348
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae
Gerrit-PatchSet: 1
Gerrit-Project: hyracks
Gerrit-Branch: master
Gerrit-Owner: Taewoo Kim <[email protected]>