This is an automated email from the ASF dual-hosted git repository. imaxon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/asterixdb.git
commit f85cee9e60c4a68bc2a7de87284c6b812794f76b Author: Rui Guo <ru...@uci.edu> AuthorDate: Mon Apr 13 10:04:19 2020 -0700 [NO ISSUE] Remove out-of-date tokenizer The string-based Tokenizer should be replaced with the array-based IBinaryTokenizer. The Tokenizer is not used in the codebase in a meaningful way, so let's remove it to make things clear. Change-Id: I483604bf2a5e20c18f6224ac2a153667828dabfb Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5763 Integration-Tests: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenk...@fulliautomatix.ics.uci.edu> Reviewed-by: Ian Maxon <ima...@uci.edu> --- .../asterix/fuzzyjoin/FuzzyJoinAppendLength.java | 58 --------- .../asterix/fuzzyjoin/FuzzyJoinTokenize.java | 133 --------------------- .../fuzzyjoin/similarity/SimilarityMetric.java | 4 - .../similarity/SimilarityMetricJaccard.java | 21 ---- .../fuzzyjoin/tokenizer/NGramTokenizer.java | 90 -------------- .../apache/asterix/fuzzyjoin/tokenizer/Token.java | 118 ------------------ .../asterix/fuzzyjoin/tokenizer/Tokenizer.java | 27 ----- .../fuzzyjoin/tokenizer/TokenizerBuffered.java | 30 ----- .../tokenizer/TokenizerBufferedFactory.java | 34 ------ .../fuzzyjoin/tokenizer/TokenizerFactory.java | 31 ----- .../asterix/fuzzyjoin/tokenizer/WordTokenizer.java | 68 ----------- .../fuzzyjoin/tokenizer/WordTokenizerBuffered.java | 92 -------------- .../fuzzyjoin/tokenorder/IntTokenCountRank.java | 28 ----- .../tokenorder/IntTokenCountRankFrequency.java | 58 --------- .../asterix/fuzzyjoin/tokenorder/IntTokenRank.java | 28 ----- .../tokenorder/IntTokenRankFrequency.java | 54 --------- .../asterix/fuzzyjoin/tokenorder/TokenLoad.java | 61 ---------- .../asterix/fuzzyjoin/tokenorder/TokenRank.java | 31 ----- .../tokenorder/TokenRankBufferedFrequency.java | 75 ------------ .../fuzzyjoin/tokenorder/TokenRankFrequency.java | 61 ---------- 20 files changed, 1102 deletions(-) diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java deleted file mode 100644 index 8be6f0c..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinAppendLength.java +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Collection; - -import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer; -import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory; - -public class FuzzyJoinAppendLength { - public static void main(String args[]) throws IOException { - final String inputFileName = args[0]; - final String outputFileName = args[1]; - - BufferedReader input = new BufferedReader(new FileReader(inputFileName)); - BufferedWriter output = new BufferedWriter(new FileWriter(outputFileName)); - - Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE, - FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR); - - int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3"); - - String line; - while ((line = input.readLine()) != null) { - String[] splits = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); - Collection<String> tokens = - tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR)); - output.write(splits[0] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[1] + FuzzyJoinConfig.RECORD_SEPARATOR - + splits[2] + FuzzyJoinConfig.RECORD_SEPARATOR + splits[3] + FuzzyJoinConfig.RECORD_SEPARATOR - + tokens.size() + "\n"); - } - - input.close(); - output.close(); - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java deleted file mode 100644 index 4c85f25..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/FuzzyJoinTokenize.java +++ /dev/null @@ -1,133 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin; - -import java.io.BufferedOutputStream; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; - -import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer; -import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory; -import org.apache.asterix.fuzzyjoin.tokenorder.TokenLoad; -import org.apache.asterix.fuzzyjoin.tokenorder.TokenRank; -import org.apache.asterix.fuzzyjoin.tokenorder.TokenRankFrequency; - -public class FuzzyJoinTokenize { - public static class TokenCount implements Comparable<Object> { - public String token; - public MutableInteger count; - - public TokenCount(String token, MutableInteger count) { - this.token = token; - this.count = count; - } - - @Override - public int compareTo(Object o) { - TokenCount tc = (TokenCount) o; - return count.compareTo(tc.count); - } - - public String getToken() { - return token; - } - - @Override - public String toString() { - return token + " " + count; - } - } - - public static void main(String args[]) throws IOException { - final String inputFileName = args[0]; - final String tokensFileName = args[1]; - final String tokenizedFileName = args[2]; - - BufferedReader input = new BufferedReader(new FileReader(inputFileName)); - - Tokenizer tokenizer = TokenizerFactory.getTokenizer(FuzzyJoinConfig.TOKENIZER_VALUE, - FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR); - - int[] dataColumns = FuzzyJoinUtil.getDataColumns("2,3"); - - String line; - HashMap<String, MutableInteger> tokenCount = new HashMap<String, MutableInteger>(); - while ((line = input.readLine()) != null) { - Collection<String> tokens = - tokenizer.tokenize(FuzzyJoinUtil.getData(line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX), - dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR)); - - for (String token : tokens) { - MutableInteger count = tokenCount.get(token); - if (count == null) { - tokenCount.put(token, new MutableInteger(1)); - } else { - count.inc(); - } - } - } - - input.close(); - - ArrayList<TokenCount> tokenCounts = new ArrayList<TokenCount>(); - tokenCount.forEach((key, value) -> tokenCounts.add(new TokenCount(key, value))); - Collections.sort(tokenCounts); - - BufferedWriter outputTokens = new BufferedWriter(new FileWriter(tokensFileName)); - for (TokenCount tc : tokenCounts) { - outputTokens.write(tc.getToken() + "\n"); - } - outputTokens.close(); - - TokenRank tokenRank = new TokenRankFrequency(); - TokenLoad tokenLoad = new TokenLoad(tokensFileName, tokenRank); - tokenLoad.loadTokenRank(); - - input = new BufferedReader(new FileReader(inputFileName)); - LittleEndianIntOutputStream outputTokenized = - new LittleEndianIntOutputStream(new BufferedOutputStream(new FileOutputStream(tokenizedFileName))); - while ((line = input.readLine()) != null) { - String splits[] = line.split(FuzzyJoinConfig.RECORD_SEPARATOR_REGEX); - int rid = Integer.parseInt(splits[FuzzyJoinConfig.RECORD_KEY]); - outputTokenized.writeInt(rid); - Collection<String> tokens = - tokenizer.tokenize(FuzzyJoinUtil.getData(splits, dataColumns, FuzzyJoinConfig.TOKEN_SEPARATOR)); - Collection<Integer> tokensRanked = tokenRank.getTokenRanks(tokens); - outputTokenized.writeInt(tokensRanked.size()); - for (Integer token : tokensRanked) { - outputTokenized.writeInt(token); - } - // for (int i = 0; i < tokens.size() - tokensRanked.size(); i++) { - // outputTokenized.writeInt(Integer.MAX_VALUE); - // } - } - - input.close(); - outputTokenized.close(); - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java index 3348d4c..1133246 100644 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java @@ -19,7 +19,6 @@ package org.apache.asterix.fuzzyjoin.similarity; -import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.data.std.util.ISequenceIterator; @@ -118,7 +117,4 @@ public abstract class SimilarityMetric { public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY); - public abstract float getSimilarity(int[] tokensX, int[] tokensY); - - public abstract float getSimilarity(String stringX, String stringY, Tokenizer tokenizer); } diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java index 63d3077..f72400f 100644 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java +++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java @@ -22,7 +22,6 @@ package org.apache.asterix.fuzzyjoin.similarity; import java.util.Set; import java.util.TreeSet; -import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer; import org.apache.hyracks.api.exceptions.HyracksDataException; import org.apache.hyracks.data.std.util.ISequenceIterator; @@ -82,24 +81,4 @@ public class SimilarityMetricJaccard extends SimilarityMetric implements IGeneri return (float) intersectionSize / (totalSize - intersectionSize); } - @Override - public float getSimilarity(int[] tokensX, int[] tokensY) { - return getSimilarity(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length); - } - - @Override - public float getSimilarity(String stringX, String stringY, Tokenizer tokenizer) { - Set<String> setX = new TreeSet<String>(); - for (String token : tokenizer.tokenize(stringX)) { - setX.add(token); - } - Set<String> setY = new TreeSet<String>(); - for (String token : tokenizer.tokenize(stringY)) { - setY.add(token); - } - int lengthX = setX.size(); - int lengthY = setY.size(); - setX.retainAll(setY); - return ((float) setX.size()) / (lengthX + lengthY - setX.size()); - } } diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java deleted file mode 100644 index 5594e43..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/NGramTokenizer.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -public class NGramTokenizer implements Tokenizer { - - /** - * - */ - private static final long serialVersionUID = 1L; - - public static void main(String args[]) { - Tokenizer tokenizer = new NGramTokenizer(); - String a = "hadoopoop"; - System.out.println(a + ":" + tokenizer.tokenize(a)); - } - - private final int gramLength; - - /** - * padding used in q gram calculation. - */ - private final char QGRAMENDPADDING = '$'; - - /** - * padding used in q gram calculation. - */ - private final char QGRAMSTARTPADDING = '$'; - - public NGramTokenizer() { - gramLength = 3; - } - - public NGramTokenizer(int gramLength) { - this.gramLength = gramLength; - } - - private StringBuffer getAdjustedString(String input) { - final StringBuffer adjustedString = new StringBuffer(); - for (int i = 0; i < gramLength - 1; i++) { - adjustedString.append(QGRAMSTARTPADDING); - } - adjustedString.append(input); - for (int i = 0; i < gramLength - 1; i++) { - adjustedString.append(QGRAMENDPADDING); - } - return adjustedString; - } - - public List<String> tokenize(String input) { - final ArrayList<String> returnVect = new ArrayList<String>(); - final StringBuffer adjustedString = getAdjustedString(input); - int curPos = 0; - final int length = adjustedString.length() - (gramLength - 1); - final HashMap<String, Integer> grams = new HashMap<String, Integer>(); - while (curPos < length) { - final String term = adjustedString.substring(curPos, curPos + gramLength); - Integer count = grams.get(term); - if (count == null) { - count = new Integer(0); - } - count++; - grams.put(term, count); - returnVect.add(term + count); - curPos++; - } - return returnVect; - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java deleted file mode 100644 index 720d269..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Token.java +++ /dev/null @@ -1,118 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.Serializable; - -public class Token implements Serializable { - /** - * - */ - private static final long serialVersionUID = 1L; - - private CharSequence data; - private int start; - private int length; - private int count; - - /** Cache the hash code for the string */ - private int hash; // Default to 0 - - public Token() { - } - - public Token(CharSequence data, int start, int length, int count) { - set(data, start, length, count); - } - - @Override - public boolean equals(Object o) { - if (o == null) { - return false; - } - if (!(o instanceof Token)) { - return false; - } - Token t = (Token) o; - if (t.length != length) { - return false; - } - for (int i = 0; i < length; i++) { - if (t.data.charAt(t.start + i) != data.charAt(start + i)) { - return false; - } - } - return true; - } - - public CharSequence getCharSequence() { - return data; - } - - public int getCount() { - return count; - } - - public int getLength() { - return length; - } - - public int getStart() { - return start; - } - - @Override - public int hashCode() { - int h = hash; - if (h == 0 && length > 0) { - for (int i = 0; i < length; i++) { - h = 31 * h + data.charAt(start + i); - } - h = 31 * h + count; - hash = h; - } - return h; - } - - public int length() { - return length; - } - - public void set(CharSequence data, int start, int length, int count) { - this.data = data; - this.start = start; - this.length = length; - this.count = count; - hash = 0; - } - - public void set(String data, int count) { - this.data = data; - start = 0; - length = data.length(); - this.count = count; - hash = 0; - } - - @Override - public String toString() { - return "(" + data.subSequence(start, start + length) + ", " + count + ")"; - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java deleted file mode 100644 index 71078d5..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/Tokenizer.java +++ /dev/null @@ -1,27 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.io.Serializable; -import java.util.List; - -public interface Tokenizer extends Serializable { - public List<String> tokenize(String text); -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java deleted file mode 100644 index 19fcf18..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBuffered.java +++ /dev/null @@ -1,30 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public interface TokenizerBuffered { - public void advance(); - - public boolean end(); - - public Token getToken(); - - public void reset(); -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java deleted file mode 100644 index 2f4e8c6..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerBufferedFactory.java +++ /dev/null @@ -1,34 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class TokenizerBufferedFactory { - public static TokenizerBuffered getTokenizer(String tokenizer, StringBuilder buffer) { - if (tokenizer.equals("Word")) { - return new WordTokenizerBuffered(buffer); - } - throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\"."); - } - - public static boolean isSeparator(char c) { - return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER - || Character.getType(c) == Character.OTHER_NUMBER); - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java deleted file mode 100644 index 9b1856a..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/TokenizerFactory.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -public class TokenizerFactory { - public static Tokenizer getTokenizer(String tokenizer, String wordSeparator, char tokenSeparator) { - if (tokenizer.equals("NGram")) { - return new NGramTokenizer(); - } else if (tokenizer.equals("Word")) { - return new WordTokenizer(wordSeparator, tokenSeparator); - } - throw new RuntimeException("Unknown tokenizer \"" + tokenizer + "\"."); - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java deleted file mode 100644 index fa0bfe7..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizer.java +++ /dev/null @@ -1,68 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; - -public class WordTokenizer implements Tokenizer { - - /** - * - */ - private static final long serialVersionUID = 1L; - - public static void main(String args[]) { - Tokenizer tokenizer = new WordTokenizer("_", '_'); - String a = "hadoop_rocks_in_java"; - System.out.println(a + ":" + tokenizer.tokenize(a)); - } - - private final String wordSeparator; - private final char tokenSeparator; - - public WordTokenizer() { - this(" ", '_'); - } - - public WordTokenizer(String wordSeparator, char tokenSeparator) { - this.wordSeparator = wordSeparator; - this.tokenSeparator = tokenSeparator; - } - - public List<String> tokenize(String input) { - final ArrayList<String> returnVect = new ArrayList<String>(); - final HashMap<String, Integer> tokens = new HashMap<String, Integer>(); - for (String term : input.split(wordSeparator)) { - if (term.length() == 0) { - continue; - } - Integer count = tokens.get(term); - if (count == null) { - count = 0; - } - count++; - tokens.put(term, count); - returnVect.add(term + tokenSeparator + count); - } - return returnVect; - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java deleted file mode 100644 index 29206f9..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenizer/WordTokenizerBuffered.java +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenizer; - -import org.apache.asterix.fuzzyjoin.IntArray; - -public class WordTokenizerBuffered implements TokenizerBuffered { - - private final StringBuilder buffer; - private int index; - private final Token token; - - private final IntArray tokensStart, tokensLength; - - public WordTokenizerBuffered(StringBuilder buffer) { - this.buffer = buffer; - token = new Token(); - tokensStart = new IntArray(); - tokensLength = new IntArray(); - reset(); - } - - @Override - public void advance() { - while (index < buffer.length() && TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) { - index++; - } - int start = index; - while (index < buffer.length() && !TokenizerBufferedFactory.isSeparator(buffer.charAt(index))) { - buffer.setCharAt(index, Character.toLowerCase(buffer.charAt(index))); - index++; - } - int length = index - start; - int count = 1; - if (length > 0) { - // search if we got the same token before - for (int i = 0; i < tokensStart.length(); ++i) { - if (length == tokensLength.get(i)) { - int tokenStart = tokensStart.get(i); - count++; // assume we found it - for (int j = 0; j < length; ++j) { - if (buffer.charAt(start + j) != buffer.charAt(tokenStart + j)) { - count--; // token not found - break; - } - } - } - } - // add the new token to the list of seen tokens - tokensStart.add(start); - tokensLength.add(length); - } - // set token - token.set(buffer, start, length, count); - } - - @Override - public boolean end() { - return token.length() <= 0; - } - - @Override - public Token getToken() { - return token; - } - - @Override - public void reset() { - index = 0; - tokensStart.reset(); - tokensLength.reset(); - advance(); - } - -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java deleted file mode 100644 index 90f8c6a..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRank.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.io.Serializable; - -public interface IntTokenCountRank extends Serializable { - public int add(int token, int count); - - public int getRank(int token, int count); -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java deleted file mode 100644 index d54c7d6..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenCountRankFrequency.java +++ /dev/null @@ -1,58 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.util.HashMap; - -import org.apache.asterix.fuzzyjoin.IntPair; - -public class IntTokenCountRankFrequency implements IntTokenCountRank { - /** - * - */ - private static final long serialVersionUID = 1L; - - private final HashMap<IntPair, Integer> ranksMap = new HashMap<IntPair, Integer>(); - private final IntPair tmpPair = new IntPair(); - private int crtRank = 0; - - @Override - public int add(int token, int count) { - int prevRank = crtRank; - ranksMap.put(new IntPair(token, count), prevRank); - crtRank++; - return prevRank; - } - - @Override - public int getRank(int token, int count) { - tmpPair.set(token, count); - Integer rank = ranksMap.get(tmpPair); - if (rank == null) { - return -1; - } - return rank; - } - - @Override - public String toString() { - return "[" + crtRank + ",\n " + ranksMap + "\n]"; - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java deleted file mode 100644 index b8e2082..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRank.java +++ /dev/null @@ -1,28 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.io.Serializable; - -public interface IntTokenRank extends Serializable { - public int add(int token); - - public int getRank(int token); -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java deleted file mode 100644 index 08d1c93..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/IntTokenRankFrequency.java +++ /dev/null @@ -1,54 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.util.HashMap; - -public class IntTokenRankFrequency implements IntTokenRank { - /** - * - */ - private static final long serialVersionUID = 1L; - - private final HashMap<Integer, Integer> ranksMap = new HashMap<Integer, Integer>(); - private int crtRank = 0; - - @Override - public int add(int token) { - int prevRank = crtRank; - ranksMap.put(token, prevRank); - crtRank++; - return prevRank; - } - - @Override - public int getRank(int token) { - Integer rank = ranksMap.get(token); - if (rank == null) { - return -1; - } - return rank; - } - - @Override - public String toString() { - return "[" + crtRank + ",\n " + ranksMap + "\n]"; - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java deleted file mode 100644 index 3578d94..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenLoad.java +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.Serializable; - -import org.apache.asterix.fuzzyjoin.FuzzyJoinConfig; - -public class TokenLoad implements Serializable { - private final String path; - private final TokenRank rank; - - public TokenLoad(String path, TokenRank rank) { - this.path = path; - this.rank = rank; - } - - public void loadTokenRank() { - loadTokenRank(1); - } - - public void loadTokenRank(int factor) { - try (BufferedReader fis = new BufferedReader( - // new FileReader(path.toString()) - new InputStreamReader(new FileInputStream(path), "UTF-8"))) { - String token = null; - while ((token = fis.readLine()) != null) { - rank.add(token); - // only used when increasing the token dictionary - for (int i = 1; i < factor; i++) { - // remove _COUNT at the end of the token (it is removed in - // the new records anyway) - rank.add(token.split(FuzzyJoinConfig.TOKEN_SEPARATOR_REGEX)[0] + i); - } - } - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java deleted file mode 100644 index 42cdfa7..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRank.java +++ /dev/null @@ -1,31 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.io.Serializable; -import java.util.Collection; - -public interface TokenRank extends Serializable { - public int add(String token); - - public Integer getRank(String token); - - public Collection<Integer> getTokenRanks(Iterable<String> tokens); -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java deleted file mode 100644 index 57fc325..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankBufferedFrequency.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.util.Collection; -import java.util.HashMap; - -import org.apache.asterix.fuzzyjoin.tokenizer.Token; - -public class TokenRankBufferedFrequency implements TokenRank { - /** - * - */ - private static final long serialVersionUID = 1L; - - private final HashMap<Token, Integer> ranksMap = new HashMap<Token, Integer>(); - private int crtRank = 0; - - public int add(String stringWithCount) { - int end = stringWithCount.lastIndexOf('_'); - int count = 0; - for (int i = end + 1; i < stringWithCount.length(); ++i) { - count = count * 10 + (stringWithCount.charAt(i) - '0'); - } - return add(stringWithCount.substring(0, end), count); - } - - public int add(String string, int count) { - Token token = new Token(string, 0, string.length(), count); - return add(token); - } - - public int add(Token token) { - int prevRank = crtRank; - ranksMap.put(token, prevRank); - crtRank++; - return prevRank; - } - - @Override - public Integer getRank(String token) { - throw new UnsupportedOperationException(); - } - - public Integer getRank(Token token) { - return ranksMap.get(token); - } - - @Override - public Collection<Integer> getTokenRanks(Iterable<String> tokens) { - throw new UnsupportedOperationException(); - } - - @Override - public String toString() { - return "[" + crtRank + ",\n " + ranksMap + "\n]"; - } -} diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java deleted file mode 100644 index 97b9503..0000000 --- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/tokenorder/TokenRankFrequency.java +++ /dev/null @@ -1,61 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.asterix.fuzzyjoin.tokenorder; - -import java.util.Collection; -import java.util.HashMap; -import java.util.TreeSet; - -public class TokenRankFrequency implements TokenRank { - /** - * - */ - private static final long serialVersionUID = 1L; - - private final HashMap<String, Integer> ranksMap = new HashMap<String, Integer>(); - private int crtRank = 0; - - public int add(String token) { - int prevRank = crtRank; - ranksMap.put(token, prevRank); - crtRank++; - return prevRank; - } - - public Integer getRank(String token) { - return ranksMap.get(token); - } - - public Collection<Integer> getTokenRanks(Iterable<String> tokens) { - TreeSet<Integer> ranksCol = new TreeSet<Integer>(); - for (String token : tokens) { - Integer rank = getRank(token); - if (rank != null) { - ranksCol.add(rank); - } - } - return ranksCol; - } - - @Override - public String toString() { - return "[" + crtRank + ",\n " + ranksMap + "\n]"; - } -}