This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch 2.x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5925bcb58e5ab0aeba28855f5aa8ab8bddfa4a2d Author: tballison <[email protected]> AuthorDate: Mon Feb 27 20:00:00 2017 -0500 TIKA-2279 - simplify token counting --- .../org/apache/tika/eval/AbstractProfiler.java | 10 +++--- .../eval/tokens/AlphaIdeographFilterFactory.java | 30 +++++++++-------- .../tika/eval/tokens/AnalyzerDeserializer.java | 15 +++++---- .../apache/tika/eval/tokens/AnalyzerManager.java | 19 ++--------- .../tika/eval/tokens/CommonTokenCountManager.java | 14 ++++++-- .../apache/tika/eval/tokens/CommonTokenResult.java | 21 +++++++++--- .../org/apache/tika/eval/tokens/TokenCounter.java | 24 +------------ tika-eval/src/main/resources/lucene-analyzers.json | 39 ++++++++-------------- .../org/apache/tika/eval/AnalyzerManagerTest.java | 32 +++++++++++++++--- .../tika/eval/tokens/LuceneTokenCounter.java | 8 ++--- .../apache/tika/eval/tokens/TokenCounterTest.java | 19 ++++------- 11 files changed, 107 insertions(+), 124 deletions(-) diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index f792e77..0e89849 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -169,8 +169,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { langIder = new LanguageIDWrapper(); try { analyzerManager = AnalyzerManager.newInstance(); - tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), - analyzerManager.getAlphaIdeoAnalyzer()); + tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer()); } catch (IOException e) { throw new RuntimeException(e); } @@ -317,20 +316,19 @@ public abstract class AbstractProfiler extends FileResourceConsumer { CommonTokenResult commonTokenResult = null; try { commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, - tokenCounter.getAlphaTokens(fieldName)); + tokenCounter.getTokens(fieldName)); } catch (IOException e) { logger.error(e.getMessage(), e); } data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode()); - data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getTokens())); + data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens())); TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName); - TokenStatistics alphaTokenStatistics = tokenCounter.getAlphaTokenStatistics(fieldName); data.put(Cols.NUM_UNIQUE_TOKENS, Integer.toString(tokenStatistics.getTotalUniqueTokens())); data.put(Cols.NUM_TOKENS, Integer.toString(tokenStatistics.getTotalTokens())); data.put(Cols.NUM_ALPHABETIC_TOKENS, - Integer.toString(alphaTokenStatistics.getTotalTokens())); + Integer.toString(commonTokenResult.getAlphabeticTokens())); data.put(Cols.TOKEN_ENTROPY_RATE, Double.toString(tokenStatistics.getEntropy())); diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java index 2c046ad..771322b 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java @@ -53,22 +53,26 @@ public class AlphaIdeographFilterFactory extends TokenFilterFactory { @Override protected boolean accept() throws IOException { - char[] buff = termAtt.buffer(); - for (int i = 0; i < termAtt.length(); i++) { - int cp = buff[i]; - if (Character.isHighSurrogate(buff[i])) { - if (i < termAtt.length()-1) { - cp = Character.toCodePoint(buff[i], buff[i + 1]); - i++; - } - } + return isAlphabetic(termAtt.buffer()); + } + } - if (Character.isAlphabetic(cp) || - Character.isIdeographic(cp)) { - return true; + public static boolean isAlphabetic(char[] token) { + for (int i = 0; i < token.length; i++) { + int cp = token[i]; + if (Character.isHighSurrogate(token[i])) { + if (i < token.length-1) { + cp = Character.toCodePoint(token[i], token[i + 1]); + i++; } } - return false; + + if (Character.isAlphabetic(cp) || + Character.isIdeographic(cp)) { + return true; + } } + return false; } + } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java index cfc2d4f..83ca557 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java @@ -44,12 +44,13 @@ import org.apache.lucene.analysis.util.TokenizerFactory; class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> { - private static String ANALYZERS = "analyzers"; - private static String CHAR_FILTERS = "charfilters"; - private static String TOKEN_FILTERS = "tokenfilters"; - private static String TOKENIZER = "tokenizer"; - private static String FACTORY = "factory"; - private static String PARAMS = "params"; + private static final String ANALYZERS = "analyzers"; + private static final String CHAR_FILTERS = "charfilters"; + private static final String TOKEN_FILTERS = "tokenfilters"; + private static final String TOKENIZER = "tokenizer"; + private static final String FACTORY = "factory"; + private static final String PARAMS = "params"; + private static final String COMMENT = "_comment"; @Override public Map<String, Analyzer> deserialize(JsonElement element, Type type, @@ -100,7 +101,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> { tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName); } else if (k.equals(TOKENIZER)) { tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName); - } else { + } else if (! k.equals(COMMENT)) { throw new IllegalArgumentException("Should have one of three values here:"+ CHAR_FILTERS + ", "+ TOKENIZER+", "+ diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java index db6ae26..0e951b8 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java @@ -34,14 +34,10 @@ public class AnalyzerManager { private static final String COMMON_TOKENS = "common_tokens"; private final Analyzer generalAnalyzer; - private final Analyzer alphaIdeoAnalyzer; private final Analyzer commonTokensAnalyzer; - private AnalyzerManager(Analyzer generalAnalyzer, - Analyzer alphaIdeoAnalyzer, - Analyzer commonTokensAnalyzer) { + private AnalyzerManager(Analyzer generalAnalyzer, Analyzer commonTokensAnalyzer) { this.generalAnalyzer = generalAnalyzer; - this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; this.commonTokensAnalyzer = commonTokensAnalyzer; } @@ -58,14 +54,11 @@ public class AnalyzerManager { if (general == null) { throw new JsonParseException("Must specify "+GENERAL + " analyzer"); } - if (alphaIdeo == null) { - throw new JsonParseException("Must specify "+ ALPHA_IDEOGRAPH + " analyzer"); - } if (common == null) { throw new JsonParseException("Must specify "+ COMMON_TOKENS + " analyzer"); } - return new AnalyzerManager(general, alphaIdeo, common); + return new AnalyzerManager(general,common); } /** @@ -77,14 +70,6 @@ public class AnalyzerManager { } /** - * This analyzer is used to extract "alphabetic" tokens. - * @return - */ - public Analyzer getAlphaIdeoAnalyzer() { - return alphaIdeoAnalyzer; - } - - /** * This analyzer should be used to generate common tokens lists from * large corpora. It is not used by tika-eval in profiling or comparing. * @return diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java index 0370f4d..c6d9947 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java @@ -65,15 +65,23 @@ public class CommonTokenCountManager { Map<String, MutableInt> tokens) throws IOException { String actualLangCode = getActualLangCode(langCode); int overlap = 0; + int alphabeticTokens = 0; Set<String> commonTokens = commonTokenMap.get(actualLangCode); for (Map.Entry<String, MutableInt> e : tokens.entrySet()) { - if (commonTokens.contains(e.getKey())) { - overlap += e.getValue().intValue(); + String token = e.getKey(); + int count = e.getValue().intValue(); + if (AlphaIdeographFilterFactory.isAlphabetic(token.toCharArray())) { + alphabeticTokens += count; } + if (commonTokens.contains(token)) { + overlap += count; + } + } - return new CommonTokenResult(actualLangCode, overlap); + return new CommonTokenResult(actualLangCode, overlap, alphabeticTokens); } + //return langcode for lang that you are actually using //lazily load the appropriate model private String getActualLangCode(String langCode) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java index 340ceca..317697a 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java @@ -20,18 +20,29 @@ package org.apache.tika.eval.tokens; public class CommonTokenResult { private final String langCode; - private final int tokens; + private final int commonTokens; + private final int alphabeticTokens; - public CommonTokenResult(String langCode, int tokens) { + public CommonTokenResult(String langCode, int commonTokens, int alphabeticTokens) { this.langCode = langCode; - this.tokens = tokens; + this.commonTokens = commonTokens; + this.alphabeticTokens = alphabeticTokens; } public String getLangCode() { return langCode; } - public int getTokens() { - return tokens; + public int getCommonTokens() { + return commonTokens; + } + + /** + * + * @return number of tokens that had at least one alphabetic/ideographic character + * whether or not a common token + */ + public int getAlphabeticTokens() { + return alphabeticTokens; } } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java index 28e1c78..890dd22 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java @@ -30,8 +30,6 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; public class TokenCounter { - private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a"; - Map<String, Map<String, MutableInt>> map = new HashMap<>(); //Map<field, Map<token, count>> Map<String, TokenStatistics> tokenStatistics = new HashMap<>(); @@ -40,18 +38,15 @@ public class TokenCounter { 0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics()); private final Analyzer generalAnalyzer; - private final Analyzer alphaIdeoAnalyzer; private int topN = 10; - public TokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeoAnalyzer) throws IOException { + public TokenCounter(Analyzer generalAnalyzer) throws IOException { this.generalAnalyzer = generalAnalyzer; - this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; } public void add(String field, String content) throws IOException { _add(field, generalAnalyzer, content); - _add(field+ALPHA_IDEOGRAPH_SUFFIX, alphaIdeoAnalyzer, content); } private void _add(String field, Analyzer analyzer, String content) throws IOException { @@ -136,23 +131,10 @@ public class TokenCounter { if (tokenMap != null) { tokenMap.clear(); } - Map<String, MutableInt> alphaMap = map.get(field+ALPHA_IDEOGRAPH_SUFFIX); - if (alphaMap != null) { - alphaMap.clear(); - } - tokenStatistics.put(field+ALPHA_IDEOGRAPH_SUFFIX, NULL_TOKEN_STAT); tokenStatistics.put(field, NULL_TOKEN_STAT); } - public Map<String, MutableInt> getAlphaTokens(String field) { - Map<String, MutableInt> ret = map.get(field+ALPHA_IDEOGRAPH_SUFFIX); - if (ret == null) { - return Collections.emptyMap(); - } - return ret; - } - public Map<String, MutableInt> getTokens(String field) { Map<String, MutableInt> ret = map.get(field); if (ret == null) { @@ -160,8 +142,4 @@ public class TokenCounter { } return ret; } - - public TokenStatistics getAlphaTokenStatistics(String fieldName) { - return getTokenStatistics(fieldName+ALPHA_IDEOGRAPH_SUFFIX); - } } diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json index f7141f7..663ebe2 100644 --- a/tika-eval/src/main/resources/lucene-analyzers.json +++ b/tika-eval/src/main/resources/lucene-analyzers.json @@ -23,18 +23,19 @@ "params": { "outputUnigrams": "false" } - } - ] - }, - "alpha": { - "charfilters": [ + }, { - "factory": "oala.charfilter.MappingCharFilterFactory", + "factory": "oala.miscellaneous.LimitTokenCountFilterFactory", "params": { - "mapping": "/lucene-char-mapping.txt" + "maxTokenCount": "100000", + "consumeAllTokens": "false" } } - ], + ] + }, + "common_tokens": { + "_comment" : "Use this analyzer for counting common tokens in a corpus.", + "_comment" : "This isn't used by tika-eval during profiling or comparing", "tokenizer": { "factory": "oala.standard.UAX29URLEmailTokenizerFactory", "params": {} @@ -45,6 +46,10 @@ "params": {} }, { + "factory": "org.apache.tika.eval.tokens.AlphaIdeographFilterFactory", + "params": {} + }, + { "factory": "oala.pattern.PatternReplaceFilterFactory", "params": { "pattern": "^[\\w+\\.]{1,30}@(?:\\w+\\.){1,10}\\w+$", @@ -67,24 +72,6 @@ } }, { - "factory": "org.apache.tika.eval.tokens.AlphaIdeographFilterFactory", - "params": {} - } - ] - }, - "common_tokens": { - "tokenizer": { - "factory": "oala.standard.UAX29URLEmailTokenizerFactory", - "params": {} - }, - "tokenfilters": [ - { - "factory": "oala.cjk.CJKBigramFilterFactory", - "params": { - "outputUnigrams": "false" - } - }, - { "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory", "params": { "min": 4, diff --git a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java index f1b9163..7a8a8fb 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java @@ -17,6 +17,8 @@ package org.apache.tika.eval; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -26,6 +28,7 @@ import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.tika.eval.tokens.AlphaIdeographFilterFactory; import org.apache.tika.eval.tokens.AnalyzerManager; import org.junit.Test; @@ -55,13 +58,14 @@ public class AnalyzerManagerTest { @Test public void testCommon() throws Exception { AnalyzerManager analyzerManager = AnalyzerManager.newInstance(); - Analyzer common = analyzerManager.getAlphaIdeoAnalyzer(); + Analyzer common = analyzerManager.getCommonTokensAnalyzer(); TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog"); ts.reset(); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); Set<String> seen = new HashSet<>(); while (ts.incrementToken()) { - if (termAtt.toString().contains("5")) { + String t = termAtt.toString(); + if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray()) && t.contains("5")) { fail("Shouldn't have found a numeric"); } seen.add(termAtt.toString()); @@ -69,10 +73,28 @@ public class AnalyzerManagerTest { ts.end(); ts.close(); - assertTrue(seen.contains("the")); - assertTrue(seen.contains("and")); - assertTrue(seen.contains("dog")); + assertTrue(seen.contains("dirty")); + assertFalse(seen.contains("the")); + + } + + @Test + public void testTokenCountFilter() throws Exception { + AnalyzerManager analyzerManager = AnalyzerManager.newInstance(); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 101000; i++) { + sb.append("the "); + } + TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString()); + ts.reset(); + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + Set<String> seen = new HashSet<>(); + int tokens = 0; + while (ts.incrementToken()) { + tokens++; + } + assertEquals(100000, tokens); } diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java index 486cac7..db37450 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java @@ -41,17 +41,15 @@ public class LuceneTokenCounter { private final LeafReader leafReader; private final MemoryIndex memoryIndex; private final Analyzer generalAnalyzer; - private final Analyzer alphaIdeographAnalyzer; private int topN = 10; Map<String, TokenStatistics> fieldStats = new HashMap<>(); - public LuceneTokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeographAnalyzer) throws IOException { + public LuceneTokenCounter(Analyzer generalAnalyzer) throws IOException { memoryIndex = new MemoryIndex(); IndexSearcher searcher = memoryIndex.createSearcher(); leafReader = (LeafReader)searcher.getIndexReader(); this.generalAnalyzer = generalAnalyzer; - this.alphaIdeographAnalyzer = alphaIdeographAnalyzer; } public void add(String field, String content) throws IOException { @@ -128,9 +126,7 @@ public class LuceneTokenCounter { public TokenStatistics getTokenStatistics(String field) { return fieldStats.get(field); } - public Terms getAlphaTerms(String field) throws IOException { - return leafReader.terms(field+ALPHA_IDEOGRAPH_SUFFIX); - } + public Terms getTerms(String field) throws IOException { return leafReader.terms(field); } diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java index 719b56c..40abdaa 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java @@ -48,12 +48,10 @@ public class TokenCounterTest { @Test public void testBasic() throws Exception { String s = " bde cde def abc efg f f f f ghijklmnop a a a a a a a a a a a a a a a a a b b b b b b b b b b b b b"; - TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), - analyzerManager.getAlphaIdeoAnalyzer()); + TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer()); counter.add(FIELD, s); TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD); - LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer(), - analyzerManager.getAlphaIdeoAnalyzer()); + LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer()); tokenCounter.add(FIELD, s); assertEquals(simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD)); } @@ -67,30 +65,25 @@ public class TokenCounterTest { for (int i = 0; i < numberOfTests; i++) { String s = generateString(); long start = new Date().getTime(); - TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), - analyzerManager.getAlphaIdeoAnalyzer()); + TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer()); counter.add(FIELD, s); simple += new Date().getTime()-start; TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD); start = new Date().getTime(); - LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer(), - analyzerManager.getAlphaIdeoAnalyzer()); + LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer()); tokenCounter.add(FIELD, s); lucene += new Date().getTime()-start; assertEquals(s, simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD)); } - - //System.out.println("SIMPLE: " + simple + " lucene: "+lucene); } @Test public void testCommonTokens() throws Exception { - TokenCounter tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), - analyzerManager.getAlphaIdeoAnalyzer()); + TokenCounter tokenCounter = new TokenCounter(analyzerManager.getCommonTokensAnalyzer()); String s = "the http://www.cnn.com and [email protected] are in valuable www.sites.org 普林斯顿大学"; tokenCounter.add(FIELD, s); - Map<String, MutableInt> tokens = tokenCounter.getAlphaTokens(FIELD); + Map<String, MutableInt> tokens = tokenCounter.getTokens(FIELD); assertEquals(new MutableInt(2), tokens.get("___url___")); assertEquals(new MutableInt(1), tokens.get("___email___")); } -- To stop receiving notification emails like this one, please contact "[email protected]" <[email protected]>.
