This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit ed0c91fddbcce94af47cb8bedf77d3eec74b3fa0 Author: tallison <[email protected]> AuthorDate: Fri Jul 24 17:12:37 2020 -0400 TIKA-3146 -- clean up text profile signature and add unit test for cjk --- .../tika/eval/textstats/TextProfileSignature.java | 16 ++++++++--- .../apache/tika/eval/textstats/TextStatsTest.java | 31 ++++++++++++++++++++++ 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java index bc65351..a1270ca 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java @@ -47,7 +47,7 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> { public String calculate(TokenCounts tokenCounts) { int maxFreq = -1; for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){ - if (e.getKey().length() > minTokenLength) { + if (e.getKey().length() >= minTokenLength) { if (e.getValue().intValue() > maxFreq) { maxFreq = e.getValue().intValue(); } @@ -64,8 +64,11 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> { } List<Token> profile = new ArrayList<>(); - for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){ - profile.add(new Token((e.getValue().intValue()/quant)*quant, e.getKey())); + for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) { + String token = e.getKey(); + if (token.length() >= minTokenLength) { + profile.add(new Token((e.getValue().intValue() / quant) * quant, e.getKey())); + } } Collections.sort(profile, new TokenComparator()); StringBuffer newText = new StringBuffer(); @@ -79,6 +82,13 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> { return base32.encodeAsString(DigestUtils.sha256(newText.toString())); } + /** + * Be careful -- for CJK languages, the default analyzer uses character + * bigrams. You will "ignore" all cjk language tokens if you set + * minTokenLength > 2! + * + * @param minTokenLength -- include tokens of this length or greater. + */ public void setMinTokenLength(int minTokenLength) { this.minTokenLength = minTokenLength; } diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java index 8206977..a2252b6 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java @@ -71,4 +71,35 @@ public class TextStatsTest { DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))), stats.get(TextSha256Signature.class)); } + + @Test + public void testCJK() throws Exception { + String txt = "普林斯顿大学"; + List<TextStatsCalculator> calcs = new ArrayList<>(); + calcs.add(new TextProfileSignature()); + CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs); + + Map<Class, Object> stats = calc.calculate(txt); + String textProfileSignature = (String)stats.get(TextProfileSignature.class); + assertEquals("XKXLY6FNIGK2KGEF6HOSKSVGYDLLOFIAGO73RLMJ22PZVXBTXFFA====", textProfileSignature); + + //now test that if a user accidentally sets mintoken length > 2 + //the output will the be same as empty text + calcs.clear(); + calcs.add(new TextProfileSignature()); + calc = new CompositeTextStatsCalculator(calcs); + + stats = calc.calculate(""); + String emptyStringSignature = (String)stats.get(TextProfileSignature.class); + + calcs.clear(); + TextProfileSignature tPs = new TextProfileSignature(); + tPs.setMinTokenLength(3); + calcs.add(tPs); + calc = new CompositeTextStatsCalculator(calcs); + + stats = calc.calculate(txt); + assertEquals(emptyStringSignature, (String)stats.get(TextProfileSignature.class)); + + } }
