This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ed0c91fddbcce94af47cb8bedf77d3eec74b3fa0
Author: tallison <[email protected]>
AuthorDate: Fri Jul 24 17:12:37 2020 -0400

    TIKA-3146 -- clean up text profile signature and add unit test for cjk
---
 .../tika/eval/textstats/TextProfileSignature.java  | 16 ++++++++---
 .../apache/tika/eval/textstats/TextStatsTest.java  | 31 ++++++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index bc65351..a1270ca 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -47,7 +47,7 @@ public class TextProfileSignature implements 
TokenCountStatsCalculator<String> {
     public String calculate(TokenCounts tokenCounts) {
         int maxFreq = -1;
         for (Map.Entry<String, MutableInt> e : 
tokenCounts.getTokens().entrySet()){
-            if (e.getKey().length() > minTokenLength) {
+            if (e.getKey().length() >= minTokenLength) {
                 if (e.getValue().intValue() > maxFreq) {
                     maxFreq = e.getValue().intValue();
                 }
@@ -64,8 +64,11 @@ public class TextProfileSignature implements 
TokenCountStatsCalculator<String> {
         }
 
         List<Token> profile = new ArrayList<>();
-        for (Map.Entry<String, MutableInt> e : 
tokenCounts.getTokens().entrySet()){
-            profile.add(new Token((e.getValue().intValue()/quant)*quant, 
e.getKey()));
+        for (Map.Entry<String, MutableInt> e : 
tokenCounts.getTokens().entrySet()) {
+            String token = e.getKey();
+            if (token.length() >= minTokenLength) {
+                profile.add(new Token((e.getValue().intValue() / quant) * 
quant, e.getKey()));
+            }
         }
         Collections.sort(profile, new TokenComparator());
         StringBuffer newText = new StringBuffer();
@@ -79,6 +82,13 @@ public class TextProfileSignature implements 
TokenCountStatsCalculator<String> {
         return base32.encodeAsString(DigestUtils.sha256(newText.toString()));
     }
 
+    /**
+     * Be careful -- for CJK languages, the default analyzer uses character
+     * bigrams.  You will "ignore" all CJK language tokens if you set
+     * minTokenLength > 2!
+     *
+     * @param minTokenLength -- include tokens of this length or greater.
+     */
     public void setMinTokenLength(int minTokenLength) {
         this.minTokenLength = minTokenLength;
     }
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index 8206977..a2252b6 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -71,4 +71,35 @@ public class TextStatsTest {
                 
DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),
                 stats.get(TextSha256Signature.class));
     }
+
+    @Test
+    public void testCJK() throws Exception {
+        String txt = "普林斯顿大学";
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new TextProfileSignature());
+        CompositeTextStatsCalculator calc = new 
CompositeTextStatsCalculator(calcs);
+
+        Map<Class, Object> stats = calc.calculate(txt);
+        String textProfileSignature = 
(String)stats.get(TextProfileSignature.class);
+        
assertEquals("XKXLY6FNIGK2KGEF6HOSKSVGYDLLOFIAGO73RLMJ22PZVXBTXFFA====", 
textProfileSignature);
+
+        //now test that if a user accidentally sets minTokenLength > 2
+        //the output will be the same as for empty text
+        calcs.clear();
+        calcs.add(new TextProfileSignature());
+        calc = new CompositeTextStatsCalculator(calcs);
+
+        stats = calc.calculate("");
+        String emptyStringSignature = 
(String)stats.get(TextProfileSignature.class);
+
+        calcs.clear();
+        TextProfileSignature tPs = new TextProfileSignature();
+        tPs.setMinTokenLength(3);
+        calcs.add(tPs);
+        calc = new CompositeTextStatsCalculator(calcs);
+
+        stats = calc.calculate(txt);
+        assertEquals(emptyStringSignature, 
(String)stats.get(TextProfileSignature.class));
+
+    }
 }

Reply via email to