This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new ca4852d  TIKA-3147 -- drop tokens below quant value.
ca4852d is described below

commit ca4852db326445059cdd444dcda0d5d2ac414e23
Author: tballison <[email protected]>
AuthorDate: Mon Jul 27 15:29:46 2020 -0400

    TIKA-3147 -- drop tokens below quant value.
---
 .../java/org/apache/tika/eval/textstats/TextProfileSignature.java     | 4 ++++
 .../src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index a1270ca..7628e49 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -67,6 +67,10 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
         for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) {
             String token = e.getKey();
             if (token.length() >= minTokenLength) {
+                int quantCnt = (e.getValue().intValue() / quant) * quant;
+                if (quantCnt < quant) {
+                    continue;
+                }
                 profile.add(new Token((e.getValue().intValue() / quant) * quant, e.getKey()));
             }
         }
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index 6290150..486791b 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -65,7 +65,7 @@ public class TextStatsTest {
         assertEquals(0.02, probabilities.get(1).getConfidence(), 0.01);
 
         String textProfileSignature = (String)stats.get(TextProfileSignature.class);
-        assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", textProfileSignature);
+        assertEquals("XF3W27O7IWOJVVNQ4HLKYYPCPPX3L2M72YSEMZ3WADL4VTXVITIA====", textProfileSignature);
 
         assertEquals(new Base32().encodeAsString(
                 DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),

Reply via email to