This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6d58ea584cf327cc28af1e3fc338094cc70416d2 Author: tallison <[email protected]> AuthorDate: Fri Jul 24 16:59:04 2020 -0400 TIKA-3145 -- add TextSha256Signature --- .../tika/eval/textstats/BytesRefCalculator.java | 33 ++++++++++ .../textstats/CompositeTextStatsCalculator.java | 74 ++++++++++++++++++---- .../tika/eval/textstats/TextProfileSignature.java | 8 ++- .../tika/eval/textstats/TextSha256Signature.java | 54 ++++++++++++++++ .../apache/tika/eval/textstats/TextStatsTest.java | 13 +++- 5 files changed, 165 insertions(+), 17 deletions(-) diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java new file mode 100644 index 0000000..048b798 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.textstats; + +/** + * Interface for calculators that require a string + * @param <T> + */ +public interface BytesRefCalculator<T> extends TextStatsCalculator { + + public BytesRefCalcInstance<T> getInstance(); + + interface BytesRefCalcInstance<T> { + void update(byte[] bytes, int start, int len); + T finish(); + Class getOuterClass(); + } + +} diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java index 2c7c673..a16c767 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java @@ -17,29 +17,38 @@ package org.apache.tika.eval.textstats; import java.io.IOException; +import java.security.MessageDigest; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; +import org.apache.lucene.util.BytesRef; import org.apache.tika.eval.langid.Language; import org.apache.tika.eval.langid.LanguageIDWrapper; import org.apache.tika.eval.tokens.AnalyzerManager; import org.apache.tika.eval.tokens.TokenCounts; +import org.apache.tika.metadata.Message; public class CompositeTextStatsCalculator { private static final String FIELD = "f"; private static final int DEFAULT_MAX_TOKENS = 10_000_000; + private final byte[] whitespace = new byte[]{' '}; private final Analyzer analyzer; private final LanguageIDWrapper languageIDWrapper; private final List<LanguageAwareTokenCountStats> languageAwareTokenCountStats = new ArrayList<>(); private final List<TokenCountStatsCalculator> tokenCountStatCalculators = new ArrayList<>(); private final List<StringStatsCalculator> stringStatCalculators = new ArrayList<>(); + private final List<BytesRefCalculator> bytesRefCalculators = new ArrayList<>(); public CompositeTextStatsCalculator(List<TextStatsCalculator> calculators) { this(calculators, @@ -68,6 +77,14 @@ public class CompositeTextStatsCalculator { "a TokenCountStats: "+t.getClass() ); } + } else if (t instanceof BytesRefCalculator) { + bytesRefCalculators.add((BytesRefCalculator)t); + if (analyzer == null) { + throw new IllegalArgumentException( + "Analyzer must not be null if you are using "+ + "a BytesRefCalculator: "+t.getClass() + ); + } } else { throw new IllegalArgumentException( "I regret I don't yet handle: "+t.getClass() @@ -83,9 +100,11 @@ public class CompositeTextStatsCalculator { } TokenCounts tokenCounts = null; - if (tokenCountStatCalculators.size() > 0 || languageAwareTokenCountStats.size() > 0) { + if (tokenCountStatCalculators.size() > 0 + || languageAwareTokenCountStats.size() > 0 + || bytesRefCalculators.size() > 0) { try { - tokenCounts = tokenize(txt); + tokenCounts = tokenize(txt, results); } catch (IOException e) { throw new RuntimeException(e); } @@ -106,20 +125,51 @@ public class CompositeTextStatsCalculator { return results; } - private TokenCounts tokenize(String txt) throws IOException { + private TokenCounts tokenize(String txt, Map<Class, Object> results) throws IOException { TokenCounts counts = new TokenCounts(); TokenStream ts = analyzer.tokenStream(FIELD, txt); - try { - CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); - ts.reset(); - while (ts.incrementToken()) { - String token = termAtt.toString(); - counts.increment(token); + if (bytesRefCalculators.size() == 0) { + try { + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + String token = termAtt.toString(); + counts.increment(token); + } + } finally { + ts.close(); + ts.end(); + } + } else { + List<BytesRefCalculator.BytesRefCalcInstance> brcis = new ArrayList<>(); + for (BytesRefCalculator brf : bytesRefCalculators) { + brcis.add(brf.getInstance()); + } + try { + TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class); + ts.reset(); + int i = 0; + while (ts.incrementToken()) { + final BytesRef bytesRef = termAtt.getBytesRef(); + String token = termAtt.toString(); + counts.increment(token); + for (BytesRefCalculator.BytesRefCalcInstance brci : brcis) { + if (i > 0) { + brci.update(whitespace, 0, 1); + } + brci.update(bytesRef.bytes, bytesRef.offset, bytesRef.length); + } + i++; + } + for (BytesRefCalculator.BytesRefCalcInstance brc : brcis) { + results.put(brc.getOuterClass(), brc.finish()); + } + } finally { + ts.close(); + ts.end(); } - } finally { - ts.close(); - ts.end(); } + return counts; } } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java index b72cc99..bc65351 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java @@ -16,7 +16,7 @@ */ package org.apache.tika.eval.textstats; -import org.apache.commons.codec.binary.Base64; +import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.mutable.MutableInt; import org.apache.tika.eval.tokens.TokenCounts; @@ -32,6 +32,8 @@ import java.util.Map; * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java * * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html + * + * This returns the base32 encoded sha256 */ public class TextProfileSignature implements TokenCountStatsCalculator<String> { @@ -39,7 +41,7 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> { float quantRate = 0.01f; boolean secondaryLexicographicSorting = true; - Base64 base64 = new Base64(); + Base32 base32 = new Base32(); @Override public String calculate(TokenCounts tokenCounts) { @@ -74,7 +76,7 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> { } newText.append(t.val); } - return base64.encodeAsString(DigestUtils.sha256(newText.toString())); + return base32.encodeAsString(DigestUtils.sha256(newText.toString())); } public void setMinTokenLength(int minTokenLength) { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java new file mode 100644 index 0000000..eab4e08 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.textstats; + +import org.apache.commons.codec.binary.Base32; +import org.apache.commons.codec.digest.DigestUtils; + +import java.security.MessageDigest; + +/** + * Calculates the base32 encoded SHA-256 checksum on the analyzed text + */ +public class TextSha256Signature implements BytesRefCalculator<String> { + + static Base32 BASE32 = new Base32(); + + @Override + public BytesRefCalcInstance<String> getInstance() { + return new TextSha256Instance(); + } + + class TextSha256Instance implements BytesRefCalcInstance<String> { + private MessageDigest messageDigest = DigestUtils.getSha256Digest(); + @Override + public void update(byte[] bytes, int start, int len) { + messageDigest.update(bytes, start, len); + + } + + @Override + public String finish() { + return BASE32.encodeAsString(messageDigest.digest()); + } + + @Override + public Class getOuterClass() { + return TextSha256Signature.class; + } + } +} diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java index ceb39ad..8206977 100644 --- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java +++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java @@ -16,11 +16,14 @@ */ package org.apache.tika.eval.textstats; +import org.apache.commons.codec.binary.Base32; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.tika.eval.langid.Language; import org.apache.tika.eval.langid.LanguageIDWrapper; import org.apache.tika.eval.tokens.CommonTokenResult; import org.junit.Test; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -31,12 +34,14 @@ public class TextStatsTest { @Test public void testBasic() throws Exception { - String txt = "the quick brown fox &&^&%@! 8675309 jumped over the lazy wombat"; + String txt = "The quick brown fox &&^&%@! 8675309 jumped over tHe lazy wombat"; + String txtCleaned = "the quick brown fox 8675309 jumped over the lazy wombat"; List<TextStatsCalculator> calcs = new ArrayList<>(); calcs.add(new TextProfileSignature()); calcs.add(new ContentLengthCalculator()); calcs.add(new TokenEntropy()); calcs.add(new CommonTokens()); + calcs.add(new TextSha256Signature()); CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs); Map<Class, Object> stats = calc.calculate(txt); @@ -60,6 +65,10 @@ public class TextStatsTest { assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01); String textProfileSignature = (String)stats.get(TextProfileSignature.class); - assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", textProfileSignature); + assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", textProfileSignature); + + assertEquals(new Base32().encodeAsString( + DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))), + stats.get(TextSha256Signature.class)); } }
