This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3388d28f0276b50ba5accb2fe9daad3cc2152d6d Author: tallison <[email protected]> AuthorDate: Thu Jul 16 17:27:06 2020 -0400 TIKA-3140 -- initial commit --- .../tika/eval/metadata/TikaEvalMetadataFilter.java | 104 +++++++++++++++++++++ .../eval/metadata/TikaEvalMetadataFilterTest.java | 51 ++++++++++ 2 files changed, 155 insertions(+) diff --git a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java new file mode 100644 index 0000000..2c69801 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.textstats.CommonTokens;
+import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.textstats.TextStatsCalculator;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * {@link MetadataFilter} that computes tika-eval text statistics for the content stored under
+ * {@link AbstractRecursiveParserWrapperHandler#TIKA_CONTENT} and writes them to the same metadata.
+ */
+public class TikaEvalMetadataFilter implements MetadataFilter {
+    // These keys are constants: declared final so that no caller can reassign them.
+    public static final String TIKA_EVAL_NS =
+            "tika-eval" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    public static final Property NUM_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numTokens");
+    public static final Property NUM_UNIQUE_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numUniqueTokens");
+    public static final Property NUM_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens");
+    public static final Property NUM_UNIQUE_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens");
+    public static final Property LANGUAGE = Property.externalText(TIKA_EVAL_NS + "lang");
+    public static final Property LANGUAGE_CONFIDENCE =
+            Property.externalReal(TIKA_EVAL_NS + "langConfidence");
+    /** Out-of-vocabulary value; written as -1 when there are no alphabetic tokens. */
+    public static final Property OUT_OF_VOCABULARY = Property.externalReal(TIKA_EVAL_NS + "oov");
+
+    /** Calculator shared by all instances; built once when the class is initialized. */
+    static final CompositeTextStatsCalculator TEXT_STATS_CALCULATOR;
+    static {
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new BasicTokenCountStatsCalculator());
+        calcs.add(new CommonTokens());
+        TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs);
+    }
+
+    /** Adds the statistics to {@code metadata}; a missing or blank content value is skipped. */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String content = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        if (StringUtils.isBlank(content)) {
+            return;
+        }
+        calcStats(content, metadata);
+    }
+
+    /** Runs the calculators over {@code content} and writes each result into {@code metadata}. */
+    private void calcStats(String content, Metadata metadata) {
+        Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(content);
+        TokenCounts tokenCounts = (TokenCounts) results.get(BasicTokenCountStatsCalculator.class);
+        metadata.set(NUM_TOKENS, tokenCounts.getTotalTokens());
+        metadata.set(NUM_UNIQUE_TOKENS, tokenCounts.getTotalUniqueTokens());
+
+        CommonTokenResult common = (CommonTokenResult) results.get(CommonTokens.class);
+        metadata.set(NUM_ALPHA_TOKENS, common.getAlphabeticTokens());
+        metadata.set(NUM_UNIQUE_ALPHA_TOKENS, common.getUniqueAlphabeticTokens());
+        if (common.getAlphabeticTokens() > 0) {
+            metadata.set(OUT_OF_VOCABULARY, common.getOOV());
+        } else {
+            metadata.set(OUT_OF_VOCABULARY, -1.0f); // nothing to base a value on
+        }
+
+        // This class registers no LanguageIDWrapper itself, so do not assume the entry exists.
+        @SuppressWarnings("unchecked") // the map stores every result as Object
+        List<Language> languages = (List<Language>) results.get(LanguageIDWrapper.class);
+        if (languages != null && !languages.isEmpty()) {
+            Language best = languages.get(0);
+            metadata.set(LANGUAGE, best.getLanguage());
+            metadata.set(LANGUAGE_CONFIDENCE, best.getConfidence());
+        }
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
new file mode 100644
index 0000000..1b3d006
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/** Checks the values that {@link TikaEvalMetadataFilter} writes for one short sentence. */
+public class TikaEvalMetadataFilterTest {
+
+    /** Filters a single sentence and checks the language and token statistics written back. */
+    @Test
+    public void testBasic() throws Exception {
+        String text = "the quick brown fox, Zothro 1234 1235, jumped over the lazy dog";
+        Metadata m = new Metadata();
+        m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, text);
+        new TikaEvalMetadataFilter().filter(m);
+
+        assertEquals("eng", m.get(TikaEvalMetadataFilter.LANGUAGE));
+        assertEquals(12, m.getInt(TikaEvalMetadataFilter.NUM_TOKENS).intValue());
+        assertEquals(11, m.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS).intValue());
+        assertEquals(10, m.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS).intValue());
+        assertEquals(9, m.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS).intValue());
+
+        // NOTE(review): the 0.1 tolerance below is larger than both expected values, so
+        // these two checks only bound the results loosely.
+        double oov = Double.parseDouble(m.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY));
+        assertEquals(0.0999, oov, 0.1);
+        assertEquals("eng", m.get(TikaEvalMetadataFilter.LANGUAGE));
+
+        double confidence = Double.parseDouble(m.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE));
+        assertEquals(0.0196, confidence, 0.1);
+    }
+}
