This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit db4498d1de534f8348e94b0f27c641353a26b083 Author: tallison <[email protected]> AuthorDate: Thu Jul 16 15:58:00 2020 -0400 TIKA-3137 -- first pass, need to add unit tests for tika-batch --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +- .../batch/fs/RecursiveParserWrapperFSConsumer.java | 9 +- .../tika/batch/fs/StreamOutRPWFSConsumer.java | 20 ++- .../fs/builders/BasicTikaFSConsumersBuilder.java | 11 +- .../RecursiveParserWrapperFSConsumerTest.java | 5 +- .../java/org/apache/tika/config/TikaConfig.java | 108 ++++++++++++- .../metadata/filter/ClearByMimeMetadataFilter.java | 74 +++++++++ .../metadata/filter/CompositeMetadataFilter.java | 38 +++++ .../metadata/filter/DefaultMetadataFilter.java | 46 ++++++ .../filter/ExcludeFieldMetadataFilter.java | 53 +++++++ .../filter/IncludeFieldMetadataFilter.java | 58 +++++++ .../tika/metadata/filter/MetadataFilter.java | 33 ++++ .../apache/tika/metadata/filter/NoOpFilter.java | 34 +++++ .../tika/sax/RecursiveParserWrapperHandler.java | 31 +++- .../org.apache.tika.metadata.filter.MetadataFilter | 16 ++ .../org/apache/tika/config/TikaConfigTest.java | 2 + .../tika/metadata/filter/MockUpperCaseFilter.java | 39 +++++ .../tika/metadata/filter/TestMetadataFilter.java | 170 +++++++++++++++++++++ .../org/apache/tika/config/TIKA-3137-exclude.xml | 26 ++++ .../apache/tika/config/TIKA-3137-include-uc.xml | 27 ++++ .../org/apache/tika/config/TIKA-3137-include.xml | 26 ++++ .../org/apache/tika/config/TIKA-3137-mimes-uc.xml | 27 ++++ .../tika/parser/RecursiveParserWrapperTest.java | 43 ++++++ .../org/apache/tika/parser/TIKA-3137-include.xml | 31 ++++ .../server/resource/RecursiveMetadataResource.java | 3 +- .../java/org/apache/tika/server/CXFTestBase.java | 7 +- .../tika/server/RecursiveMetadataFilterTest.java | 107 +++++++++++++ .../org/apache/tika/server/TIKA-3137-include.xml | 31 ++++ 28 files changed, 1062 insertions(+), 17 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 8077114..46f82ee 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -513,7 +513,9 @@ public class TikaCLI { private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1); + RecursiveParserWrapperHandler handler = + new RecursiveParserWrapperHandler(getContentHandlerFactory(type), + -1, config.getMetadataFilter()); try (InputStream input = TikaInputStream.get(url, metadata)) { wrapper.parse(input, handler, metadata, context); } diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java index 56b8b58..9732781 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java @@ -32,6 +32,8 @@ import org.apache.tika.batch.ParserFactory; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -50,6 +52,7 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer { private final Parser parser; private final ContentHandlerFactory contentHandlerFactory; private final OutputStreamFactory fsOSFactory; + private final MetadataFilter metadataFilter; private String outputEncoding = "UTF-8"; /** @@ -62,11 +65,12 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer { public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> queue, Parser parser, ContentHandlerFactory contentHandlerFactory, - OutputStreamFactory fsOSFactory) { + OutputStreamFactory fsOSFactory, MetadataFilter metadataFilter) { super(queue); this.contentHandlerFactory = contentHandlerFactory; this.fsOSFactory = fsOSFactory; this.parser = parser; + this.metadataFilter = metadataFilter; } @Override @@ -95,7 +99,8 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer { Throwable thrown = null; List<Metadata> metadataList = null; Metadata containerMetadata = fileResource.getMetadata(); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, -1); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, + -1, metadataFilter); try { parse(fileResource.getResourceId(), parser, is, handler, containerMetadata, context); diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java index 018c1a9..dd39a6c 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java @@ -20,12 +20,15 @@ package org.apache.tika.batch.fs; import org.apache.commons.io.IOUtils; +import org.apache.tika.Tika; import org.apache.tika.batch.FileResource; import org.apache.tika.batch.OutputStreamFactory; import org.apache.tika.batch.ParserFactory; import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.serialization.JsonStreamingSerializer; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; @@ -53,17 +56,19 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer { private final Parser parser; private final ContentHandlerFactory contentHandlerFactory; private final OutputStreamFactory fsOSFactory; + private final MetadataFilter metadataFilter; private String outputEncoding = "UTF-8"; public StreamOutRPWFSConsumer(ArrayBlockingQueue<FileResource> queue, Parser parser, ContentHandlerFactory contentHandlerFactory, - OutputStreamFactory fsOSFactory) { + OutputStreamFactory fsOSFactory, MetadataFilter metadataFilter) { super(queue); this.contentHandlerFactory = contentHandlerFactory; this.fsOSFactory = fsOSFactory; this.parser = parser; + this.metadataFilter = metadataFilter; } @Override @@ -93,7 +98,8 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer { JsonStreamingSerializer writer = new JsonStreamingSerializer( new OutputStreamWriter(os, StandardCharsets.UTF_8)); - WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory, writer); + WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory, + writer, metadataFilter); Throwable thrown = null; try { parse(fileResource.getResourceId(), parser, is, handler, @@ -137,16 +143,24 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer { //be written straight to disk. private class WriteoutRPWHandler extends AbstractRecursiveParserWrapperHandler { private final JsonStreamingSerializer jsonWriter; + private final MetadataFilter metadataFilter; - public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer) { + public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer, + MetadataFilter metadataFilter) { super(contentHandlerFactory); this.jsonWriter = writer; + this.metadataFilter = metadataFilter; } @Override public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { metadata.add(RecursiveParserWrapperHandler.TIKA_CONTENT, contentHandler.toString()); try { + metadataFilter.filter(metadata); + } catch (TikaException e) { + throw new SAXException(e); + } + try { jsonWriter.add(metadata); } catch (IOException e) { throw new SAXException(e); diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java index 88171ee..4f05324 100644 --- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java +++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java @@ -42,6 +42,9 @@ import org.apache.tika.batch.fs.FSUtil; import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer; import org.apache.tika.batch.fs.StreamOutRPWFSConsumer; import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.filter.NoOpFilter; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.sax.BasicContentHandlerFactory; @@ -145,15 +148,19 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder { contentHandlerFactory, recursiveParserWrapper); Parser parser = parserFactory.getParser(config); if (recursiveParserWrapper) { + MetadataFilter metadataFilter = config.getMetadataFilter(); parser = new RecursiveParserWrapper(parser); + for (int i = 0; i < numConsumers; i++) { FileResourceConsumer c = null; if (streamOut){ c = new StreamOutRPWFSConsumer(queue, - parser, contentHandlerFactory, outputStreamFactory); + parser, contentHandlerFactory, + outputStreamFactory, metadataFilter); } else { c = new RecursiveParserWrapperFSConsumer(queue, - parser, contentHandlerFactory, outputStreamFactory); + parser, contentHandlerFactory, + outputStreamFactory, metadataFilter); } consumers.add(c); } diff --git a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java index 7ebe564..6a61414 100644 --- a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java +++ b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java @@ -34,6 +34,7 @@ import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.metadata.serialization.JsonMetadataList; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; @@ -75,7 +76,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest { Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig())); RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer( queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - mockOSFactory); + mockOSFactory, NoOpFilter.NOOP_FILTER); IFileProcessorFutureResult result = consumer.call(); mockOSFactory.getStreams().get(0).flush(); @@ -123,7 +124,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest { Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig())); RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer( queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), - mockOSFactory); + mockOSFactory, NoOpFilter.NOOP_FILTER); IFileProcessorFutureResult result = consumer.call(); mockOSFactory.getStreams().get(0).flush(); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index 92485d3..a0cc102 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -50,6 +50,9 @@ import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; +import org.apache.tika.metadata.filter.CompositeMetadataFilter; +import org.apache.tika.metadata.filter.DefaultMetadataFilter; +import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypeException; @@ -104,6 +107,10 @@ public class TikaConfig { return new SimpleThreadPoolExecutor(); } + private static MetadataFilter getDefaultMetadataFilter(ServiceLoader loader) { + return new DefaultMetadataFilter(loader); + } + //use this to look for unneeded instantiations of TikaConfig protected static AtomicInteger TIMES_INSTANTIATED = new AtomicInteger(); @@ -115,6 +122,7 @@ public class TikaConfig { private final MimeTypes mimeTypes; private final ExecutorService executorService; private final EncodingDetector encodingDetector; + private final MetadataFilter metadataFilter; public TikaConfig(String file) throws TikaException, IOException, SAXException { @@ -180,6 +188,7 @@ public class TikaConfig { TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader(); ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader(); EncodingDetectorXmlLoader encodingDetectorXmlLoader = new EncodingDetectorXmlLoader(); + MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader(); updateXMLReaderUtils(element); this.mimeTypes = typesFromDomElement(element); this.detector = detectorLoader.loadOverall(element, mimeTypes, loader); @@ -189,6 +198,7 @@ public class TikaConfig { this.parser = parserLoader.loadOverall(element, mimeTypes, loader); this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); this.executorService = executorLoader.loadOverall(element, mimeTypes, loader); + this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, loader); this.serviceLoader = loader; TIMES_INSTANTIATED.incrementAndGet(); } @@ -214,6 +224,7 @@ public class TikaConfig { this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector); this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); + this.metadataFilter = getDefaultMetadataFilter(serviceLoader); TIMES_INSTANTIATED.incrementAndGet(); } @@ -249,6 +260,7 @@ public class TikaConfig { this.detector = getDefaultDetector(mimeTypes, serviceLoader); this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); + this.metadataFilter = getDefaultMetadataFilter(serviceLoader); } else { ServiceLoader tmpServiceLoader = new ServiceLoader(); try (InputStream stream = getConfigInputStream(config, tmpServiceLoader)) { @@ -259,7 +271,8 @@ public class TikaConfig { EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader(); TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader(); ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader(); - + MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader(); + this.mimeTypes = typesFromDomElement(element); this.encodingDetector = encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader); @@ -269,6 +282,7 @@ public class TikaConfig { this.detector = detectorLoader.loadOverall(element, mimeTypes, serviceLoader); this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader); this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader); + this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, serviceLoader); } catch (SAXException e) { throw new TikaException( "Specified Tika configuration has syntax errors: " @@ -393,6 +407,9 @@ public class TikaConfig { return serviceLoader; } + public MetadataFilter getMetadataFilter() { + return metadataFilter; + } /** * Provides a default configuration (TikaConfig). Currently creates a * new instance each time it's called; we may be able to have it @@ -1101,7 +1118,8 @@ public class TikaConfig { } @Override - CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors, MimeTypes mimeTypes, ServiceLoader loader) { + CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors, + MimeTypes mimeTypes, ServiceLoader loader) { return new CompositeEncodingDetector(encodingDetectors); } @@ -1142,5 +1160,91 @@ public class TikaConfig { } } + private static class MetadataFilterXmlLoader extends + XmlLoader<MetadataFilter, MetadataFilter> { + + boolean supportsComposite() { + return true; + } + + String getParentTagName() { + return "metadataFilters"; + } + + String getLoaderTagName() { + return "metadataFilter"; + } + + @Override + Class<? extends MetadataFilter> getLoaderClass() { + return MetadataFilter.class; + } + + + @Override + boolean isComposite(MetadataFilter loaded) { + return loaded instanceof CompositeMetadataFilter; + } + + @Override + boolean isComposite(Class<? extends MetadataFilter> loadedClass) { + return CompositeMetadataFilter.class.isAssignableFrom(loadedClass); + } + + @Override + MetadataFilter preLoadOne(Class<? extends MetadataFilter> loadedClass, + String classname, MimeTypes mimeTypes) throws TikaException { + // Check for classes which can't be set in config + // Continue with normal loading + return null; + } + + @Override + MetadataFilter createDefault(MimeTypes mimeTypes, ServiceLoader loader) { + return getDefaultMetadataFilter(loader); + } + + //this ignores the service loader + @Override + MetadataFilter createComposite(List<MetadataFilter> loaded, MimeTypes mimeTypes, ServiceLoader loader) { + return new DefaultMetadataFilter(loaded); + } + + @Override + MetadataFilter createComposite(Class<? extends MetadataFilter> metadataFilterClass, + List<MetadataFilter> childMetadataFilters, + Set<Class<? extends MetadataFilter>> excludeFilters, + Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader) + throws InvocationTargetException, IllegalAccessException, + InstantiationException { + MetadataFilter metadataFilter = null; + Constructor<? extends MetadataFilter> c; + + // Try the possible default and composite detector constructors + if (metadataFilter == null) { + try { + c = metadataFilterClass.getConstructor(ServiceLoader.class, Collection.class); + metadataFilter = c.newInstance(loader, excludeFilters); + } catch (NoSuchMethodException me) { + me.printStackTrace(); + } + } + if (metadataFilter == null) { + try { + c = metadataFilterClass.getConstructor(List.class); + metadataFilter = c.newInstance(childMetadataFilters); + } catch (NoSuchMethodException me) { + me.printStackTrace(); + } + } + + return metadataFilter; + } + + @Override + MetadataFilter decorate(MetadataFilter created, Element element) { + return created; // No decoration of MetadataFilters + } + } } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java new file mode 100644 index 0000000..05324f2 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * This class clears the entire metadata object if the + * mime matches the mime filter. The idea is that you might not want + * to store/transmit metadata for images or specific file types. + */ +public class ClearByMimeMetadataFilter implements MetadataFilter { + private final Set<String> mimes; + + public ClearByMimeMetadataFilter() { + this(new HashSet<>()); + } + + public ClearByMimeMetadataFilter(Set<String> mimes) { + this.mimes = mimes; + } + + @Override + public void filter(Metadata metadata) throws TikaException { + String mimeString = metadata.get(Metadata.CONTENT_TYPE); + if (mimeString == null) { + return; + } + MediaType mt = MediaType.parse(mimeString); + if (mt != null) { + mimeString = mt.getBaseType().toString(); + } + if (mimes.contains(mimeString)) { + for (String n : metadata.names()) { + metadata.remove(n); + } + + } + } + + /** + * + * @param mimesString comma-delimited list of mimes that will trigger complete removal of metadata + */ + @Field + public void setMimes(String mimesString) { + for (String include : mimesString.split(",")) { + mimes.add(include); + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java new file mode 100644 index 0000000..4d592c9 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.util.List; + +public class CompositeMetadataFilter implements MetadataFilter { + + private final List<MetadataFilter> filters; + + public CompositeMetadataFilter(List<MetadataFilter> filters) { + this.filters = filters; + } + + @Override + public void filter(Metadata metadata) throws TikaException { + for (MetadataFilter filter : filters) { + filter.filter(metadata); + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java new file mode 100644 index 0000000..7671f50 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.config.ServiceLoader; +import org.apache.tika.mime.MimeTypes; +import org.apache.tika.utils.ServiceLoaderUtils; + +import java.util.List; + +public class DefaultMetadataFilter extends CompositeMetadataFilter { + + private static List<MetadataFilter> getDefaultFilters( + ServiceLoader loader) { + List<MetadataFilter> detectors = loader.loadStaticServiceProviders(MetadataFilter.class); + ServiceLoaderUtils.sortLoadedClasses(detectors); + + return detectors; + } + + public DefaultMetadataFilter(ServiceLoader serviceLoader) { + super(getDefaultFilters(serviceLoader)); + } + + public DefaultMetadataFilter(List<MetadataFilter> metadataFilters) { + super(metadataFilters); + } + + public DefaultMetadataFilter() { + this(new ServiceLoader()); + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java new file mode 100644 index 0000000..3b6e2a0 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.util.HashSet; +import java.util.Set; + +public class ExcludeFieldMetadataFilter implements MetadataFilter { + private final Set<String> exclude; + + public ExcludeFieldMetadataFilter() { + this(new HashSet<>()); + } + public ExcludeFieldMetadataFilter(Set<String> exclude) { + this.exclude = exclude; + } + + @Override + public void filter(Metadata metadata) throws TikaException { + for (String field : exclude) { + metadata.remove(field); + } + } + + /** + * + * @param excludeString comma-delimited list of fields to exclude + */ + @Field + public void setExclude(String excludeString) { + for (String include : excludeString.split(",")) { + exclude.add(include); + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java new file mode 100644 index 0000000..4bc6c9e --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class IncludeFieldMetadataFilter implements MetadataFilter { + private final Set<String> includeSet; + + public IncludeFieldMetadataFilter() { + this(new HashSet<>()); + } + + public IncludeFieldMetadataFilter(Set<String> fields) { + this.includeSet = fields; + } + + /** + * + * @param includeString comma-delimited list of fields to include + */ + @Field + public void setInclude(String includeString) { + for (String include : includeString.split(",")) { + includeSet.add(include); + } + } + + @Override + public void filter(Metadata metadata) throws TikaException { + + for (String n : metadata.names()) { + if (! includeSet.contains(n)) { + metadata.remove(n); + } + } + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java new file mode 100644 index 0000000..7a8f345 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.metadata.filter; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.io.Serializable; + +/** + * Filters the metadata in place + * + * @since Apache Tika 1.25 + */ +public interface MetadataFilter extends Serializable { + + void filter(Metadata metadata) throws TikaException; +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java new file mode 100644 index 0000000..9cd1ec3 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +/** + * This filter performs no operations on the metadata + * and leaves it untouched. + */ +public class NoOpFilter implements MetadataFilter { + + public static NoOpFilter NOOP_FILTER = new NoOpFilter(); + + @Override + public void filter(Metadata metadata) throws TikaException { + //no op + } +} diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java index 408598f..50f0fb8 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -16,7 +16,10 @@ */ package org.apache.tika.sax; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.utils.ParserUtils; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -40,12 +43,13 @@ import java.util.List; public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler { protected final List<Metadata> metadataList = new LinkedList<>(); + private final MetadataFilter metadataFilter; /** * Create a handler with no limit on the number of embedded resources */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { - super(contentHandlerFactory); + this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER); } /** @@ -54,7 +58,13 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe * @param maxEmbeddedResources number of embedded resources that will be parsed */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { + this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER); + } + + public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources, + MetadataFilter metadataFilter) { super(contentHandlerFactory, maxEmbeddedResources); + this.metadataFilter = metadataFilter; } /** @@ -79,7 +89,15 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { super.endEmbeddedDocument(contentHandler, metadata); addContent(contentHandler, metadata); - metadataList.add(ParserUtils.cloneMetadata(metadata)); + try { + metadataFilter.filter(metadata); + } catch (TikaException e) { + throw new SAXException(e); + } + + if (metadata.size() > 0) { + metadataList.add(ParserUtils.cloneMetadata(metadata)); + } } /** @@ -92,8 +110,15 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { super.endDocument(contentHandler, metadata); addContent(contentHandler, metadata); + try { + metadataFilter.filter(metadata); + } catch (TikaException e) { + throw new SAXException(e); + } - metadataList.add(0, ParserUtils.cloneMetadata(metadata)); + if (metadata.size() > 0) { + metadataList.add(0, ParserUtils.cloneMetadata(metadata)); + } } /** diff --git a/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter b/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter new file mode 100644 index 0000000..604a480 --- /dev/null +++ b/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.tika.metadata.filter.NoOpFilter \ No newline at end of file diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java index 5c406cd..1b8722d 100644 --- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java +++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java @@ -327,4 +327,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest { getConfig("TIKA-2732-xmlreaderutils-exc.xml"); } + + } \ No newline at end of file diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java new file mode 100644 index 0000000..0632dd4 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +import java.util.Locale; + +/** + * Mock Filter for testing uppercasing of all values + */ +public class MockUpperCaseFilter implements MetadataFilter { + + @Override + public void filter(Metadata metadata) throws TikaException { + for (String n : metadata.names()) { + String[] vals = metadata.getValues(n); + metadata.remove(n); + for (int i = 0; i < vals.length; i++) { + metadata.add(n, vals[i].toUpperCase(Locale.US)); + } + } + } +} diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java new file mode 100644 index 0000000..e933d0c --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.metadata.filter; + +import org.apache.tika.config.AbstractTikaConfigTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.junit.Test; + +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +public class TestMetadataFilter extends AbstractTikaConfigTest { + + @Test + public void testDefault() throws Exception { + Metadata metadata = new Metadata(); + metadata.set("title", "title"); + metadata.set("author", "author"); + + MetadataFilter defaultFilter = new DefaultMetadataFilter(); + defaultFilter.filter(metadata); + + assertEquals(2, metadata.names().length); + assertEquals("title", metadata.get("title")); + assertEquals("author", metadata.get("author")); + } + + @Test + public void testIncludeFilter() throws Exception { + Metadata metadata = new Metadata(); + metadata.set("title", "title"); + metadata.set("author", "author"); + + MetadataFilter filter = new IncludeFieldMetadataFilter(set("title")); + filter.filter(metadata); + assertEquals(1, metadata.names().length); + assertEquals("title", metadata.get("title")); + assertNull(metadata.get("author")); + } + + @Test + public void testExcludeFilter() throws Exception { + Metadata metadata = new Metadata(); + metadata.set("title", "title"); + metadata.set("author", "author"); + + MetadataFilter filter = new ExcludeFieldMetadataFilter(set("title")); + filter.filter(metadata); + assertEquals(1, metadata.names().length); + assertEquals("author", metadata.get("author")); + assertNull(metadata.get("title")); + } + + @Test + public void testConfigIncludeFilter() throws Exception { + TikaConfig config = getConfig("TIKA-3137-include.xml"); + Metadata metadata = new Metadata(); + metadata.set("title", "title"); + metadata.set("author", "author"); + metadata.set("content", "content"); + + config.getMetadataFilter().filter(metadata); + + assertEquals(2, metadata.size()); + assertEquals("title", metadata.get("title")); + assertEquals("author", metadata.get("author")); + } + + @Test + public void testConfigExcludeFilter() throws Exception { + TikaConfig config = getConfig("TIKA-3137-exclude.xml"); + Metadata metadata = new Metadata(); + metadata.set("title", "title"); + metadata.set("author", "author"); + metadata.set("content", "content"); + + config.getMetadataFilter().filter(metadata); + + assertEquals(1, metadata.size()); + assertEquals("content", metadata.get("content")); + } + + @Test + public void testConfigIncludeAndUCFilter() throws Exception { + TikaConfig config = getConfig("TIKA-3137-include-uc.xml"); + String[] expectedTitles = new String[]{ + "TITLE1", "TITLE2", "TITLE3" + }; + Metadata metadata = new Metadata(); + metadata.add("title", "title1"); + metadata.add("title", "title2"); + metadata.add("title", "title3"); + metadata.set("author", "author"); + metadata.set("content", "content"); + + config.getMetadataFilter().filter(metadata); + + assertEquals(2, metadata.size()); + assertArrayEquals(expectedTitles, metadata.getValues("title")); + assertEquals("AUTHOR", metadata.get("author")); + } + + @Test + public void testMimeClearingFilter() throws Exception { + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, MediaType.image("jpeg").toString()); + metadata.set("author", "author"); + + MetadataFilter filter = new ClearByMimeMetadataFilter(set("image/jpeg","application/pdf")); + filter.filter(metadata); + assertEquals(0, metadata.size()); + + metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString()); + metadata.set("author", "author"); + filter.filter(metadata); + assertEquals(2, metadata.size()); + assertEquals("author", metadata.get("author")); + + } + + @Test + public void testMimeClearingFilterConfig() throws Exception { + TikaConfig config = getConfig("TIKA-3137-mimes-uc.xml"); + + Metadata metadata = new Metadata(); + metadata.set(Metadata.CONTENT_TYPE, MediaType.image("jpeg").toString()); + metadata.set("author", "author"); + + MetadataFilter filter = config.getMetadataFilter(); + filter.filter(metadata); + debug(metadata); + assertEquals(0, metadata.size()); + + metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString()); + metadata.set("author", "author"); + filter.filter(metadata); + assertEquals(2, metadata.size()); + assertEquals("AUTHOR", metadata.get("author")); + + } + + private static Set<String> set(String ... items) { + Set<String> set = new HashSet<>(); + for (String item : items) { + set.add(item); + } + return set; + } +} diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml new file mode 100644 index 0000000..27517f6 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter"> + <params> + <param name="exclude" type="string">title,author</param> + </params> + </metadataFilter> + </metadataFilters> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml new file mode 100644 index 0000000..e0df476 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter"> + <params> + <param name="include" type="string">title,author</param> + </params> + </metadataFilter> + <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/> + </metadataFilters> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml new file mode 100644 index 0000000..e92dff8 --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml @@ -0,0 +1,26 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter"> + <params> + <param name="include" type="string">title,author</param> + </params> + </metadataFilter> + </metadataFilters> +</properties> diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml new file mode 100644 index 0000000..486280c --- /dev/null +++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter"> + <params> + <param name="mimes" type="string">image/jpeg,application/pdf</param> + </params> + </metadataFilter> + <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/> + </metadataFilters> +</properties> diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index a5182c6..349f271 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -21,6 +21,7 @@ package org.apache.tika.parser; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; import java.io.IOException; import java.io.InputStream; @@ -30,6 +31,7 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.apache.tika.io.ClosedInputStream; import org.apache.tika.io.ProxyInputStream; @@ -365,6 +367,47 @@ public class RecursiveParserWrapperTest extends TikaTest { } + @Test + public void testIncludeFilter() throws Exception { + //TIKA-3137 + ParseContext context = new ParseContext(); + Metadata metadata = new Metadata(); + TikaConfig tikaConfig = new TikaConfig(getClass().getResourceAsStream("TIKA-3137-include.xml")); + Parser p = new AutoDetectParser(tikaConfig); + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, true); + String path = "/test-documents/test_recursive_embedded.docx"; + ContentHandlerFactory contentHandlerFactory = + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, + -1); + + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, + -1, tikaConfig.getMetadataFilter()); + try (InputStream is = getClass().getResourceAsStream(path)) { + wrapper.parse(is, handler, metadata, context); + } + List<Metadata> metadataList = handler.getMetadataList(); + assertEquals(5, metadataList.size()); + + Set<String> expectedKeys = new HashSet<>(); + expectedKeys.add("X-TIKA:content"); + expectedKeys.add("extended-properties:Application"); + expectedKeys.add("Content-Type"); + for (Metadata m : metadataList) { + if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) { + fail("emf should have been filtered out"); + } + if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) { + fail("text/plain should have been filtered out"); + } + assertTrue(m.names().length >= 2); + for (String n : m.names()) { + if (! expectedKeys.contains(n)) { + fail("didn't expect "+n); + } + } + } + } + private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory, boolean catchEmbeddedExceptions, DigestingParser.Digester digester) throws Exception { diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml new file mode 100644 index 0000000..765bc11 --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter"> + <params> + <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param> + </params> + </metadataFilter> + <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter"> + <params> + <param name="mimes" type="string">image/emf,text/plain</param> + </params> + </metadataFilter> + </metadataFilters> +</properties> diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java index 07d20c5..71e7180 100644 --- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java +++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java @@ -152,7 +152,8 @@ public class RecursiveMetadataResource { BasicContentHandlerFactory.HANDLER_TYPE type = BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( - new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources); + new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources, + TikaResource.getConfig().getMetadataFilter()); try { TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context); } catch (SecurityException e) { diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java index 92c9d34..8b5f153 100644 --- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java +++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java @@ -90,7 +90,8 @@ public abstract class CXFTestBase { @Before public void setUp() throws Exception { - this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml")); + + this.tika = new TikaConfig(getTikaConfigInputStream()); TikaResource.init(tika, new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"), new DefaultInputStreamFactory(), new ServerStatus(true)); @@ -120,6 +121,10 @@ public abstract class CXFTestBase { server = sf.create(); } + protected InputStream getTikaConfigInputStream() { + return getClass().getResourceAsStream("tika-config-for-server-tests.xml"); + } + /** * Have the test do {@link JAXRSServerFactoryBean#setResourceClasses(Class...)} * and {@link JAXRSServerFactoryBean#setResourceProvider(Class, org.apache.cxf.jaxrs.lifecycle.ResourceProvider)} diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java new file mode 100644 index 0000000..748ee77 --- /dev/null +++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.server; + +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.cxf.jaxrs.JAXRSServerFactoryBean; +import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.cxf.jaxrs.ext.multipart.Attachment; +import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.OfficeOpenXMLExtended; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.metadata.serialization.JsonMetadataList; +import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; +import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.apache.tika.server.resource.RecursiveMetadataResource; +import org.apache.tika.server.writer.MetadataListMessageBodyWriter; +import org.junit.Test; + +import javax.ws.rs.core.Response; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.apache.tika.TikaTest.assertNotContained; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class RecursiveMetadataFilterTest extends CXFTestBase { + + private static final String META_PATH = "/rmeta"; + + private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx"; + + @Override + protected InputStream getTikaConfigInputStream() { + return getClass().getResourceAsStream("TIKA-3137-include.xml"); + } + @Override + protected void setUpResources(JAXRSServerFactoryBean sf) { + sf.setResourceClasses(RecursiveMetadataResource.class); + sf.setResourceProvider(RecursiveMetadataResource.class, + new SingletonResourceProvider(new RecursiveMetadataResource())); + } + + @Override + protected void setUpProviders(JAXRSServerFactoryBean sf) { + List<Object> providers = new ArrayList<>(); + providers.add(new MetadataListMessageBodyWriter()); + sf.setProviders(providers); + } + + @Test + public void testBasicFilter() throws Exception { + Response response = WebClient + .create(endPoint + META_PATH) + .accept("application/json") + .acceptEncoding("gzip") + .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC)); + + Reader reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8); + List<Metadata> metadataList = JsonMetadataList.fromJson(reader); + assertEquals(5, metadataList.size()); + + Set<String> expectedKeys = new HashSet<>(); + expectedKeys.add("X-TIKA:content"); + expectedKeys.add("extended-properties:Application"); + expectedKeys.add("Content-Type"); + for (Metadata m : metadataList) { + if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) { + fail("emf should have been filtered out"); + } + if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) { + fail("text/plain should have been filtered out"); + } + assertTrue(m.names().length >= 2); + for (String n : m.names()) { + if (! expectedKeys.contains(n)) { + fail("didn't expect "+n); + } + } + } + } +} diff --git a/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml new file mode 100644 index 0000000..765bc11 --- /dev/null +++ b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <metadataFilters> + <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter"> + <params> + <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param> + </params> + </metadataFilter> + <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter"> + <params> + <param name="mimes" type="string">image/emf,text/plain</param> + </params> + </metadataFilter> + </metadataFilters> +</properties>
