Ard Schrijvers pushed to branch release/3.1 at cms-community / hippo-repository
Commits: afd71e84 by Ard Schrijvers at 2016-03-30T10:27:16+02:00 REPO-1457 [Forward port 3.1.3] Add a couple of known mime-types to an upfront excluded set to avoid certain binaries being streamed during indexing while never adding any extracted text. Jackrabbit returns, for example, the value true for isSupportedMediaType("image/jpeg"), with the result that the binary is streamed from the database and then yields "" as extracted text. Unfortunately, it does not seem to be possible to configure Tika in such a way that parser.getSupportedTypes(null); does not return, for example, 'image/jpeg'. Hence, I override #isSupportedMediaType and return false if the mime-type equals one of the preconfigured excluded types. (cherry picked from commit 0438df180e9ddc5ed81eefc4fb36403b7c7a0d7b) - - - - - 1 changed file: - engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java Changes: ===================================== engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java ===================================== --- a/engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java +++ b/engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java @@ -1,5 +1,5 @@ /* - * Copyright 2008-2015 Hippo B.V. (http://www.onehippo.com) + * Copyright 2008-2016 Hippo B.V. (http://www.onehippo.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,12 +22,15 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; import javax.jcr.ItemNotFoundException; import javax.jcr.NamespaceException; import javax.jcr.PropertyType; import javax.jcr.RepositoryException; +import com.google.common.collect.ImmutableSet; + import org.apache.commons.io.IOUtils; import org.apache.jackrabbit.core.id.PropertyId; import org.apache.jackrabbit.core.query.QueryHandlerContext; @@ -73,6 +76,24 @@ public class ServicingNodeIndexer extends NodeIndexer { private boolean supportSimilarityOnStrings = true; private boolean supportSimilarityOnBinaries; + private static Set<String> UNSUPPORTED_BINARY_TYPES = ImmutableSet.of( + "application/x-archive", + "application/x-bzip", + "application/x-bzip2", + "application/x-cpio", + "application/x-gtar", + "application/x-gzip", + "application/x-tar", + "application/zip", + "image/bmp", + "image/gif", + "image/jpeg", + "image/png", + "image/vnd.wap.wbmp", + "image/x-icon", + "image/x-psd", + "image/x-xcf"); + public ServicingNodeIndexer(NodeState node, QueryHandlerContext context, NamespaceMappings mappings, Parser parser) { super(node, context.getItemStateManager(), mappings, context.getExecutor(), parser); this.queryHandlerContext = context; @@ -554,6 +575,15 @@ public class ServicingNodeIndexer extends NodeIndexer { this.supportSimilarityOnBinaries = supportSimilarityOnBinaries; } + + @Override + protected boolean isSupportedMediaType(final String type) { + if (UNSUPPORTED_BINARY_TYPES.contains(type.toLowerCase())) { + return false; + } + return super.isSupportedMediaType(type); + } + private class BinaryValue { private InternalValue internalValue; View it on GitLab: https://code.onehippo.org/cms-community/hippo-repository/commit/afd71e84ce598690b7b2aed1d56dab7fd764d9b6
_______________________________________________ Hippocms-svn mailing list Hippocms-svn@lists.onehippo.org https://lists.onehippo.org/mailman/listinfo/hippocms-svn