Ard Schrijvers pushed to branch release/3.1 at cms-community / hippo-repository


Commits:
afd71e84 by Ard Schrijvers at 2016-03-30T10:27:16+02:00
REPO-1457 [Forward port 3.1.3] Add a couple of known mime-types to an upfront 
excluded set to prevent certain binaries from being streamed during indexing 
even though they never yield any extracted text
For example, Jackrabbit returns true for isSupportedMediaType("image/jpeg"), 
with the result that the binary is streamed from the database only to yield 
"" as extracted text. Unfortunately, it does not seem to be possible to 
configure Tika in such a way that parser.getSupportedTypes(null) does not 
return, for example, 'image/jpeg'. 
Hence, I override #isSupportedMediaType and return false if the mime-type 
equals one of the preconfigured excluded types.

(cherry picked from commit 0438df180e9ddc5ed81eefc4fb36403b7c7a0d7b)

- - - - -


1 changed file:

- 
engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java


Changes:

=====================================
engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java
=====================================
--- 
a/engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java
+++ 
b/engine/src/main/java/org/hippoecm/repository/query/lucene/ServicingNodeIndexer.java
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2015 Hippo B.V. (http://www.onehippo.com)
+ *  Copyright 2008-2016 Hippo B.V. (http://www.onehippo.com)
  * 
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -22,12 +22,15 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
+import java.util.Set;
 
 import javax.jcr.ItemNotFoundException;
 import javax.jcr.NamespaceException;
 import javax.jcr.PropertyType;
 import javax.jcr.RepositoryException;
 
+import com.google.common.collect.ImmutableSet;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.core.id.PropertyId;
 import org.apache.jackrabbit.core.query.QueryHandlerContext;
@@ -73,6 +76,24 @@ public class ServicingNodeIndexer extends NodeIndexer {
     private boolean supportSimilarityOnStrings = true;
     private boolean supportSimilarityOnBinaries;
 
+    private static Set<String> UNSUPPORTED_BINARY_TYPES = ImmutableSet.of(
+            "application/x-archive",
+            "application/x-bzip",
+            "application/x-bzip2",
+            "application/x-cpio",
+            "application/x-gtar",
+            "application/x-gzip",
+            "application/x-tar",
+            "application/zip",
+            "image/bmp",
+            "image/gif",
+            "image/jpeg",
+            "image/png",
+            "image/vnd.wap.wbmp",
+            "image/x-icon",
+            "image/x-psd",
+            "image/x-xcf");
+
     public ServicingNodeIndexer(NodeState node, QueryHandlerContext context, 
NamespaceMappings mappings, Parser parser) {
         super(node, context.getItemStateManager(), mappings, 
context.getExecutor(), parser);
         this.queryHandlerContext = context;
@@ -554,6 +575,15 @@ public class ServicingNodeIndexer extends NodeIndexer {
         this.supportSimilarityOnBinaries = supportSimilarityOnBinaries;
     }
 
+
+    @Override
+    protected boolean isSupportedMediaType(final String type) {
+        if (UNSUPPORTED_BINARY_TYPES.contains(type.toLowerCase())) {
+            return false;
+        }
+        return super.isSupportedMediaType(type);
+    }
+
     private class BinaryValue {
 
         private InternalValue internalValue;



View it on GitLab: 
https://code.onehippo.org/cms-community/hippo-repository/commit/afd71e84ce598690b7b2aed1d56dab7fd764d9b6
_______________________________________________
Hippocms-svn mailing list
Hippocms-svn@lists.onehippo.org
https://lists.onehippo.org/mailman/listinfo/hippocms-svn

Reply via email to