Author: chetanm
Date: Mon Feb 16 12:47:43 2015
New Revision: 1660100

URL: http://svn.apache.org/r1660100
Log:
OAK-2523 - Provide a default Tika Config similar to JR2 config

Added:
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/
    jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml   (with props)
Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1660100&r1=1660099&r2=1660100&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Mon Feb 16 12:47:43 2015
@@ -24,6 +24,7 @@ import static org.apache.lucene.store.No
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URL;
 import java.util.Calendar;
 
 import org.apache.commons.io.IOUtils;
@@ -42,12 +43,10 @@ import org.apache.lucene.index.SerialMer
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
 
 public class LuceneIndexEditorContext {
 
@@ -97,7 +96,7 @@ public class LuceneIndexEditorContext {
 
     private final IndexWriterConfig config;
 
-    private static final Parser defaultParser = new AutoDetectParser();
+    private static final Parser defaultParser = createDefaultParser();
 
     private final IndexDefinition definition;
 
@@ -222,18 +221,40 @@ public class LuceneIndexEditorContext {
         if (definition.hasCustomTikaConfig()){
             InputStream is = definition.getTikaConfig();
             try {
+                return new AutoDetectParser(getTikaConfig(is, definition));
+            } finally {
+                IOUtils.closeQuietly(is);
+            }
+        }
+        return defaultParser;
+    }
+
+    private static AutoDetectParser createDefaultParser() {
+        URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
+        InputStream is = null;
+        if (configUrl != null) {
+            try {
+                is = configUrl.openStream();
                 TikaConfig config = new TikaConfig(is);
+                log.info("Loaded default Tika Config from classpath {}", configUrl);
                 return new AutoDetectParser(config);
-            } catch (IOException e){
-                throw new RuntimeException("Error loading TikaConfig for "+ definition, e);
-            } catch (SAXException e) {
-                throw new RuntimeException("Error loading TikaConfig for "+ definition, e);
-            } catch (TikaException e) {
-                throw new RuntimeException("Error loading TikaConfig for "+ definition, e);
+            } catch (Exception e) {
+                log.warn("Tika configuration not available : " + configUrl, e);
             } finally {
                 IOUtils.closeQuietly(is);
             }
+        } else {
+            log.warn("Default Tika configuration not found from {}", configUrl);
         }
-        return defaultParser;
+        return new AutoDetectParser();
+    }
+
+    private static TikaConfig getTikaConfig(InputStream configStream, Object source){
+        try {
+            return new TikaConfig(configStream);
+        } catch (Exception e) {
+            log.warn("Tika configuration not available : "+source, e);
+        }
+        return TikaConfig.getDefaultConfig();
     }
 }

Added: jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml?rev=1660100&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml Mon Feb 16 12:47:43 2015
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~ or more contributor license agreements.  See the NOTICE file
+  ~ distributed with this work for additional information
+  ~ regarding copyright ownership.  The ASF licenses this file
+  ~ to you under the Apache License, Version 2.0 (the
+  ~ "License"); you may not use this file except in compliance
+  ~ with the License.  You may obtain a copy of the License at
+  ~
+  ~   http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing,
+  ~ software distributed under the License is distributed on an
+  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  ~ KIND, either express or implied.  See the License for the
+  ~ specific language governing permissions and limitations
+  ~ under the License.
+  -->
+
+<properties>
+  <detectors>
+    <detector class="org.apache.tika.detect.DefaultDetector"/>
+  </detectors>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+    <parser class="org.apache.tika.parser.EmptyParser">
+      <!-- Disable package extraction as it's too resource-intensive -->
+      <mime>application/x-archive</mime>
+      <mime>application/x-bzip</mime>
+      <mime>application/x-bzip2</mime>
+      <mime>application/x-cpio</mime>
+      <mime>application/x-gtar</mime>
+      <mime>application/x-gzip</mime>
+      <mime>application/x-tar</mime>
+      <mime>application/zip</mime>
+      <!-- Disable image extraction as there's no text to be found -->
+      <mime>image/bmp</mime>
+      <mime>image/gif</mime>
+      <mime>image/jpeg</mime>
+      <mime>image/png</mime>
+      <mime>image/vnd.wap.wbmp</mime>
+      <mime>image/x-icon</mime>
+      <mime>image/x-psd</mime>
+      <mime>image/x-xcf</mime>
+    </parser>
+  </parsers>
+</properties>

Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to