Author: chetanm
Date: Mon Feb 16 12:47:43 2015
New Revision: 1660100
URL: http://svn.apache.org/r1660100
Log:
OAK-2523 - Provide a default Tika Config similar to JR2 config
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
(with props)
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1660100&r1=1660099&r2=1660100&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Mon Feb 16 12:47:43 2015
@@ -24,6 +24,7 @@ import static org.apache.lucene.store.No
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.net.URL;
import java.util.Calendar;
import org.apache.commons.io.IOUtils;
@@ -42,12 +43,10 @@ import org.apache.lucene.index.SerialMer
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
public class LuceneIndexEditorContext {
@@ -97,7 +96,7 @@ public class LuceneIndexEditorContext {
private final IndexWriterConfig config;
- private static final Parser defaultParser = new AutoDetectParser();
+ private static final Parser defaultParser = createDefaultParser();
private final IndexDefinition definition;
@@ -222,18 +221,40 @@ public class LuceneIndexEditorContext {
if (definition.hasCustomTikaConfig()){
InputStream is = definition.getTikaConfig();
try {
+ return new AutoDetectParser(getTikaConfig(is, definition));
+ } finally {
+ IOUtils.closeQuietly(is);
+ }
+ }
+ return defaultParser;
+ }
+
+ private static AutoDetectParser createDefaultParser() {
+ URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
+ InputStream is = null;
+ if (configUrl != null) {
+ try {
+ is = configUrl.openStream();
TikaConfig config = new TikaConfig(is);
+ log.info("Loaded default Tika Config from classpath {}", configUrl);
return new AutoDetectParser(config);
- } catch (IOException e){
- throw new RuntimeException("Error loading TikaConfig for "+ definition, e);
- } catch (SAXException e) {
- throw new RuntimeException("Error loading TikaConfig for "+ definition, e);
- } catch (TikaException e) {
- throw new RuntimeException("Error loading TikaConfig for "+ definition, e);
+ } catch (Exception e) {
+ log.warn("Tika configuration not available : " + configUrl, e);
} finally {
IOUtils.closeQuietly(is);
}
+ } else {
+ log.warn("Default Tika configuration not found from {}", configUrl);
}
- return defaultParser;
+ return new AutoDetectParser();
+ }
+
+ private static TikaConfig getTikaConfig(InputStream configStream, Object source){
+ try {
+ return new TikaConfig(configStream);
+ } catch (Exception e) {
+ log.warn("Tika configuration not available : "+source, e);
+ }
+ return TikaConfig.getDefaultConfig();
}
}
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml?rev=1660100&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml Mon Feb 16 12:47:43 2015
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one
+ ~ or more contributor license agreements. See the NOTICE file
+ ~ distributed with this work for additional information
+ ~ regarding copyright ownership. The ASF licenses this file
+ ~ to you under the Apache License, Version 2.0 (the
+ ~ "License"); you may not use this file except in compliance
+ ~ with the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing,
+ ~ software distributed under the License is distributed on an
+ ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ ~ KIND, either express or implied. See the License for the
+ ~ specific language governing permissions and limitations
+ ~ under the License.
+ -->
+
+<properties>
+ <detectors>
+ <detector class="org.apache.tika.detect.DefaultDetector"/>
+ </detectors>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <!-- Disable package extraction as it's too resource-intensive -->
+ <mime>application/x-archive</mime>
+ <mime>application/x-bzip</mime>
+ <mime>application/x-bzip2</mime>
+ <mime>application/x-cpio</mime>
+ <mime>application/x-gtar</mime>
+ <mime>application/x-gzip</mime>
+ <mime>application/x-tar</mime>
+ <mime>application/zip</mime>
+ <!-- Disable image extraction as there's no text to be found -->
+ <mime>image/bmp</mime>
+ <mime>image/gif</mime>
+ <mime>image/jpeg</mime>
+ <mime>image/png</mime>
+ <mime>image/vnd.wap.wbmp</mime>
+ <mime>image/x-icon</mime>
+ <mime>image/x-psd</mime>
+ <mime>image/x-xcf</mime>
+ </parser>
+ </parsers>
+</properties>
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/resources/org/apache/jackrabbit/oak/plugins/index/lucene/tika-config.xml
------------------------------------------------------------------------------
svn:eol-style = native