Author: jnioche
Date: Mon Apr  2 11:50:35 2012
New Revision: 1308310

URL: http://svn.apache.org/viewvc?rev=1308310&view=rev
Log:
NUTCH-1234 Upgrade to Tika 1.1 (jnioche, markus)

Removed:
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/parse-tika/ivy.xml
    nutch/trunk/src/plugin/parse-tika/plugin.xml
    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1308310&r1=1308309&r2=1308310&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Apr  2 11:50:35 2012
@@ -1,6 +1,6 @@
 Nutch Change Log
 
-* NUTCH-1234 Upgrade to Tika 1.1 (markus)
+* NUTCH-1234 Upgrade to Tika 1.1 (jnioche, markus)
 
 * NUTCH-809 Parse-metatags plugin (jnioche)
 

Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1308310&r1=1308309&r2=1308310&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/ivy.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/ivy.xml Mon Apr  2 11:50:35 2012
@@ -36,8 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="0.10" conf="*->default">
-     <exclude org="org.apache.tika" name="tika-core" />
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.1" conf="*->default">
     </dependency>
   </dependencies>
   

Modified: nutch/trunk/src/plugin/parse-tika/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=1308310&r1=1308309&r2=1308310&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/plugin.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/plugin.xml Mon Apr  2 11:50:35 2012
@@ -32,26 +32,30 @@
       <library name="bcmail-jdk15-1.45.jar"/>
       <library name="bcprov-jdk15-1.45.jar"/>
       <library name="boilerpipe-1.1.0.jar"/>
-      <library name="commons-codec-1.4.jar"/>
-      <library name="commons-compress-1.1.jar"/>
-      <library name="commons-httpclient-3.1.jar"/>
+      <library name="commons-codec-1.5.jar"/>
+      <library name="commons-compress-1.3.jar"/>
       <library name="commons-logging-1.1.1.jar"/>
       <library name="dom4j-1.6.1.jar"/>
       <library name="fontbox-1.6.0.jar"/>
       <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/>
+      <library name="isoparser-1.0-beta-5.jar"/>
+      <library name="javassist-3.6.0.GA.jar"/>
       <library name="jdom-1.0.jar"/>
       <library name="jempbox-1.6.0.jar"/>
       <library name="metadata-extractor-2.4.0-beta-1.jar"/>
       <library name="netcdf-4.2-min.jar"/>
       <library name="pdfbox-1.6.0.jar"/>
-      <library name="poi-3.8-beta4.jar"/>
+      <library name="poi-3.8-beta5.jar"/>
       <library name="poi-ooxml-3.8-beta4.jar"/>
-      <library name="poi-ooxml-schemas-3.8-beta4.jar"/>
-      <library name="poi-scratchpad-3.8-beta4.jar"/>
+      <library name="poi-ooxml-schemas-3.8-beta5.jar"/>
+      <library name="poi-scratchpad-3.8-beta5.jar"/>
       <library name="rome-0.9.jar"/>
+      <library name="scannotation-1.0.2.jar"/>
       <library name="slf4j-api-1.5.6.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-0.10.jar"/>
+      <library name="tika-parsers-1.1.jar"/>
+      <library name="vorbis-java-core-0.1.jar"/>
+      <library name="vorbis-java-tika-0.1.jar"/>
       <library name="xmlbeans-2.3.0.jar"/>
    </runtime>
 

Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1308310&r1=1308309&r2=1308310&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Mon Apr  2 11:50:35 2012
@@ -20,11 +20,8 @@ import java.io.ByteArrayInputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.Map;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.nutch.metadata.Nutch;
@@ -38,9 +35,13 @@ import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.protocol.Content;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
 
 /**
@@ -70,7 +71,7 @@ public class TikaParser implements org.a
                }
 
                // get the right parser using the mime type as a clue
-               Parser parser = tikaConfig.getParser(mimeType);
+               Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
                byte[] raw = content.getContent();
 
                if (parser == null) {
@@ -202,7 +203,7 @@ public class TikaParser implements org.a
                        }
                } else {
                        try {
-                               tikaConfig = TikaConfig.getDefaultConfig();
+                               tikaConfig = new TikaConfig(this.getClass().getClassLoader());
                        } catch (Exception e2) {
                                 String message = "Problem loading default Tika configuration";
                                LOG.error(message, e2);


Reply via email to