Author: jnioche
Date: Mon Apr 2 11:50:35 2012
New Revision: 1308310
URL: http://svn.apache.org/viewvc?rev=1308310&view=rev
Log: NUTCH-1234 Upgrade to Tika 1.1 (jnioche, markus)
Removed: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/parse-tika/ivy.xml nutch/trunk/src/plugin/parse-tika/plugin.xml nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1308310&r1=1308309&r2=1308310&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Apr 2 11:50:35 2012 @@ -1,6 +1,6 @@ Nutch Change Log -* NUTCH-1234 Upgrade to Tika 1.1 (markus) +* NUTCH-1234 Upgrade to Tika 1.1 (jnioche, markus) * NUTCH-809 Parse-metatags plugin (jnioche) Modified: nutch/trunk/src/plugin/parse-tika/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/ivy.xml?rev=1308310&r1=1308309&r2=1308310&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/ivy.xml (original) +++ nutch/trunk/src/plugin/parse-tika/ivy.xml Mon Apr 2 11:50:35 2012 @@ -36,8 +36,7 @@ </publications> <dependencies> - <dependency org="org.apache.tika" name="tika-parsers" rev="0.10" conf="*->default"> - <exclude org="org.apache.tika" name="tika-core" /> + <dependency org="org.apache.tika" name="tika-parsers" rev="1.1" conf="*->default"> </dependency> </dependencies> Modified: nutch/trunk/src/plugin/parse-tika/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/plugin.xml?rev=1308310&r1=1308309&r2=1308310&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/plugin.xml (original) +++ nutch/trunk/src/plugin/parse-tika/plugin.xml Mon Apr 2 11:50:35 2012 @@ -32,26 +32,30 @@ <library name="bcmail-jdk15-1.45.jar"/> <library name="bcprov-jdk15-1.45.jar"/> <library name="boilerpipe-1.1.0.jar"/> - 
<library name="commons-codec-1.4.jar"/> - <library name="commons-compress-1.1.jar"/> - <library name="commons-httpclient-3.1.jar"/> + <library name="commons-codec-1.5.jar"/> + <library name="commons-compress-1.3.jar"/> <library name="commons-logging-1.1.1.jar"/> <library name="dom4j-1.6.1.jar"/> <library name="fontbox-1.6.0.jar"/> <library name="geronimo-stax-api_1.0_spec-1.0.1.jar"/> + <library name="isoparser-1.0-beta-5.jar"/> + <library name="javassist-3.6.0.GA.jar"/> <library name="jdom-1.0.jar"/> <library name="jempbox-1.6.0.jar"/> <library name="metadata-extractor-2.4.0-beta-1.jar"/> <library name="netcdf-4.2-min.jar"/> <library name="pdfbox-1.6.0.jar"/> - <library name="poi-3.8-beta4.jar"/> + <library name="poi-3.8-beta5.jar"/> <library name="poi-ooxml-3.8-beta4.jar"/> - <library name="poi-ooxml-schemas-3.8-beta4.jar"/> - <library name="poi-scratchpad-3.8-beta4.jar"/> + <library name="poi-ooxml-schemas-3.8-beta5.jar"/> + <library name="poi-scratchpad-3.8-beta5.jar"/> <library name="rome-0.9.jar"/> + <library name="scannotation-1.0.2.jar"/> <library name="slf4j-api-1.5.6.jar"/> <library name="tagsoup-1.2.1.jar"/> - <library name="tika-parsers-0.10.jar"/> + <library name="tika-parsers-1.1.jar"/> + <library name="vorbis-java-core-0.1.jar"/> + <library name="vorbis-java-tika-0.1.jar"/> <library name="xmlbeans-2.3.0.jar"/> </runtime> Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1308310&r1=1308309&r2=1308310&view=diff ============================================================================== --- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original) +++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Mon Apr 2 11:50:35 2012 @@ -20,11 +20,8 @@ import java.io.ByteArrayInputStream; import 
java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; -import java.util.HashMap; import java.util.Map; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.nutch.metadata.Nutch; @@ -38,9 +35,13 @@ import org.apache.nutch.parse.ParseImpl; import org.apache.nutch.parse.ParseResult; import org.apache.nutch.parse.ParseStatus; import org.apache.nutch.protocol.Content; +import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; /** @@ -70,7 +71,7 @@ public class TikaParser implements org.a } // get the right parser using the mime type as a clue - Parser parser = tikaConfig.getParser(mimeType); + Parser parser = tikaConfig.getParser(MediaType.parse(mimeType)); byte[] raw = content.getContent(); if (parser == null) { @@ -202,7 +203,7 @@ public class TikaParser implements org.a } } else { try { - tikaConfig = TikaConfig.getDefaultConfig(); + tikaConfig = new TikaConfig(this.getClass().getClassLoader()); } catch (Exception e2) { String message = "Problem loading default Tika configuration"; LOG.error(message, e2);