svn commit: r1661539 - in /nutch/branches/2.x: ./ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/

2015-02-22 Thread lewismc
Author: lewismc
Date: Sun Feb 22 19:54:21 2015
New Revision: 1661539

URL: http://svn.apache.org/r1661539
Log:
NUTCH-1925 Upgrade Tika to version 1.7

Modified:
nutch/branches/2.x/CHANGES.txt

nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java

nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1661539&r1=1661538&r2=1661539&view=diff
==
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Sun Feb 22 19:54:21 2015
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development 2.4-SNAPSHOT
 
+* NUTCH-1925 Upgrade to Apache Tika 1.7 palsulich.p2.v2.patch (Tyler Palsulich via lewismc)
+
 * NUTCH-1925 Upgrade to Apache Tika 1.7 (Tyler Palsulich via markus)
 
 * NUTCH-1924 Nutch + HBase Docker (Radosław Stankiewicz via lewismc)

Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java?rev=1661539&r1=1661538&r2=1661539&view=diff
==
--- 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaConfig.java
 Sun Feb 22 19:54:21 2015
@@ -1,241 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.tika;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URL;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
-import javax.imageio.spi.ServiceRegistry;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.ParserConfigurationException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.mime.MimeTypesFactory;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-
-/**
- * Parse xml config file.
- */
-public class TikaConfig {
-
-  private final Map parsers = new HashMap();
-
-  private final MimeTypes mimeTypes;
-
-  public TikaConfig(String file) throws TikaException, IOException,
-  SAXException {
-this(new File(file));
-  }
-
-  public TikaConfig(File file) throws TikaException, IOException, SAXException {
-this(getBuilder().parse(file));
-  }
-
-  public TikaConfig(URL url) throws TikaException, IOException, SAXException {
-this(getBuilder().parse(url.toString()));
-  }
-
-  public TikaConfig(InputStream stream) throws TikaException, IOException,
-  SAXException {
-this(getBuilder().parse(stream));
-  }
-
-  /**
-   * @deprecated This method will be removed in Apache Tika 1.0
-   * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
-   */
-  public TikaConfig(InputStream stream, Parser delegate) throws TikaException,
-  IOException, SAXException {
-this(stream);
-  }
-
-  public TikaConfig(Document document) throws TikaException, IOException {
-this(document.getDocumentElement());
-  }
-
-  /**
-   * @deprecated This method will be removed in Apache Tika 1.0
-   * @see <a href="https://issues.apache.org/jira/browse/TIKA-275">TIKA-275</a>
-   */
-  public TikaConfig(Document document, Parser delegate) throws TikaException,
-  IOException {
-this(document);
-  }
-
-  public TikaConfig(Element element) throws TikaException, IOException {
-Element mtr = getChild(element, "mimeTypeRepository");
-if (mtr != null && mtr.hasAttrib

svn commit: r1661600 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/mimetype-filter/ src/plugin/mimetype-filter/sample/ src/plugin/mimetype-filter/src/ src/plugin/mimetype-filter/src/java/ src/plu

2015-02-22 Thread jorgelbg
Author: jorgelbg
Date: Mon Feb 23 02:53:24 2015
New Revision: 1661600

URL: http://svn.apache.org/r1661600
Log:
NUTCH-1928 Indexing filter of documents by the MIME type


Added:
nutch/trunk/src/plugin/mimetype-filter/
nutch/trunk/src/plugin/mimetype-filter/build.xml
nutch/trunk/src/plugin/mimetype-filter/ivy.xml
nutch/trunk/src/plugin/mimetype-filter/plugin.xml
nutch/trunk/src/plugin/mimetype-filter/sample/
nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt
nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt
nutch/trunk/src/plugin/mimetype-filter/src/
nutch/trunk/src/plugin/mimetype-filter/src/java/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/
nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/

nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/

nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
nutch/trunk/src/plugin/mimetype-filter/src/test/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/
nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/

nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/

nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
Modified:
nutch/trunk/build.xml
nutch/trunk/conf/nutch-default.xml
nutch/trunk/default.properties
nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Mon Feb 23 02:53:24 2015
@@ -178,6 +178,7 @@
   
   
   
+  
   
   
   
@@ -584,6 +585,7 @@
   
   
   
+  
   
   
   
@@ -969,6 +971,8 @@
 
 
 
+
+
 
 
 

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Mon Feb 23 02:53:24 2015
@@ -1602,4 +1602,15 @@
   Whether to support multivalued headings.
 
 
+
+
+<property>
+  <name>mimetype.filter.file</name>
+  <value>mimetype-filter.txt</value>
+  <description>
+The configuration file for the mimetype-filter plugin. This file contains
+the rules used to allow or deny the indexing of certain documents.
+  </description>
+</property>
+
 

Modified: nutch/trunk/default.properties
URL: 
http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1661600&r1=1661599&r2=1661600&view=diff
==
--- nutch/trunk/default.properties (original)
+++ nutch/trunk/default.properties Mon Feb 23 02:53:24 2015
@@ -148,6 +148,7 @@ plugins.index=\
org.apache.nutch.indexer.basic*:\
org.apache.nutch.indexer.feed*:\
org.apache.nutch.indexer.geoip*:\
+   org.apache.nutch.indexer.filter*:\
org.apache.nutch.indexer.metadata*:\
org.apache.nutch.indexer.more*:\
org.apache.nutch.indexer.static*:\

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff
==
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Mon Feb 23 02:53:24 2015
@@ -35,6 +35,7 @@
  
  
  
+ 
  
  
  
@@ -88,6 +89,7 @@
  
  
  
+ 
  
  
  
@@ -126,10 +128,11 @@
 
 
 
- 
+
 
 
 
+
 
 
 

Added: nutch/trunk/src/plugin/mimetype-filter/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/build.xml?rev=1661600&view=auto
==
--- nutch/trunk/src/plugin/mimetype-filter/build.xml (added)
+++ nutch/trunk/src/plugin/mimetype-filter/build.xml Mon Feb 23 02:53:24 2015
@@ -0,0 +1,28 @@
+
+
+
+
+
+
+
+
+
+
+
+
+

Added: nutch/trunk/src/plugin/mimetype-filter/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/ivy.xml?rev=1661600&view=auto
==
--- nutch/trunk/src/plugin/mimetype-filter/ivy.xml (added)
+++ nutch/trunk/src/plugin/mimetype