Author: lewismc
Date: Wed May 27 23:28:26 2015
New Revision: 1682136

URL: http://svn.apache.org/r1682136
Log:
NUTCH-208 http: proxy exception list:

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
    nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May 27 23:28:26 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-208 http: proxy exception list: (Matthias Günter, siren, markus, lewismc)
+
 * NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel)
 
 * NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed May 27 23:28:26 2015
@@ -278,6 +278,13 @@
 </property>
 
 <property>
+  <name>http.proxy.exception.list</name>
+  <value></value>
+  <description>A comma separated list of URL's and hosts that don't use the proxy 
+  (e.g. intranets). Example: www.apache.org</description>
+</property>
+
+<property>
   <name>http.verbose</name>
   <value>false</value>
   <description>If true, HTTP will log more verbosely.</description>

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed May 27 23:28:26 2015
@@ -21,6 +21,7 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
 import java.net.URL;
+import java.util.*;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashSet;
@@ -67,6 +68,9 @@ public abstract class HttpBase implement
 
   /** The proxy port. */
   protected int proxyPort = 8080;
+  
+  /** The proxy exception list. */
+  protected HashMap proxyException = new HashMap(); 
 
   /** Indicates if a proxy is used */
   protected boolean useProxy = false;
@@ -135,6 +139,7 @@ public abstract class HttpBase implement
     this.conf = conf;
     this.proxyHost = conf.get("http.proxy.host");
     this.proxyPort = conf.getInt("http.proxy.port", 8080);
+    this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
     this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
@@ -340,7 +345,12 @@ public abstract class HttpBase implement
     return proxyPort;
   }
 
-  public boolean useProxy() {
+  public boolean useProxy(URL url) {
+    if (!useProxy){
+      return false;
+    } else if (proxyException.get(url.getHost())!=null){
+      return false;
+    }
     return useProxy;
   }
 
@@ -434,6 +444,7 @@ public abstract class HttpBase implement
     if (logger.isInfoEnabled()) {
       logger.info("http.proxy.host = " + proxyHost);
       logger.info("http.proxy.port = " + proxyPort);
+      logger.info("http.proxy.exception.list = " + useProxy);
       logger.info("http.timeout = " + timeout);
       logger.info("http.content.limit = " + maxContent);
       logger.info("http.agent = " + userAgent);
@@ -547,4 +558,22 @@ public abstract class HttpBase implement
   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
     return robots.getRobotRulesSet(this, url);
   }
+  
+  /**
+   * Transforming a String[] into a HashMap for faster searching
+   * @param input String[]
+   * @return a new HashMap
+   */
+  private HashMap arrayToMap(String[]input){
+    if (input==null ||input.length==0) {
+      return new HashMap();
+    }
+    HashMap hm=new HashMap();
+    for (int i=0;i<input.length;i++){
+      if (!"".equals(input[i].trim())){
+        hm.put(input[i],input[i]);
+      }
+    }
+    return hm;
+  }
 }

Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed May 27 23:28:26 2015
@@ -117,8 +117,8 @@ public class HttpResponse implements Res
       socket.setSoTimeout(http.getTimeout());
 
       // connect
-      String sockHost = http.useProxy() ? http.getProxyHost() : host;
-      int sockPort = http.useProxy() ? http.getProxyPort() : port;
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
       InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
 
@@ -158,7 +158,7 @@ public class HttpResponse implements Res
       OutputStream req = socket.getOutputStream();
 
       StringBuffer reqStr = new StringBuffer("GET ");
-      if (http.useProxy()) {
+      if (http.useProxy(url)) {
         reqStr.append(url.getProtocol() + "://" + host + portString + path);
       } else {
         reqStr.append(path);
@@ -329,7 +329,6 @@ public class HttpResponse implements Res
    * @throws HttpException
    * @throws IOException
    */
-  @SuppressWarnings("unused")
   private void readChunkedContent(PushbackInputStream in, StringBuffer line)
       throws HttpException, IOException {
     boolean doneChunks = false;

Modified: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1682136&r1=1682135&r2=1682136&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (original)
+++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Wed May 27 23:28:26 2015
@@ -90,8 +90,8 @@ public class HttpResponse implements Res
       socket.setSoTimeout(http.getTimeout());
 
       // connect
-      String sockHost = http.useProxy() ? http.getProxyHost() : host;
-      int sockPort = http.useProxy() ? http.getProxyPort() : port;
+      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
+      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
       InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
       socket.connect(sockAddr, http.getTimeout());
 
@@ -99,7 +99,7 @@ public class HttpResponse implements Res
       OutputStream req = socket.getOutputStream();
 
       StringBuffer reqStr = new StringBuffer("GET ");
-      if (http.useProxy()) {
+      if (http.useProxy(url)) {
         reqStr.append(url.getProtocol() + "://" + host + portString + path);
       } else {
         reqStr.append(path);


Reply via email to