Author: lewismc Date: Wed May 27 23:28:26 2015 New Revision: 1682136 URL: http://svn.apache.org/r1682136 Log: NUTCH-208 http: proxy exception list:
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1682136&r1=1682135&r2=1682136&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed May 27 23:28:26 2015 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development 1.11-SNAPSHOT +* NUTCH-208 http: proxy exception list: (Matthias Günter, siren, markus, lewismc) + * NUTCH-2007 add test libs to classpath of bin/nutch junit (snagel) * NUTCH-1995 Add support for wildcard to http.robot.rules.whitelist (totaro) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1682136&r1=1682135&r2=1682136&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Wed May 27 23:28:26 2015 @@ -278,6 +278,13 @@ </property> <property> + <name>http.proxy.exception.list</name> + <value></value> + <description>A comma separated list of URL's and hosts that don't use the proxy + (e.g. intranets). Example: www.apache.org</description> +</property> + +<property> <name>http.verbose</name> <value>false</value> <description>If true, HTTP will log more verbosely.</description> Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1682136&r1=1682135&r2=1682136&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Wed May 27 23:28:26 2015 @@ -21,6 +21,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.Reader; import java.net.URL; +import java.util.*; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; @@ -67,6 +68,9 @@ public abstract class HttpBase implement /** The proxy port. */ protected int proxyPort = 8080; + + /** The proxy exception list. */ + protected HashMap proxyException = new HashMap(); /** Indicates if a proxy is used */ protected boolean useProxy = false; @@ -135,6 +139,7 @@ public abstract class HttpBase implement this.conf = conf; this.proxyHost = conf.get("http.proxy.host"); this.proxyPort = conf.getInt("http.proxy.port", 8080); + this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list")); this.useProxy = (proxyHost != null && proxyHost.length() > 0); this.timeout = conf.getInt("http.timeout", 10000); this.maxContent = conf.getInt("http.content.limit", 64 * 1024); @@ -340,7 +345,12 @@ public abstract class HttpBase implement return proxyPort; } - public boolean useProxy() { + public boolean useProxy(URL url) { + if (!useProxy){ + return false; + } else if (proxyException.get(url.getHost())!=null){ + return false; + } return useProxy; } @@ -434,6 +444,7 @@ public abstract class HttpBase implement if (logger.isInfoEnabled()) { logger.info("http.proxy.host = " + proxyHost); logger.info("http.proxy.port = " + proxyPort); + logger.info("http.proxy.exception.list = " + useProxy); logger.info("http.timeout = " + timeout); logger.info("http.content.limit = " + maxContent); logger.info("http.agent = " + userAgent); @@ -547,4 +558,22 @@ public abstract class HttpBase implement public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) { return robots.getRobotRulesSet(this, url); } + + /** + * Transforming a String[] into a HashMap for faster searching + * @param input String[] + * @return a new HashMap + */ + private HashMap arrayToMap(String[]input){ + if (input==null ||input.length==0) { + return new HashMap(); + } + HashMap hm=new HashMap(); + for (int i=0;i<input.length;i++){ + if (!"".equals(input[i].trim())){ + hm.put(input[i],input[i]); + } + } + return hm; + } } Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1682136&r1=1682135&r2=1682136&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Wed May 27 23:28:26 2015 @@ -117,8 +117,8 @@ public class HttpResponse implements Res socket.setSoTimeout(http.getTimeout()); // connect - String sockHost = http.useProxy() ? http.getProxyHost() : host; - int sockPort = http.useProxy() ? http.getProxyPort() : port; + String sockHost = http.useProxy(url) ? http.getProxyHost() : host; + int sockPort = http.useProxy(url) ? http.getProxyPort() : port; InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); @@ -158,7 +158,7 @@ public class HttpResponse implements Res OutputStream req = socket.getOutputStream(); StringBuffer reqStr = new StringBuffer("GET "); - if (http.useProxy()) { + if (http.useProxy(url)) { reqStr.append(url.getProtocol() + "://" + host + portString + path); } else { reqStr.append(path); @@ -329,7 +329,6 @@ public class HttpResponse implements Res * @throws HttpException * @throws IOException */ - @SuppressWarnings("unused") private void readChunkedContent(PushbackInputStream in, StringBuffer line) throws HttpException, IOException { boolean doneChunks = false; Modified: nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java?rev=1682136&r1=1682135&r2=1682136&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java Wed May 27 23:28:26 2015 @@ -90,8 +90,8 @@ public class HttpResponse implements Res socket.setSoTimeout(http.getTimeout()); // connect - String sockHost = http.useProxy() ? http.getProxyHost() : host; - int sockPort = http.useProxy() ? http.getProxyPort() : port; + String sockHost = http.useProxy(url) ? http.getProxyHost() : host; + int sockPort = http.useProxy(url) ? http.getProxyPort() : port; InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); @@ -99,7 +99,7 @@ public class HttpResponse implements Res OutputStream req = socket.getOutputStream(); StringBuffer reqStr = new StringBuffer("GET "); - if (http.useProxy()) { + if (http.useProxy(url)) { reqStr.append(url.getProtocol() + "://" + host + portString + path); } else { reqStr.append(path);