Author: jnioche
Date: Fri May 16 13:32:35 2014
New Revision: 1595193

URL: http://svn.apache.org/r1595193
Log:
NUTCH-1676 Add rudimentary SSL support to protocol-http

Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/trunk/src/plugin/protocol-http/plugin.xml nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1595193&r1=1595192&r2=1595193&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri May 16 13:32:35 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus) + * NUTCH-1772 Injector does not need merging if no pre-existing crawldb (jnioche) * NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel) Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1595193&r1=1595192&r2=1595193&view=diff ============================================================================== --- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri May 16 13:32:35 2014 @@ -19,6 +19,9 @@ package org.apache.nutch.protocol.http.a // JDK imports import java.io.IOException; import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; // Logging imports import org.slf4j.Logger; @@ -43,9 +46,6 @@ import org.apache.hadoop.io.Text; // crawler-commons imports import crawlercommons.robots.BaseRobotRules; -/** - * @author Jérôme Charron - */ public abstract class HttpBase implements Protocol { public static final Text RESPONSE_TIME = new Text("_rs_"); @@ -103,6 +103,12 @@ public abstract class HttpBase 
implement /** Skip page if Crawl-Delay longer than this value. */ protected long maxCrawlDelay = -1L; + + /** Which TLS/SSL protocols to support */ + protected Set<String> tlsPreferredProtocols; + + /** Which TLS/SSL cipher suites to support */ + protected Set<String> tlsPreferredCipherSuites; /** Creates a new instance of HttpBase */ public HttpBase() { @@ -133,6 +139,32 @@ public abstract class HttpBase implement this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.responseTime = conf.getBoolean("http.store.responsetime", true); this.robots.setConf(conf); + + String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); + String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", + "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", + "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", + "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA", + 
"TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5", + "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA", + "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA", + "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA", + "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5", + "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5"); + + tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); + tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); + logConf(); } @@ -258,6 +290,14 @@ public abstract class HttpBase implement return useHttp11; } + public Set<String> getTlsPreferredCipherSuites() { + return tlsPreferredCipherSuites; + } + + public Set<String> getTlsPreferredProtocols() { + return tlsPreferredProtocols; + } + private static String getAgentString(String agentName, String agentVersion, String agentDesc, Modified: nutch/trunk/src/plugin/protocol-http/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/plugin.xml?rev=1595193&r1=1595192&r2=1595193&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-http/plugin.xml (original) +++ nutch/trunk/src/plugin/protocol-http/plugin.xml Fri May 16 13:32:35 2014 @@ -40,6 +40,11 @@ class="org.apache.nutch.protocol.http.Http"> <parameter name="protocolName" value="http"/> </implementation> + + <implementation id="org.apache.nutch.protocol.http.Http" + 
class="org.apache.nutch.protocol.http.Http"> + <parameter name="protocolName" value="https"/> + </implementation> </extension> Modified: nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1595193&r1=1595192&r2=1595193&view=diff ============================================================================== --- nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Fri May 16 13:32:35 2014 @@ -16,7 +16,6 @@ */ package org.apache.nutch.protocol.http; -// JDK imports import java.io.BufferedInputStream; import java.io.ByteArrayOutputStream; import java.io.EOFException; @@ -28,6 +27,13 @@ import java.net.InetSocketAddress; import java.net.Socket; import java.net.URL; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; + import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; @@ -50,6 +56,11 @@ public class HttpResponse implements Res private byte[] content; private int code; private Metadata headers = new SpellCheckedMetadata(); + + protected enum Scheme { + HTTP, + HTTPS, + } /** * Default public constructor. 
@@ -66,9 +77,16 @@ public class HttpResponse implements Res this.url = url; this.orig = url.toString(); this.base = url.toString(); - - if (!"http".equals(url.getProtocol())) - throw new HttpException("Not an HTTP url:" + url); + + Scheme scheme = null; + + if ("http".equals(url.getProtocol())) { + scheme = Scheme.HTTP; + } else if ("https".equals(url.getProtocol())) { + scheme = Scheme.HTTPS; + } else { + throw new HttpException("Unknown scheme (not http/https) for url:" + url); + } if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetching " + url); @@ -84,7 +102,11 @@ public class HttpResponse implements Res int port; String portString; if (url.getPort() == -1) { - port= 80; + if (scheme == Scheme.HTTP) { + port = 80; + } else { + port = 443; + } portString= ""; } else { port= url.getPort(); @@ -102,6 +124,26 @@ public class HttpResponse implements Res int sockPort = http.useProxy() ? http.getProxyPort() : port; InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); + + if (scheme == Scheme.HTTPS) { + SSLSocketFactory factory = (SSLSocketFactory)SSLSocketFactory.getDefault(); + SSLSocket sslsocket = (SSLSocket)factory.createSocket(socket, sockHost, sockPort, true); + sslsocket.setUseClientMode(true); + + // Get the protocols and ciphers supported by this JVM + Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket.getSupportedProtocols())); + Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket.getSupportedCipherSuites())); + + // Intersect with preferred protocols and ciphers + protocols.retainAll(http.getTlsPreferredProtocols()); + ciphers.retainAll(http.getTlsPreferredCipherSuites()); + + sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols.size()])); + sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers.size()])); + + sslsocket.startHandshake(); + socket = sslsocket; + } this.conf = http.getConf(); if (sockAddr != null