Author: jnioche Date: Fri May 16 13:40:21 2014 New Revision: 1595196 URL: http://svn.apache.org/r1595196 Log: NUTCH-1676 Add rudimentary SSL support to protocol-http
Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java nutch/branches/2.x/src/plugin/protocol-http/plugin.xml nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1595196&r1=1595195&r2=1595196&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri May 16 13:40:21 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1676 Add rudimentary SSL support to protocol-http (jnioche, markus) + * NUTCH-1674 Use batchId filter to enable scan (GORA-119) for Fetch,Parse,Update,Index (Tien Nguyen Manh and Alparslan Avcı via jnioche) * NUTCH-1714 Upgrade to Gora 0.4 (Alparslan Avcı via jnioche) Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1595196&r1=1595195&r2=1595196&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri May 16 13:40:21 2014 @@ -20,6 +20,9 @@ package org.apache.nutch.protocol.http.a import java.io.IOException; import java.net.URL; import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.avro.util.Utf8; @@ -95,6 +98,12 @@ public abstract class HttpBase implement /** Response Time */ protected boolean responseTime = true; + /** Which TLS/SSL protocols to support */ + protected Set<String> tlsPreferredProtocols; + + /** Which TLS/SSL cipher suites to support */ + protected Set<String> tlsPreferredCipherSuites; + /** Creates a new instance of HttpBase */ public HttpBase() { this(null); @@ -124,6 +133,32 @@ public abstract class HttpBase implement this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.responseTime = conf.getBoolean("http.store.responsetime", true); this.robots.setConf(conf); + + String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); + String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", + "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", + "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", + "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA", + "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5", + "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA", + "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA", + "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA", + "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5", + "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5"); + + tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); + tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); + logConf(); } @@ -249,6 +284,14 @@ public abstract class HttpBase implement public boolean getUseHttp11() { return useHttp11; } + + public Set<String> getTlsPreferredCipherSuites() { + return tlsPreferredCipherSuites; + } + + public Set<String> getTlsPreferredProtocols() { + return tlsPreferredProtocols; + } private static String getAgentString(String agentName, String agentVersion, Modified: nutch/branches/2.x/src/plugin/protocol-http/plugin.xml URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/plugin.xml?rev=1595196&r1=1595195&r2=1595196&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-http/plugin.xml (original) +++ nutch/branches/2.x/src/plugin/protocol-http/plugin.xml Fri May 16 13:40:21 2014 @@ -40,6 +40,10 @@ class="org.apache.nutch.protocol.http.Http"> <parameter name="protocolName" value="http"/> </implementation> + <implementation id="org.apache.nutch.protocol.http.Http" + class="org.apache.nutch.protocol.http.Http"> + <parameter name="protocolName" value="https"/> + </implementation> </extension> Modified: nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?rev=1595196&r1=1595195&r2=1595196&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original) +++ nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Fri May 16 13:40:21 2014 @@ -32,8 +32,13 @@ import org.apache.nutch.storage.WebPage; import java.io.*; import java.net.InetSocketAddress; import java.net.Socket; +import javax.net.ssl.SSLSocket; +import javax.net.ssl.SSLSocketFactory; import java.net.URL; import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; /** An HTTP response. */ public class HttpResponse implements Response { @@ -45,6 +50,10 @@ public class HttpResponse implements Res private int code; private final Metadata headers = new SpellCheckedMetadata(); + protected enum Scheme { + HTTP, + HTTPS, + } public HttpResponse(HttpBase http, URL url, WebPage page) throws ProtocolException, IOException { @@ -52,8 +61,15 @@ public class HttpResponse implements Res this.http = http; this.url = url; - if (!"http".equals(url.getProtocol())) - throw new HttpException("Not an HTTP url:" + url); + Scheme scheme = null; + + if ("http".equals(url.getProtocol())) { + scheme = Scheme.HTTP; + } else if ("https".equals(url.getProtocol())) { + scheme = Scheme.HTTPS; + } else { + throw new HttpException("Unknown scheme (not http/https) for url:" + url); + } if (Http.LOG.isTraceEnabled()) { Http.LOG.trace("fetching " + url); @@ -69,7 +85,11 @@ public class HttpResponse implements Res int port; String portString; if (url.getPort() == -1) { - port= 80; + if (scheme == Scheme.HTTP) { + port = 80; + } else { + port = 443; + } portString= ""; } else { port= url.getPort(); @@ -88,6 +108,26 @@ public class HttpResponse implements Res InetSocketAddress sockAddr= new InetSocketAddress(sockHost, sockPort); socket.connect(sockAddr, http.getTimeout()); + if (scheme == Scheme.HTTPS) { + SSLSocketFactory factory = (SSLSocketFactory)SSLSocketFactory.getDefault(); + SSLSocket sslsocket = (SSLSocket)factory.createSocket(socket, sockHost, sockPort, true); + sslsocket.setUseClientMode(true); + + // Get the protocols and ciphers supported by this JVM + Set<String> protocols = new HashSet<String>(Arrays.asList(sslsocket.getSupportedProtocols())); + Set<String> ciphers = new HashSet<String>(Arrays.asList(sslsocket.getSupportedCipherSuites())); + + // Intersect with preferred protocols and ciphers + protocols.retainAll(http.getTlsPreferredProtocols()); + ciphers.retainAll(http.getTlsPreferredCipherSuites()); + + sslsocket.setEnabledProtocols(protocols.toArray(new String[protocols.size()])); + sslsocket.setEnabledCipherSuites(ciphers.toArray(new String[ciphers.size()])); + + sslsocket.startHandshake(); + socket = sslsocket; + } + conf = http.getConf(); if (sockAddr != null && conf.getBoolean("store.ip.address", false) == true) {