Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Fri Jan 9 06:34:33 2015 @@ -44,7 +44,7 @@ import org.apache.nutch.util.MimeUtil; import crawlercommons.robots.BaseRobotRules; public abstract class HttpBase implements Protocol { - + private final static Utf8 RESPONSE_TIME = new Utf8("_rs_"); public static final int BUFFER_SIZE = 8 * 1024; @@ -69,15 +69,12 @@ public abstract class HttpBase implement protected int maxContent = 64 * 1024; /** The Nutch 'User-Agent' request header */ - protected String userAgent = getAgentString( - "NutchCVS", null, "Nutch", - "http://nutch.apache.org/bot.html", - "agent@nutch.apache.org"); - + protected String userAgent = getAgentString("NutchCVS", null, "Nutch", + "http://nutch.apache.org/bot.html", "agent@nutch.apache.org"); /** The "Accept-Language" request header value. */ protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3"; - + /** The "Accept" request header value. 
*/ protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; @@ -97,13 +94,13 @@ public abstract class HttpBase implement /** Response Time */ protected boolean responseTime = true; - + /** Which TLS/SSL protocols to support */ protected Set<String> tlsPreferredProtocols; - + /** Which TLS/SSL cipher suites to support */ protected Set<String> tlsPreferredCipherSuites; - + /** Creates a new instance of HttpBase */ public HttpBase() { this(null); @@ -125,37 +122,62 @@ public abstract class HttpBase implement this.useProxy = (proxyHost != null && proxyHost.length() > 0); this.timeout = conf.getInt("http.timeout", 10000); this.maxContent = conf.getInt("http.content.limit", 64 * 1024); - this.userAgent = getAgentString(conf.get("http.agent.name"), conf.get("http.agent.version"), conf - .get("http.agent.description"), conf.get("http.agent.url"), conf.get("http.agent.email")); + this.userAgent = getAgentString(conf.get("http.agent.name"), + conf.get("http.agent.version"), conf.get("http.agent.description"), + conf.get("http.agent.url"), conf.get("http.agent.email")); this.acceptLanguage = conf.get("http.accept.language", acceptLanguage); this.accept = conf.get("http.accept", accept); this.mimeTypes = new MimeUtil(conf); this.useHttp11 = conf.getBoolean("http.useHttp11", false); this.responseTime = conf.getBoolean("http.store.responsetime", true); this.robots.setConf(conf); - - String[] protocols = conf.getStrings("http.tls.supported.protocols", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); - String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", - "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", - "TLS_RSA_WITH_AES_256_CBC_SHA256","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384","TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", - "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256","TLS_DHE_DSS_WITH_AES_256_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", - 
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA","TLS_RSA_WITH_AES_256_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", - "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_RSA_WITH_AES_256_CBC_SHA","TLS_DHE_DSS_WITH_AES_256_CBC_SHA", - "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256","TLS_RSA_WITH_AES_128_CBC_SHA256", - "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256","TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", - "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256","TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", - "TLS_RSA_WITH_AES_128_CBC_SHA","TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA","TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", - "TLS_DHE_RSA_WITH_AES_128_CBC_SHA","TLS_DHE_DSS_WITH_AES_128_CBC_SHA","TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", - "TLS_ECDHE_RSA_WITH_RC4_128_SHA","SSL_RSA_WITH_RC4_128_SHA","TLS_ECDH_ECDSA_WITH_RC4_128_SHA", - "TLS_ECDH_RSA_WITH_RC4_128_SHA","TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", - "SSL_RSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA","TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", - "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA","SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA","SSL_RSA_WITH_RC4_128_MD5", - "TLS_EMPTY_RENEGOTIATION_INFO_SCSV","TLS_RSA_WITH_NULL_SHA256","TLS_ECDHE_ECDSA_WITH_NULL_SHA", - "TLS_ECDHE_RSA_WITH_NULL_SHA","SSL_RSA_WITH_NULL_SHA","TLS_ECDH_ECDSA_WITH_NULL_SHA","TLS_ECDH_RSA_WITH_NULL_SHA", - "SSL_RSA_WITH_NULL_MD5","SSL_RSA_WITH_DES_CBC_SHA","SSL_DHE_RSA_WITH_DES_CBC_SHA","SSL_DHE_DSS_WITH_DES_CBC_SHA", - "TLS_KRB5_WITH_RC4_128_SHA","TLS_KRB5_WITH_RC4_128_MD5","TLS_KRB5_WITH_3DES_EDE_CBC_SHA","TLS_KRB5_WITH_3DES_EDE_CBC_MD5", - "TLS_KRB5_WITH_DES_CBC_SHA","TLS_KRB5_WITH_DES_CBC_MD5"); - + + String[] protocols = conf.getStrings("http.tls.supported.protocols", + "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3"); + String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384", + 
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384", + "TLS_RSA_WITH_AES_256_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_256_CBC_SHA256", + "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_RSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256", + "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA256", + "TLS_DHE_DSS_WITH_AES_128_CBC_SHA256", + "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA", + "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA", + "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", + "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", + "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA", + "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", + "TLS_ECDH_RSA_WITH_RC4_128_SHA", + "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA", + "TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA", + "SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5", + "TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256", + "TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA", + "SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA", + "TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5", + "SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA", + "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA", + "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA", + 
"TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA", + "TLS_KRB5_WITH_DES_CBC_MD5"); + tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols)); tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers)); @@ -171,81 +193,90 @@ public abstract class HttpBase implement try { URL u = new URL(url); - + long startTime = System.currentTimeMillis(); Response response = getResponse(u, page, false); // make a request - int elapsedTime =(int) (System.currentTimeMillis() - startTime); - - if(this.responseTime) { - page.getMetadata().put(RESPONSE_TIME, ByteBuffer.wrap(Bytes.toBytes(elapsedTime))); + int elapsedTime = (int) (System.currentTimeMillis() - startTime); + + if (this.responseTime) { + page.getMetadata().put(RESPONSE_TIME, + ByteBuffer.wrap(Bytes.toBytes(elapsedTime))); } - + int code = response.getCode(); byte[] content = response.getContent(); Content c = new Content(u.toString(), u.toString(), (content == null ? EMPTY_CONTENT : content), - response.getHeader("Content-Type"), - response.getHeaders(), mimeTypes); + response.getHeader("Content-Type"), response.getHeaders(), mimeTypes); if (code == 200) { // got a good response return new ProtocolOutput(c); // return it } else if (code >= 300 && code < 400) { // handle redirect String location = response.getHeader("Location"); // some broken servers, such as MS IIS, use lowercase header name... 
- if (location == null) location = response.getHeader("location"); - if (location == null) location = ""; + if (location == null) + location = response.getHeader("location"); + if (location == null) + location = ""; u = new URL(u, location); int protocolStatusCode; switch (code) { - case 300: // multiple choices, preferred value in Location + case 300: // multiple choices, preferred value in Location protocolStatusCode = ProtocolStatusCodes.MOVED; break; - case 301: // moved permanently - case 305: // use proxy (Location is URL of proxy) + case 301: // moved permanently + case 305: // use proxy (Location is URL of proxy) protocolStatusCode = ProtocolStatusCodes.MOVED; break; - case 302: // found (temporarily moved) - case 303: // see other (redirect after POST) - case 307: // temporary redirect + case 302: // found (temporarily moved) + case 303: // see other (redirect after POST) + case 307: // temporary redirect protocolStatusCode = ProtocolStatusUtils.TEMP_MOVED; break; - case 304: // not modified + case 304: // not modified protocolStatusCode = ProtocolStatusUtils.NOTMODIFIED; break; default: protocolStatusCode = ProtocolStatusUtils.MOVED; } // handle this in the higher layer. - return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(protocolStatusCode, u)); + return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus( + protocolStatusCode, u)); } else if (code == 400) { // bad request, mark as GONE - if (logger.isTraceEnabled()) { logger.trace("400 Bad request: " + u); } - return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, u)); - } else if (code == 401) { // requires authorization, but no valid auth provided. 
- if (logger.isTraceEnabled()) { logger.trace("401 Authentication Required"); } - return new ProtocolOutput(c, - ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.ACCESS_DENIED, - "Authentication required: "+ url)); + if (logger.isTraceEnabled()) { + logger.trace("400 Bad request: " + u); + } + return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus( + ProtocolStatusCodes.GONE, u)); + } else if (code == 401) { // requires authorization, but no valid auth + // provided. + if (logger.isTraceEnabled()) { + logger.trace("401 Authentication Required"); + } + return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus( + ProtocolStatusCodes.ACCESS_DENIED, "Authentication required: " + + url)); } else if (code == 404) { - return new ProtocolOutput(c, - ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.NOTFOUND, u)); + return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus( + ProtocolStatusCodes.NOTFOUND, u)); } else if (code == 410) { // permanently GONE - return new ProtocolOutput(c, - ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " + code + " url=" + u)); + return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus( + ProtocolStatusCodes.GONE, "Http: " + code + " url=" + u)); } else { - return new ProtocolOutput(c, - ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url=" - + u)); + return new ProtocolOutput(c, ProtocolStatusUtils.makeStatus( + ProtocolStatusCodes.EXCEPTION, "Http code=" + code + ", url=" + u)); } } catch (Throwable e) { logger.error("Failed with the following error: ", e); - return new ProtocolOutput(null, - ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.EXCEPTION, e.toString())); + return new ProtocolOutput(null, ProtocolStatusUtils.makeStatus( + ProtocolStatusCodes.EXCEPTION, e.toString())); } } - /* -------------------------- * - * </implementation:Protocol> * - * -------------------------- */ + /* + * -------------------------- * </implementation:Protocol> * + * 
-------------------------- + */ public String getProxyHost() { return proxyHost; } @@ -269,58 +300,57 @@ public abstract class HttpBase implement public String getUserAgent() { return userAgent; } - - /** Value of "Accept-Language" request header sent by Nutch. + + /** + * Value of "Accept-Language" request header sent by Nutch. + * * @return The value of the header "Accept-Language" header. */ public String getAcceptLanguage() { - return acceptLanguage; + return acceptLanguage; } public String getAccept() { - return accept; + return accept; } public boolean getUseHttp11() { return useHttp11; } - + public Set<String> getTlsPreferredCipherSuites() { return tlsPreferredCipherSuites; } - + public Set<String> getTlsPreferredProtocols() { return tlsPreferredProtocols; } - private static String getAgentString(String agentName, - String agentVersion, - String agentDesc, - String agentURL, - String agentEmail) { + private static String getAgentString(String agentName, String agentVersion, + String agentDesc, String agentURL, String agentEmail) { - if ( (agentName == null) || (agentName.trim().length() == 0) ) { + if ((agentName == null) || (agentName.trim().length() == 0)) { // TODO : NUTCH-258 if (LOGGER.isErrorEnabled()) { LOGGER.error("No User-Agent string set (http.agent.name)!"); } } - StringBuffer buf= new StringBuffer(); + StringBuffer buf = new StringBuffer(); buf.append(agentName); if (agentVersion != null) { buf.append("/"); buf.append(agentVersion); } - if ( ((agentDesc != null) && (agentDesc.length() != 0)) + if (((agentDesc != null) && (agentDesc.length() != 0)) || ((agentEmail != null) && (agentEmail.length() != 0)) - || ((agentURL != null) && (agentURL.length() != 0)) ) { + || ((agentURL != null) && (agentURL.length() != 0))) { buf.append(" ("); if ((agentDesc != null) && (agentDesc.length() != 0)) { buf.append(agentDesc); - if ( (agentURL != null) || (agentEmail != null) ) + if ((agentURL != null) || (agentEmail != null)) buf.append("; "); } @@ -350,9 
+380,12 @@ public abstract class HttpBase implement } } - public byte[] processGzipEncoded(byte[] compressed, URL url) throws IOException { + public byte[] processGzipEncoded(byte[] compressed, URL url) + throws IOException { - if (LOGGER.isTraceEnabled()) { LOGGER.trace("uncompressing...."); } + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("uncompressing...."); + } byte[] content; if (getMaxContent() >= 0) { @@ -366,25 +399,29 @@ public abstract class HttpBase implement if (LOGGER.isTraceEnabled()) { LOGGER.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " - + content.length + " bytes) from " + url); + + " bytes of compressed content (expanded to " + content.length + + " bytes) from " + url); } return content; } - public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException { + public byte[] processDeflateEncoded(byte[] compressed, URL url) + throws IOException { - if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); } + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("inflating...."); + } - byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent()); + byte[] content = DeflateUtils + .inflateBestEffort(compressed, getMaxContent()); if (content == null) throw new IOException("inflateBestEffort returned null"); if (LOGGER.isTraceEnabled()) { LOGGER.trace("fetched " + compressed.length - + " bytes of compressed content (expanded to " - + content.length + " bytes) from " + url); + + " bytes of compressed content (expanded to " + content.length + + " bytes) from " + url); } return content; } @@ -409,27 +446,28 @@ public abstract class HttpBase implement } else if (i != args.length - 1) { System.err.println(usage); System.exit(-1); - } else // root is required parameter + } else + // root is required parameter url = args[i]; } - ProtocolOutput out = http.getProtocolOutput(url, WebPage.newBuilder().build()); + ProtocolOutput out = http.getProtocolOutput(url, WebPage.newBuilder() + 
.build()); Content content = out.getContent(); System.out.println("Status: " + out.getStatus()); if (content != null) { System.out.println("Content Type: " + content.getContentType()); - System.out.println("Content Length: " + - content.getMetadata().get(Response.CONTENT_LENGTH)); + System.out.println("Content Length: " + + content.getMetadata().get(Response.CONTENT_LENGTH)); System.out.println("Content:"); String text = new String(content.getContent()); System.out.println(text); } } - protected abstract Response getResponse(URL url, - WebPage page, boolean followRedirects) - throws ProtocolException, IOException; + protected abstract Response getResponse(URL url, WebPage page, + boolean followRedirects) throws ProtocolException, IOException; @Override public BaseRobotRules getRobotRules(String url, WebPage page) {
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java Fri Jan 9 06:34:33 2015 @@ -19,7 +19,6 @@ package org.apache.nutch.protocol.http.a // Nutch imports import org.apache.nutch.protocol.ProtocolException; - public class HttpException extends ProtocolException { public HttpException() { Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java Fri Jan 9 06:34:33 2015 @@ -30,16 +30,18 @@ import org.slf4j.LoggerFactory; import java.net.URL; /** - * This class is used for parsing robots for urls belonging to HTTP protocol. - * It extends the generic {@link RobotRulesParser} class and contains - * Http protocol specific implementation for obtaining the robots file. + * This class is used for parsing robots for urls belonging to HTTP protocol. It + * extends the generic {@link RobotRulesParser} class and contains Http protocol + * specific implementation for obtaining the robots file. 
*/ public class HttpRobotRulesParser extends RobotRulesParser { - - public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class); + + public static final Logger LOG = LoggerFactory + .getLogger(HttpRobotRulesParser.class); protected boolean allowForbidden = false; - HttpRobotRulesParser() { } + HttpRobotRulesParser() { + } public HttpRobotRulesParser(Configuration conf) { super(conf); @@ -48,14 +50,17 @@ public class HttpRobotRulesParser extend /** Compose unique key to store and access robot rules in cache for given URL */ protected static String getCacheKey(URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower case - String host = url.getHost().toLowerCase(); // normalize to lower case + String protocol = url.getProtocol().toLowerCase(); // normalize to lower + // case + String host = url.getHost().toLowerCase(); // normalize to lower case int port = url.getPort(); if (port == -1) { port = url.getDefaultPort(); } - /* Robot rules apply only to host, protocol, and port where robots.txt is - * hosted (cf. NUTCH-1752). Consequently */ + /* + * Robot rules apply only to host, protocol, and port where robots.txt is + * hosted (cf. NUTCH-1752). 
Consequently + */ String cacheKey = protocol + ":" + host + ":" + port; return cacheKey; } @@ -71,7 +76,7 @@ public class HttpRobotRulesParser extend * The {@link Protocol} object * @param url * URL robots.txt applies to - * + * * @return {@link BaseRobotRules} holding the rules from robots.txt */ public BaseRobotRules getRobotRulesSet(Protocol http, URL url) { @@ -80,13 +85,15 @@ public class HttpRobotRulesParser extend BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(cacheKey); boolean cacheRule = true; - - if (robotRules == null) { // cache miss + + if (robotRules == null) { // cache miss URL redir = null; - if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); } + if (LOG.isTraceEnabled()) { + LOG.trace("cache miss " + url); + } try { - Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"), - WebPage.newBuilder().build(), true); + Response response = ((HttpBase) http).getResponse(new URL(url, + "/robots.txt"), WebPage.newBuilder().build(), true); // try one level of redirection ? 
if (response.getCode() == 301 || response.getCode() == 302) { String redirection = response.getHeader("Location"); @@ -101,23 +108,23 @@ public class HttpRobotRulesParser extend } else { redir = new URL(redirection); } - - response = ((HttpBase)http).getResponse(redir, WebPage.newBuilder().build(), true); + + response = ((HttpBase) http).getResponse(redir, WebPage + .newBuilder().build(), true); } } - if (response.getCode() == 200) // found rules: parse them - robotRules = parseRules(url.toString(), response.getContent(), - response.getHeader("Content-Type"), - agentNames); + if (response.getCode() == 200) // found rules: parse them + robotRules = parseRules(url.toString(), response.getContent(), + response.getHeader("Content-Type"), agentNames); - else if ( (response.getCode() == 403) && (!allowForbidden) ) - robotRules = FORBID_ALL_RULES; // use forbid all + else if ((response.getCode() == 403) && (!allowForbidden)) + robotRules = FORBID_ALL_RULES; // use forbid all else if (response.getCode() >= 500) { cacheRule = false; robotRules = EMPTY_RULES; - }else - robotRules = EMPTY_RULES; // use default rules + } else + robotRules = EMPTY_RULES; // use default rules } catch (Throwable t) { if (LOG.isInfoEnabled()) { LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); @@ -127,7 +134,7 @@ public class HttpRobotRulesParser extend } if (cacheRule) { - CACHE.put(cacheKey, robotRules); // cache rules for host + CACHE.put(cacheKey, robotRules); // cache rules for host if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) { // cache also for the redirected host CACHE.put(getCacheKey(redir), robotRules); Modified: nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff 
============================================================================== --- nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original) +++ nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Fri Jan 9 06:34:33 2015 @@ -25,10 +25,10 @@ import crawlercommons.robots.BaseRobotRu import static org.junit.Assert.*; /** - * JUnit test case which tests - * 1. that robots filtering is performed correctly as per the agent name - * 2. that crawl delay is extracted correctly from the robots file - * + * JUnit test case which tests 1. that robots filtering is performed correctly + * as per the agent name 2. that crawl delay is extracted correctly from the + * robots file + * */ public class TestRobotRulesParser { @@ -37,40 +37,33 @@ public class TestRobotRulesParser { private static final String MULTIPLE_AGENTS = "Agent2, Agent1"; private static final String UNKNOWN_AGENT = "AgentABC"; private static final String CR = "\r"; - - private static final String ROBOTS_STRING = - "User-Agent: Agent1 #foo" + CR - + "Disallow: /a" + CR - + "Disallow: /b/a" + CR - + "#Disallow: /c" + CR - + "Crawl-delay: 10" + CR // set crawl delay for Agent1 as 10 sec - + "" + CR - + "" + CR - + "User-Agent: Agent2" + CR - + "Disallow: /a/bloh" + CR - + "Disallow: /c" + CR - + "Disallow: /foo" + CR - + "Crawl-delay: 20" + CR - + "" + CR - + "User-Agent: *" + CR - + "Disallow: /foo/bar/" + CR; // no crawl delay for other agents - - private static final String[] TEST_PATHS = new String[] { - "http://example.com/a", - "http://example.com/a/bloh/foo.html", - "http://example.com/b", - "http://example.com/c", - "http://example.com/b/a/index.html", - "http://example.com/foo/bar/baz.html" - }; - private static final boolean[] RESULTS = new boolean[] { - false, // /a - false, // /a/bloh/foo.html - true, // /b - true, // /c - false, // /b/a/index.html - true // /foo/bar/baz.html + private 
static final String ROBOTS_STRING = "User-Agent: Agent1 #foo" + CR + + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + + CR + + "Crawl-delay: 10" + + CR // set crawl delay for Agent1 as 10 sec + + "" + CR + "" + CR + "User-Agent: Agent2" + CR + "Disallow: /a/bloh" + + CR + "Disallow: /c" + CR + "Disallow: /foo" + CR + "Crawl-delay: 20" + + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR; // no + // crawl + // delay + // for + // other + // agents + + private static final String[] TEST_PATHS = new String[] { + "http://example.com/a", "http://example.com/a/bloh/foo.html", + "http://example.com/b", "http://example.com/c", + "http://example.com/b/a/index.html", + "http://example.com/foo/bar/baz.html" }; + + private static final boolean[] RESULTS = new boolean[] { false, // /a + false, // /a/bloh/foo.html + true, // /b + true, // /c + false, // /b/a/index.html + true // /foo/bar/baz.html }; private HttpRobotRulesParser parser; @@ -82,41 +75,52 @@ public class TestRobotRulesParser { } /** - * Test that the robots rules are interpreted correctly by the robots rules parser. - */ + * Test that the robots rules are interpreted correctly by the robots rules + * parser. 
+ */ @Test public void testRobotsAgent() { - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT); + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, SINGLE_AGENT); - for(int counter = 0; counter < TEST_PATHS.length; counter++) { - assertTrue("testing on agent (" + SINGLE_AGENT + "), and " - + "path " + TEST_PATHS[counter] - + " got " + rules.isAllowed(TEST_PATHS[counter]), - rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); + for (int counter = 0; counter < TEST_PATHS.length; counter++) { + assertTrue( + "testing on agent (" + SINGLE_AGENT + "), and " + "path " + + TEST_PATHS[counter] + " got " + + rules.isAllowed(TEST_PATHS[counter]), + rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); } - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, MULTIPLE_AGENTS); + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, MULTIPLE_AGENTS); - for(int counter = 0; counter < TEST_PATHS.length; counter++) { - assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " - + "path " + TEST_PATHS[counter] - + " got " + rules.isAllowed(TEST_PATHS[counter]), - rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); + for (int counter = 0; counter < TEST_PATHS.length; counter++) { + assertTrue( + "testing on agents (" + MULTIPLE_AGENTS + "), and " + "path " + + TEST_PATHS[counter] + " got " + + rules.isAllowed(TEST_PATHS[counter]), + rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); } } /** - * Test that the crawl delay is extracted from the robots file for respective agent. - * If its not specified for a given agent, default value must be returned. - */ + * Test that the crawl delay is extracted from the robots file for respective + * agent. If its not specified for a given agent, default value must be + * returned. 
+ */ @Test public void testCrawlDelay() { - // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the parser - rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT); - assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay() == 10000)); - + // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be + // returned by the parser + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, SINGLE_AGENT); + assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ", + (rules.getCrawlDelay() == 10000)); + // for UNKNOWN_AGENT, the default crawl delay must be returned. - rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT); - assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay() == Long.MIN_VALUE)); + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + CONTENT_TYPE, UNKNOWN_AGENT); + assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ", + (rules.getCrawlDelay() == Long.MIN_VALUE)); } } Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java (original) +++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java Fri Jan 9 06:34:33 2015 @@ -16,11 +16,9 @@ */ package org.apache.nutch.urlfilter.api; - - /** * A generic regular expression rule. 
- * + * * @author Jérôme Charron */ public abstract class RegexRule { @@ -29,13 +27,15 @@ public abstract class RegexRule { /** * Constructs a new regular expression rule. - * - * @param sign specifies if this rule must filter-in or filter-out. - * A <code>true</code> value means that any url matching this rule - * must be accepted, a <code>false</code> value means that any url - * matching this rule must be rejected. - * @param regex is the regular expression used for matching (see - * {@link #match(String)} method). + * + * @param sign + * specifies if this rule must filter-in or filter-out. A + * <code>true</code> value means that any url matching this rule must + * be accepted, a <code>false</code> value means that any url + * matching this rule must be rejected. + * @param regex + * is the regular expression used for matching (see + * {@link #match(String)} method). */ protected RegexRule(boolean sign, String regex) { this.sign = sign; @@ -43,19 +43,22 @@ public abstract class RegexRule { /** * Return if this rule is used for filtering-in or out. - * + * * @return <code>true</code> if any url matching this rule must be accepted, * otherwise <code>false</code>. */ - protected boolean accept() { return sign; } - + protected boolean accept() { + return sign; + } + /** * Checks if a url matches this rule. - * @param url is the url to check. - * @return <code>true</code> if the specified url matches this rule, - * otherwise <code>false</code>. + * + * @param url + * is the url to check. + * @return <code>true</code> if the specified url matches this rule, otherwise + * <code>false</code>. 
*/ protected abstract boolean match(String url); } - Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java (original) +++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java Fri Jan 9 06:34:33 2015 @@ -37,27 +37,30 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.net.*; - /** - * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on - * regular expressions. - * - * <p>The regular expressions rules are expressed in a file. The file of rules - * is provided by each implementation using the - * {@link #getRulesFile(Configuration)} method.</p> + * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular + * expressions. + * + * <p> + * The regular expressions rules are expressed in a file. The file of rules is + * provided by each implementation using the + * {@link #getRulesFile(Configuration)} method. + * </p> * - * <p>The format of this file is made of many rules (one per line):<br/> + * <p> + * The format of this file is made of many rules (one per line):<br/> * <code> * [+-]<regex> * </code><br/> - * where plus (<code>+</code>)means go ahead and index it and minus - * (<code>-</code>)means no.</p> - + * where plus (<code>+</code>)means go ahead and index it and minus ( + * <code>-</code>)means no. 
+ * </p> */ public abstract class RegexURLFilterBase implements URLFilter { /** My logger */ - private final static Logger LOG = LoggerFactory.getLogger(RegexURLFilterBase.class); + private final static Logger LOG = LoggerFactory + .getLogger(RegexURLFilterBase.class); /** An array of applicable rules */ private List<RegexRule> rules; @@ -65,24 +68,28 @@ public abstract class RegexURLFilterBase /** The current configuration */ private Configuration conf; - /** * Constructs a new empty RegexURLFilterBase */ - public RegexURLFilterBase() { } + public RegexURLFilterBase() { + } /** * Constructs a new RegexURLFilter and init it with a file of rules. - * @param filename is the name of rules file. + * + * @param filename + * is the name of rules file. */ - public RegexURLFilterBase(File filename) - throws IOException, IllegalArgumentException { + public RegexURLFilterBase(File filename) throws IOException, + IllegalArgumentException { this(new FileReader(filename)); } - + /** * Constructs a new RegexURLFilter and inits it with a list of rules. - * @param rules string with a list of rules, one rule per line + * + * @param rules + * string with a list of rules, one rule per line * @throws IOException * @throws IllegalArgumentException */ @@ -93,68 +100,82 @@ public abstract class RegexURLFilterBase /** * Constructs a new RegexURLFilter and init it with a Reader of rules. - * @param reader is a reader of rules. + * + * @param reader + * is a reader of rules. */ - protected RegexURLFilterBase(Reader reader) - throws IOException, IllegalArgumentException { + protected RegexURLFilterBase(Reader reader) throws IOException, + IllegalArgumentException { rules = readRules(reader); } - + /** * Creates a new {@link RegexRule}. - * @param sign of the regular expression. - * A <code>true</code> value means that any URL matching this rule - * must be included, whereas a <code>false</code> - * value means that any URL matching this rule must be excluded. 
- * @param regex is the regular expression associated to this rule. + * + * @param sign + * of the regular expression. A <code>true</code> value means that + * any URL matching this rule must be included, whereas a + * <code>false</code> value means that any URL matching this rule + * must be excluded. + * @param regex + * is the regular expression associated to this rule. */ protected abstract RegexRule createRule(boolean sign, String regex); - + /** - * Returns the name of the file of rules to use for - * a particular implementation. - * @param conf is the current configuration. + * Returns the name of the file of rules to use for a particular + * implementation. + * + * @param conf + * is the current configuration. * @return the name of the resource containing the rules to use. */ - protected abstract Reader getRulesReader(Configuration conf) throws IOException; - - - /* -------------------------- * - * <implementation:URLFilter> * - * -------------------------- */ - + protected abstract Reader getRulesReader(Configuration conf) + throws IOException; + + /* + * -------------------------- * <implementation:URLFilter> * + * -------------------------- + */ + // Inherited Javadoc public String filter(String url) { for (RegexRule rule : rules) { if (rule.match(url)) { return rule.accept() ? 
url : null; } - }; + } + ; return null; } - /* --------------------------- * - * </implementation:URLFilter> * - * --------------------------- */ - - - /* ----------------------------- * - * <implementation:Configurable> * - * ----------------------------- */ - + /* + * --------------------------- * </implementation:URLFilter> * + * --------------------------- + */ + + /* + * ----------------------------- * <implementation:Configurable> * + * ----------------------------- + */ + public void setConf(Configuration conf) { this.conf = conf; Reader reader = null; try { reader = getRulesReader(conf); } catch (Exception e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } - throw new RuntimeException(e.getMessage(), e); + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + throw new RuntimeException(e.getMessage(), e); } try { rules = readRules(reader); } catch (IOException e) { - if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); } + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } throw new RuntimeException(e.getMessage(), e); } } @@ -162,45 +183,51 @@ public abstract class RegexURLFilterBase public Configuration getConf() { return this.conf; } - - /* ------------------------------ * - * </implementation:Configurable> * - * ------------------------------ */ - + + /* + * ------------------------------ * </implementation:Configurable> * + * ------------------------------ + */ /** * Read the specified file of rules. - * @param reader is a reader of regular expressions rules. + * + * @param reader + * is a reader of regular expressions rules. * @return the corresponding {@RegexRule rules}. 
*/ - private List<RegexRule> readRules(Reader reader) - throws IOException, IllegalArgumentException { + private List<RegexRule> readRules(Reader reader) throws IOException, + IllegalArgumentException { BufferedReader in = new BufferedReader(reader); List<RegexRule> rules = new ArrayList<RegexRule>(); String line; - - while((line=in.readLine())!=null) { + + while ((line = in.readLine()) != null) { if (line.length() == 0) { continue; } - char first=line.charAt(0); - boolean sign=false; + char first = line.charAt(0); + boolean sign = false; switch (first) { - case '+' : - sign=true; + case '+': + sign = true; break; - case '-' : - sign=false; + case '-': + sign = false; break; - case ' ' : case '\n' : case '#' : // skip blank & comment lines + case ' ': + case '\n': + case '#': // skip blank & comment lines continue; - default : - throw new IOException("Invalid first character: "+line); + default: + throw new IOException("Invalid first character: " + line); } String regex = line.substring(1); - if (LOG.isTraceEnabled()) { LOG.trace("Adding rule [" + regex + "]"); } + if (LOG.isTraceEnabled()) { + LOG.trace("Adding rule [" + regex + "]"); + } RegexRule rule = createRule(sign, regex); rules.add(rule); } @@ -209,18 +236,20 @@ public abstract class RegexURLFilterBase /** * Filter the standard input using a RegexURLFilterBase. - * @param filter is the RegexURLFilterBase to use for filtering the - * standard input. - * @param args some optional parameters (not used). + * + * @param filter + * is the RegexURLFilterBase to use for filtering the standard input. + * @param args + * some optional parameters (not used). 
*/ public static void main(RegexURLFilterBase filter, String args[]) - throws IOException, IllegalArgumentException { + throws IOException, IllegalArgumentException { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while((line=in.readLine())!=null) { + while ((line = in.readLine()) != null) { String out = filter.filter(line); - if (out!=null) { + if (out != null) { System.out.print("+"); System.out.println(out); } else { Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original) +++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Fri Jan 9 06:34:33 2015 @@ -42,52 +42,52 @@ import org.apache.nutch.net.URLFilter; * JUnit based test of class <code>RegexURLFilterBase</code>. 
*/ -//@RunWith(Suite.class) -//@Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class}) +// @RunWith(Suite.class) +// @Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class}) public abstract class RegexURLFilterBaseTest { - + /** My logger */ - protected static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class); + protected static final Logger LOG = LoggerFactory + .getLogger(RegexURLFilterBaseTest.class); - private final static String SEPARATOR = System.getProperty("file.separator"); + private final static String SEPARATOR = System.getProperty("file.separator"); private final static String SAMPLES = System.getProperty("test.data", "."); - + protected abstract URLFilter getURLFilter(Reader rules); protected void bench(int loops, String file) { try { - bench(loops, - new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"), + new FileReader(SAMPLES + SEPARATOR + file + ".urls")); } catch (Exception e) { fail(e.toString()); } } - + protected void bench(int loops, Reader rules, Reader urls) { long start = System.currentTimeMillis(); try { URLFilter filter = getURLFilter(rules); FilteredURL[] expected = readURLFile(urls); - for (int i=0; i<loops; i++) { + for (int i = 0; i < loops; i++) { test(filter, expected); } } catch (Exception e) { fail(e.toString()); } - LOG.info("bench time (" + loops + ") " + - (System.currentTimeMillis()-start) + "ms"); + LOG.info("bench time (" + loops + ") " + + (System.currentTimeMillis() - start) + "ms"); } - + protected void test(String file) { try { test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + new FileReader(SAMPLES + SEPARATOR + file + ".urls")); } catch (Exception e) { fail(e.toString()); } } - + protected void test(Reader rules, Reader urls) { try { test(getURLFilter(rules), 
readURLFile(urls)); @@ -95,9 +95,9 @@ public abstract class RegexURLFilterBase fail(e.toString()); } } - + protected void test(URLFilter filter, FilteredURL[] expected) { - for (int i=0; i<expected.length; i++) { + for (int i = 0; i < expected.length; i++) { String result = filter.filter(expected[i].url); if (result != null) { assertTrue(expected[i].url, expected[i].sign); @@ -106,37 +106,37 @@ public abstract class RegexURLFilterBase } } } - + private static FilteredURL[] readURLFile(Reader reader) throws IOException { BufferedReader in = new BufferedReader(reader); List<FilteredURL> list = new ArrayList<FilteredURL>(); String line; - while((line=in.readLine()) != null) { + while ((line = in.readLine()) != null) { if (line.length() != 0) { list.add(new FilteredURL(line)); } } return (FilteredURL[]) list.toArray(new FilteredURL[list.size()]); } - + private static class FilteredURL { - + boolean sign; String url; FilteredURL(String line) { switch (line.charAt(0)) { - case '+' : + case '+': sign = true; break; - case '-' : + case '-': sign = false; break; - default : + default: // Simply ignore... 
} url = line.substring(1); } } - + } Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -39,7 +39,7 @@ import org.apache.nutch.util.Bytes; * @author Jérôme Charron */ public class RelTagIndexingFilter implements IndexingFilter { - + private Configuration conf; private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); @@ -50,10 +50,9 @@ public class RelTagIndexingFilter implem } /** - * Gets all the fields for a given {@link WebPage} - * Many datastores need to setup the mapreduce job by specifying the fields - * needed. All extensions that work on WebPage are able to specify what fields - * they need. + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. */ @Override public Collection<Field> getFields() { @@ -73,24 +72,28 @@ public class RelTagIndexingFilter implem public Configuration getConf() { return this.conf; } - + /** * The {@link RelTagIndexingFilter} filter object. 
- * - * @param doc The {@link NutchDocument} object - * @param url URL to be filtered for rel-tag's - * @param page {@link WebPage} object relative to the URL + * + * @param doc + * The {@link NutchDocument} object + * @param url + * URL to be filtered for rel-tag's + * @param page + * {@link WebPage} object relative to the URL * @return filtered NutchDocument */ @Override - public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { - // Check if some Rel-Tags found, possibly put there by RelTagParser + public NutchDocument filter(NutchDocument doc, String url, WebPage page) + throws IndexingException { + // Check if some Rel-Tags found, possibly put there by RelTagParser ByteBuffer bb = page.getMetadata().get(new Utf8(RelTagParser.REL_TAG)); - + if (bb != null) { String[] tags = Bytes.toString(bb).split("\t"); for (int i = 0; i < tags.length; i++) { - doc.add("tag", tags[i]); + doc.add("tag", tags[i]); } } return doc; Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original) +++ nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Fri Jan 9 06:34:33 2015 @@ -74,26 +74,26 @@ public class RelTagParser implements Par if (node.getNodeType() == Node.ELEMENT_NODE) { // Look for <a> tag if ("a".equalsIgnoreCase(node.getNodeName())) { - NamedNodeMap attrs = node.getAttributes(); - Node hrefNode = attrs.getNamedItem("href"); - // Checks that it contains a href attribute - if (hrefNode != null) { - Node relNode = 
attrs.getNamedItem("rel"); - // Checks that it contains a rel attribute too - if (relNode != null) { - // Finaly checks that rel=tag - if ("tag".equalsIgnoreCase(relNode.getNodeValue())) { - String tag = parseTag(hrefNode.getNodeValue()); - if (!StringUtil.isEmpty(tag)) { - if(!tags.contains(tag)){ + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + // Checks that it contains a href attribute + if (hrefNode != null) { + Node relNode = attrs.getNamedItem("rel"); + // Checks that it contains a rel attribute too + if (relNode != null) { + // Finaly checks that rel=tag + if ("tag".equalsIgnoreCase(relNode.getNodeValue())) { + String tag = parseTag(hrefNode.getNodeValue()); + if (!StringUtil.isEmpty(tag)) { + if (!tags.contains(tag)) { tags.add(tag); - LOG.debug("Adding tag: " + tag + " to tag set."); + LOG.debug("Adding tag: " + tag + " to tag set."); } - } - } - } - } - } + } + } + } + } + } } // Recurse @@ -108,11 +108,13 @@ public class RelTagParser implements Par try { URL u = new URL(url); String path = u.getPath(); - tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8"); + tag = URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), + "UTF-8"); } catch (Exception e) { // Malformed tag... tag = null; - } return tag; + } + return tag; } } @@ -136,12 +138,11 @@ public class RelTagParser implements Par FIELDS.add(WebPage.Field.BASE_URL); FIELDS.add(WebPage.Field.METADATA); } - + /** - * Gets all the fields for a given {@link WebPage} - * Many datastores need to setup the mapreduce job by specifying the fields - * needed. All extensions that work on WebPage are able to specify what fields - * they need. + * Gets all the fields for a given {@link WebPage} Many datastores need to + * setup the mapreduce job by specifying the fields needed. All extensions + * that work on WebPage are able to specify what fields they need. 
*/ @Override public Collection<Field> getFields() { Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java (original) +++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java Fri Jan 9 06:34:33 2015 @@ -28,13 +28,13 @@ import java.nio.ByteBuffer; import static org.junit.Assert.*; /** - *JUnit test case for {@link RelTagIndexingFilter} which - *simply asserts that a 'tag' field is obtained by the filter. - * - *@author lewismc + * JUnit test case for {@link RelTagIndexingFilter} which simply asserts that a + * 'tag' field is obtained by the filter. 
+ * + * @author lewismc */ - public class TestRelTagIndexingFilter { +public class TestRelTagIndexingFilter { @Test public void testRelTagFields() throws Exception { @@ -57,4 +57,3 @@ import static org.junit.Assert.*; assertTrue("check for 'tag' field", doc.getFieldNames().contains("tag")); } } - \ No newline at end of file Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original) +++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Fri Jan 9 06:34:33 2015 @@ -38,15 +38,15 @@ import java.nio.ByteBuffer; import static org.junit.Assert.assertEquals; /** - * Junit test for {@link RelTagParser} based mainly John Xing's parser tests. - * We are not concerned with actual parse text within the sample file, instead - * we assert that the rel-tags we expect are found in the WebPage metadata. - * To check the parser is working as expected we unwrap the ByteBuffer obtained - * from metadata, the same type as we use in expected (String). So just the + * Junit test for {@link RelTagParser} based mainly John Xing's parser tests. We + * are not concerned with actual parse text within the sample file, instead we + * assert that the rel-tags we expect are found in the WebPage metadata. To + * check the parser is working as expected we unwrap the ByteBuffer obtained + * from metadata, the same type as we use in expected (String). So just the * other way around as we wrapped the metadata value. 
* * @author lewismc - * + * */ public class TestRelTagParser { @@ -58,14 +58,15 @@ public class TestRelTagParser { // Make sure sample files are copied to "test.data" as specified in // ./src/plugin/microformats-reltag/build.xml during plugin compilation. private String sampleFile = "microformats_reltag_test.html"; - + // rel-tag's we expect to be extracted from page.getMetadata() private String expectedRelTags = "Category:Specifications Category:rel-tag "; - + private Configuration conf; - + @Test - public void testRelTagParser() throws ParseException, ProtocolException, IOException { + public void testRelTagParser() throws ParseException, ProtocolException, + IOException { conf = NutchConfiguration.create(); conf.set("file.content.limit", "-1"); @SuppressWarnings("unused") @@ -85,14 +86,14 @@ public class TestRelTagParser { String mtype = mimeutil.getMimeType(file); page.setContentType(new Utf8(mtype)); parse = new ParseUtil(conf).parse(urlString, page); - //begin assertion for tests + // begin assertion for tests ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag")); byte[] byteArray = new byte[bbuf.remaining()]; bbuf.get(byteArray); String s = new String(byteArray); - //bbuf.flip(); - assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", - expectedRelTags, s); + // bbuf.flip(); + assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", + expectedRelTags, s); } - + } \ No newline at end of file