Author: dogacan Date: Fri Jul 13 05:25:45 2007 New Revision: 555969 URL: http://svn.apache.org/viewvc?view=rev&rev=555969 Log: NUTCH-505 - Second part. Optimize UrlValidator by using java.util.regex instead of jakarta-oro. Use an initial capacity for the ArrayLists in ParseOutputFormat. Run URL validation and filtering after the other link checks for better performance.
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java?view=diff&rev=555969&r1=555968&r2=555969 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java Fri Jul 13 05:25:45 2007 @@ -16,7 +16,8 @@ */ package org.apache.nutch.net; -import org.apache.oro.text.perl.Perl5Util; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * <p>Validates URLs.</p> @@ -64,9 +65,8 @@ /** * This expression derived/taken from the BNF for URI (RFC2396). */ - private static final String URL_PATTERN = - "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/"; - // 12 3 4 5 6 7 8 9 + private static final Pattern URL_PATTERN = + Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"); /** * Schema/Protocol (ie. http:, ftp:, file:, etc). @@ -85,11 +85,11 @@ /** * Protocol (ie. http:, ftp:,https:). 
*/ - private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/"; + private static final Pattern SCHEME_PATTERN = + Pattern.compile("^[" + SCHEME_CHARS + "]+"); - private static final String AUTHORITY_PATTERN = - "/^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/"; - // 1 2 3 4 + private static final Pattern AUTHORITY_PATTERN = + Pattern.compile("^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?"); private static final int PARSE_AUTHORITY_HOST_IP = 1; @@ -100,23 +100,28 @@ */ private static final int PARSE_AUTHORITY_EXTRA = 3; - private static final String PATH_PATTERN = "/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/"; + private static final Pattern PATH_PATTERN = + Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"); - private static final String QUERY_PATTERN = "/^(.*)$/"; + private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$"); - private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/"; + private static final Pattern LEGAL_ASCII_PATTERN = + Pattern.compile("^[\\x20-\\x7E]+$"); - private static final String IP_V4_DOMAIN_PATTERN = - "/^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$/"; + private static final Pattern IP_V4_DOMAIN_PATTERN = + Pattern.compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$"); - private static final String DOMAIN_PATTERN = - "/^" + ATOM + "(\\." + ATOM + ")*$/"; + private static final Pattern DOMAIN_PATTERN = + Pattern.compile("^" + ATOM + "(\\." 
+ ATOM + ")*$"); - private static final String PORT_PATTERN = "/^:(\\d{1,5})$/"; + private static final Pattern PORT_PATTERN = + Pattern.compile("^:(\\d{1,5})$"); - private static final String ATOM_PATTERN = "/(" + ATOM + ")/"; + private static final Pattern ATOM_PATTERN = + Pattern.compile("(" + ATOM + ")"); - private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/"; + private static final Pattern ALPHA_PATTERN = + Pattern.compile("^[" + ALPHA_CHARS + "]"); private static final UrlValidator VALIDATOR = new UrlValidator(); @@ -139,15 +144,13 @@ return false; } - Perl5Util matchUrlPat = new Perl5Util(); - Perl5Util matchAsciiPat = new Perl5Util(); - - if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) { + Matcher matchUrlPat = URL_PATTERN.matcher(value); + if (!LEGAL_ASCII_PATTERN.matcher(value).matches()) { return false; } // Check the whole url address structure - if (!matchUrlPat.match(URL_PATTERN, value)) { + if (!matchUrlPat.matches()) { return false; } @@ -183,12 +186,7 @@ return false; } - Perl5Util schemeMatcher = new Perl5Util(); - if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) { - return false; - } - - return true; + return SCHEME_PATTERN.matcher(scheme).matches(); } /** @@ -202,10 +200,8 @@ return false; } - Perl5Util authorityMatcher = new Perl5Util(); - Perl5Util matchIPV4Pat = new Perl5Util(); - - if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) { + Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority); + if (!authorityMatcher.matches()) { return false; } @@ -213,7 +209,8 @@ boolean hostname = false; // check if authority is IP address or hostname String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); - ipV4Address = matchIPV4Pat.match(IP_V4_DOMAIN_PATTERN, hostIP); + Matcher matchIPV4Pat = IP_V4_DOMAIN_PATTERN.matcher(hostIP); + ipV4Address = matchIPV4Pat.matches(); if (ipV4Address) { // this is an IP address so check components @@ -234,8 +231,7 @@ } } else { // Domain is hostname name - Perl5Util 
domainMatcher = new Perl5Util(); - hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP); + hostname = DOMAIN_PATTERN.matcher(hostIP).matches(); } // rightmost hostname will never start with a digit. @@ -250,20 +246,16 @@ } } String[] domainSegment = new String[size]; - boolean match = true; int segCount = 0; int segLen = 0; - Perl5Util atomMatcher = new Perl5Util(); + Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP); - while (match) { - match = atomMatcher.match(ATOM_PATTERN, hostIP); - if (match) { - domainSegment[segCount] = atomMatcher.group(1); - segLen = domainSegment[segCount].length() + 1; - hostIP = (segLen >= hostIP.length()) ? "" - : hostIP.substring(segLen); - segCount++; - } + while (atomMatcher.find()) { + domainSegment[segCount] = atomMatcher.group(); + segLen = domainSegment[segCount].length() + 1; + hostIP = (segLen >= hostIP.length()) ? "" + : hostIP.substring(segLen); + segCount++; } String topLevel = domainSegment[segCount - 1]; if (topLevel.length() < 2 || topLevel.length() > 4) { @@ -271,8 +263,7 @@ } // First letter of top level must be a alpha - Perl5Util alphaMatcher = new Perl5Util(); - if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) { + if (!ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()) { return false; } @@ -288,18 +279,13 @@ String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); if (port != null) { - Perl5Util portMatcher = new Perl5Util(); - if (!portMatcher.match(PORT_PATTERN, port)) { + if (!PORT_PATTERN.matcher(port).matches()) { return false; } } String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); - if (!isBlankOrNull(extra)) { - return false; - } - - return true; + return isBlankOrNull(extra); } /** @@ -323,23 +309,15 @@ return false; } - Perl5Util pathMatcher = new Perl5Util(); - - if (!pathMatcher.match(PATH_PATTERN, path)) { + if (!PATH_PATTERN.matcher(path).matches()) { return false; } int slash2Count = countToken("//", path); - int slashCount = countToken("/", path); int 
dot2Count = countToken("..", path); - if (dot2Count > 0) { - if ((slashCount - slash2Count - 1) <= dot2Count) { - return false; - } - } - - return true; + + return (dot2Count <= 0) || ((slashCount - slash2Count - 1) > dot2Count); } /** @@ -352,8 +330,7 @@ return true; } - Perl5Util queryMatcher = new Perl5Util(); - return queryMatcher.match(QUERY_PATTERN, query); + return QUERY_PATTERN.matcher(query).matches(); } /** Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=555969&r1=555968&r2=555969 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Fri Jul 13 05:25:45 2007 @@ -145,24 +145,10 @@ int validCount = 0; CrawlDatum adjust = null; - List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(); - List<Outlink> outlinkList = new ArrayList<Outlink>(); + List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore); + List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { String toUrl = links[i].getToUrl(); - if (!validator.isValid(toUrl)) { - continue; - } - try { - // normalizing here is not necessary since outlinks - // are already normalized in Outlink's constructor - toUrl = filters.filter(toUrl); // filter the url - if (toUrl == null) { - continue; - } - } catch (Exception e) { - continue; - } - // ignore links to self (or anchors within the page) if (fromUrl.equals(toUrl)) { continue; @@ -176,6 +162,19 @@ if (toHost == null || !toHost.equals(fromHost)) { // external links continue; // skip it } + } + if (!validator.isValid(toUrl)) { + continue; + } + try { + // normalizing 
here is not necessary since outlinks + // are already normalized in Outlink's constructor + toUrl = filters.filter(toUrl); // filter the url + if (toUrl == null) { + continue; + } + } catch (Exception e) { + continue; } CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); Text targetUrl = new Text(toUrl); ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs