Author: siren Date: Sun Mar 11 14:18:23 2007 New Revision: 517015 URL: http://svn.apache.org/viewvc?view=rev&rev=517015 Log: merging 517012:516728 excluding changes made by dennis
Added: lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar - copied unchanged from r516728, lucene/nutch/trunk/lib/commons-logging-api-1.0.4.jar lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar - copied unchanged from r516728, lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.LICENSE.txt lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/ontology/lib/commons-logging-1.0.3.jar lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html - copied unchanged from r516728, lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/package.html Removed: lucene/nutch/trunk/src/plugin/index-more/src/test/ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/package.html lucene/nutch/trunk/src/plugin/parse-js/src/test/ Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/build.xml lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sun Mar 11 14:18:23 2007 @@ -158,18 +158,11 @@ 53. 
NUTCH-384 - Protocol-file plugin does not allow the parse plugins framework to operate properly (Heiko Dietze via mattmann) -54. Change OutlinkExtractor to use Regular Expressions from JRE (siren) - -55. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan +54. NUTCH-233 - Wrong regular expression hangs reduce process forever (Stefan Groschupf via kubes) -56. NUTCH-436 - Incorrect handling of relative paths when the embedded URL - path is empty (kubes) - -57. Replace oro with jre regular expressions in plugins, remove oro from - dependencies (siren) - -58. Remove redundant commons logging jars (siren) +55. NUTCH-436 - Incorrect handling of relative paths when the embedded URL + path is empty (kubes) Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Sun Mar 11 14:18:23 2007 @@ -148,20 +148,8 @@ <jar jarfile="${build.dir}/${final.name}.job"> <zipfileset dir="${build.classes}"/> <zipfileset dir="${conf.dir}" excludes="*.template"/> - <zipfileset dir="${lib.dir}" prefix="lib" includes="**/*.jar"> - <exclude name="hadoop-*.jar"/> - <exclude name="servlet-*.jar"/> - <exclude name="junit*.jar"/> - <exclude name="jetty-*.jar"/> - <exclude name="pmd-ext/*"/> - <exclude name="jetty-ext/*"/> - <exclude name="jets3t.jar"/> - <exclude name="taglib*.jar"/> - <exclude name="commons-cli*.jar"/> - <exclude name="xerces-*.jar"/> - <exclude name="commons-logging-1.0.4.jar"/> - <exclude name="log4j-1.2.13.jar"/> - </zipfileset> + <zipfileset dir="${lib.dir}" prefix="lib" + includes="**/*.jar" excludes="hadoop-*.jar"/> <zipfileset dir="${build.plugins}" prefix="plugins"/> </jar> </target> Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Sun Mar 11 14:18:23 2007 @@ -1,4 +1,4 @@ -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,21 +14,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.nutch.parse; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; /** - * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from - * plain text using Regular Expressions. + * Extractor to extract {@link org.apache.nutch.parse.Outlink}s + * / URLs from plain text using Regular Expressions. 
* * @see <a * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison @@ -37,14 +44,12 @@ * </a> * * @author Stephan Strittmatter - http://www.sybit.de - * + * @version 1.0 * @since 0.7 */ public class OutlinkExtractor { private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class); - private static final Outlink[] NO_LINKS = new Outlink[0]; - /** * Regex pattern to get URLs within a plain text. * @@ -52,63 +57,190 @@ * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html * </a> */ - private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; - - static final Pattern urlPattern = Pattern.compile(URL_PATTERN); + private static final String URL_PATTERN = + "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; /** - * Extracts outlinks from a plain text. - * </p> - * @param plainText + * Extracts <code>Outlink</code> from given plain text. + * Applying this method to non-plain-text can result in extremely lengthy + * runtimes for parasitic cases (postscript is a known example). + * @param plainText the plain text from wich URLs should be extracted. * - * @return Array of <code>Outlink</code> s within found in plainText + * @return Array of <code>Outlink</code>s within found in plainText */ - public static Outlink[] getOutlinks(final String plainText, Configuration conf){ - return getOutlinks(plainText, null, conf); + public static Outlink[] getOutlinks(final String plainText, Configuration conf) { + return OutlinkExtractor.getOutlinks(plainText, "", conf); } - /** - * Extracts outlinks from a plain text. 
- * </p> - * @param plainText text to extract urls from + * Extracts <code>Outlink</code> from given plain text and adds anchor + * to the extracted <code>Outlink</code>s + * + * @param plainText the plain text from wich URLs should be extracted. + * @param anchor the anchor of the url * - * @return Array of <code>Outlink</code> s found in plainText + * @return Array of <code>Outlink</code>s within found in plainText */ - public static Outlink[] getOutlinks(final String plainText, String anchor, - Configuration conf) { - - if(plainText == null){ - return NO_LINKS; - } - - final ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); - Outlink[] retval; - Outlink link; - - Matcher m = urlPattern.matcher(plainText); - while (m.find()) { - - try { - link = new Outlink(m.toMatchResult().group(), anchor, conf); - outlinks.add(link); - } catch (MalformedURLException ex) { - // if it is a malformed URL we just throw it away and continue with - // extraction. - if (LOG.isDebugEnabled()) { - LOG.debug("extracted malformed url:" + m.toMatchResult().group(), ex); + public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) { + long start = System.currentTimeMillis(); + final List outlinks = new ArrayList(); + + try { + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(URL_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcherInput input = new PatternMatcherInput(plainText); + + MatchResult result; + String url; + + //loop the matches + while (matcher.contains(input, pattern)) { + // if this is taking too long, stop matching + // (SHOULD really check cpu time used so that heavily loaded systems + // do not unnecessarily hit this limit.) 
+ if (System.currentTimeMillis() - start >= 60000L) { + if (LOG.isWarnEnabled()) { + LOG.warn("Time limit exceeded for getOutLinks"); + } + break; + } + result = matcher.getMatch(); + url = result.group(0); + try { + Outlink outlink = new Outlink(url, anchor, conf); + outlinks.add(new Outlink(url, anchor, conf)); + } catch (MalformedURLException mue) { + LOG.warn("Invalid url: '" + url + "', skipping."); } } - + } catch (Exception ex) { + // if the matcher fails (perhaps a malformed URL) we just log it and move on + if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } } - if (outlinks.size() > 0) { - retval = outlinks.toArray(new Outlink[outlinks.size()]); + final Outlink[] retval; + + //create array of the Outlinks + if (outlinks != null && outlinks.size() > 0) { + retval = (Outlink[]) outlinks.toArray(new Outlink[0]); } else { - retval = NO_LINKS; + retval = new Outlink[0]; } return retval; } + + /** + * Extracts outlinks from a plain text. <br /> + * This Method takes the Jakarta Regexp API. + * + * @param plainText + * + * @return Array of <code>Outlink</code> s within found in plainText + * @deprecated only for tests + */ + private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) { + + throw new UnsupportedOperationException( + "Implementation commented out. Please uncomment to use it."); + + // final List outlinks = new ArrayList(); + // String url; + // Outlink link; + // + // RE re = new RE(URL_PATTERN); + // + // int pos = 0; + // + // while (re.match(plainText, pos)) { + // + // url = re.getParen(0); + // + // if (LOG.isTraceEnabled()) { + // LOG.trace("Extracted url: " + url); + // } + // + // try { + // + // link = new Outlink(url, null); + // outlinks.add(link); + // + // } catch (MalformedURLException ex) { + // // if it is a malformed URL we just throw it away and continue with + // // extraction. 
+ // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } + // } + // + // pos = re.getParenEnd(0); + // } + // + // final Outlink[] retval; + // + // if (pos > 0) { + // retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + // } else { + // retval = new Outlink[0]; + // } + // + // return retval; + + } + + /** + * Extracts outlinks from a plain text. + * </p> + * This Method takes the JDK5 Regexp API. + * + * @param plainText + * + * @return Array of <code>Outlink</code> s within found in plainText + * @deprecated only for tests + */ + private Outlink[] getOutlinksJDK5Impl(final String plainText) { + + throw new UnsupportedOperationException( + "Implementation commented out. Please uncomment to use it."); + + // final List outlinks = new ArrayList(); + // String url; + // Outlink link; + // + // final Pattern urlPattern = Pattern.compile(URL_PATTERN); + // final RE re = new RE(urlPattern); + // + // int pos = 0; + // + // while (re.match(plainText, pos)) { + // + // url = re.getParen(0); + // + // try { + // + // link = new Outlink(url, null); + // outlinks.add(link); + // } catch (MalformedURLException ex) { + // // if it is a malformed URL we just throw it away and continue with + // // extraction. 
+ // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } + // } + // + // pos = re.getParenEnd(0); + // } + // + // final Outlink[] retval; + // + // if (pos > 0) { + // retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + // } else { + // retval = new Outlink[0]; + // } + // + // return retval; + } + } Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Sun Mar 11 14:18:23 2007 @@ -83,13 +83,11 @@ <target name="test"> <parallel threadCount="2"> <ant dir="creativecommons" target="test"/> - <ant dir="index-more" target="test"/> <ant dir="languageidentifier" target="test"/> <ant dir="lib-http" target="test"/> <ant dir="ontology" target="test"/> <!--ant dir="parse-ext" target="test"/--> <ant dir="parse-html" target="test"/> - <ant dir="parse-js" target="test"/> <!-- <ant dir="parse-mp3" target="test"/> --> <ant dir="parse-msexcel" target="test"/> <ant dir="parse-mspowerpoint" target="test"/> Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original) +++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Sun Mar 11 14:18:23 2007 @@ -16,6 +16,14 @@ */ package org.apache.nutch.indexer.more; + +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; +import 
org.apache.oro.text.regex.Perl5Pattern; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.MalformedPatternException; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -48,8 +56,6 @@ import java.util.Date; import java.util.TimeZone; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.commons.lang.time.DateUtils; @@ -238,15 +244,21 @@ // Patterns used to extract filename from possible non-standard // HTTP header "Content-Disposition". Typically it looks like: // Content-Disposition: inline; filename="foo.ppt" + private PatternMatcher matcher = new Perl5Matcher(); private Configuration conf; - static Pattern patterns[] = new Pattern[2]; + static Perl5Pattern patterns[] = {null, null}; static { + Perl5Compiler compiler = new Perl5Compiler(); + try { // order here is important patterns[0] = - Pattern.compile("\\bfilename=['\"](.+)['\"]"); + (Perl5Pattern) compiler.compile("\\bfilename=['\"](.+)['\"]"); patterns[1] = - Pattern.compile("\\bfilename=(\\S+)\\b"); + (Perl5Pattern) compiler.compile("\\bfilename=(\\S+)\\b"); + } catch (MalformedPatternException e) { + // just ignore + } } private Document resetTitle(Document doc, ParseData data, String url) { @@ -254,28 +266,16 @@ if (contentDisposition == null) return doc; - String filename = getFileName(contentDisposition); - - if (filename != null) { - doc.add(new Field("title", filename, Field.Store.YES, Field.Index.NO)); - } - - return doc; - } - - String getFileName(String value) { - - String filename = null; - - for (int i = 0; i < patterns.length; i++) { - Matcher matcher = patterns[i].matcher(value); - if(matcher.find()) { - filename = matcher.group(1); + MatchResult result; + for (int i=0; i<patterns.length; i++) { + if (matcher.contains(contentDisposition,patterns[i])) { + result = matcher.getMatch(); + doc.add(new Field("title", result.group(1), Field.Store.YES, 
Field.Index.NO)); break; } } - return filename; + return doc; } public void setConf(Configuration conf) { Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original) +++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Sun Mar 11 14:18:23 2007 @@ -25,8 +25,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -42,6 +40,13 @@ import org.apache.nutch.protocol.Content; import org.apache.nutch.util.NutchConfiguration; import org.apache.hadoop.conf.Configuration; +import org.apache.oro.text.regex.MatchResult; +import org.apache.oro.text.regex.Pattern; +import org.apache.oro.text.regex.PatternCompiler; +import org.apache.oro.text.regex.PatternMatcher; +import org.apache.oro.text.regex.PatternMatcherInput; +import org.apache.oro.text.regex.Perl5Compiler; +import org.apache.oro.text.regex.Perl5Matcher; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Element; import org.w3c.dom.NamedNodeMap; @@ -49,24 +54,11 @@ import org.w3c.dom.NodeList; /** - * <p> - * This class is a heuristic link extractor for JavaScript files and code - * snippets. The general idea of a two-pass regex matching comes from Heritrix. - * Parts of the code come from OutlinkExtractor.java by Stephan Strittmatter. 
- * </p> - * - * <p> - * This Filter extracts javascript from following locations: - * </p> - * <li>from inside <script> tags</li> - * <li>from html 4.0 events like Window: onload,onunload, Form: - * onchange,onsubmit,onreset,onselect,onblur,onfocus Keyboard: - * onkeydown,onkeypress,onkeyup Mouse: - * onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup - * </li> - * <li>a href starting with literal "javascript"</li> - * - * + * This class is a heuristic link extractor for JavaScript files and + * code snippets. The general idea of a two-pass regex matching comes from + * Heritrix. Parts of the code come from OutlinkExtractor.java + * by Stephan Strittmatter. + * * @author Andrzej Bialecki <[EMAIL PROTECTED]> */ public class JSParseFilter implements HtmlParseFilter, Parser { @@ -105,7 +97,6 @@ Node lNode = n.getAttributes().getNamedItem("language"); if (lNode == null) lang = "javascript"; else lang = lNode.getNodeValue(); - //XXX lang is not checked?? StringBuffer script = new StringBuffer(); NodeList nn = n.getChildNodes(); if (nn.getLength() > 0) { @@ -113,9 +104,9 @@ if (i > 0) script.append('\n'); script.append(nn.item(i).getNodeValue()); } - if (LOG.isDebugEnabled()) { - LOG.info("script: language=" + lang + ", text: " + script.toString()); - } + // if (LOG.isInfoEnabled()) { + // LOG.info("script: language=" + lang + ", text: " + script.toString()); + // } Outlink[] links = getJSLinks(script.toString(), "", base); if (links != null && links.length > 0) outlinks.addAll(Arrays.asList(links)); // no other children of interest here, go one level up. @@ -184,7 +175,7 @@ /** * This method extracts URLs from literals embedded in JavaScript. 
*/ - Outlink[] getJSLinks(String plainText, String anchor, String base) { + private Outlink[] getJSLinks(String plainText, String anchor, String base) { final List outlinks = new ArrayList(); URL baseURL = null; @@ -196,27 +187,30 @@ } try { - final Pattern stringPattern = Pattern.compile(STRING_PATTERN, - Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); - final Pattern urlPattern = Pattern.compile(URI_PATTERN, - Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); - - final Matcher quoted = stringPattern.matcher(plainText); + final PatternCompiler cp = new Perl5Compiler(); + final Pattern pattern = cp.compile(STRING_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final Pattern pattern1 = cp.compile(URI_PATTERN, + Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK + | Perl5Compiler.MULTILINE_MASK); + final PatternMatcher matcher = new Perl5Matcher(); + + final PatternMatcher matcher1 = new Perl5Matcher(); + final PatternMatcherInput input = new PatternMatcherInput(plainText); + MatchResult result; String url; //loop the matches - while (quoted.find()) { - String quotedString = quoted.group(2); - Matcher urls = urlPattern.matcher(quotedString); - - if (!urls.find()) { + while (matcher.contains(input, pattern)) { + result = matcher.getMatch(); + url = result.group(2); + PatternMatcherInput input1 = new PatternMatcherInput(url); + if (!matcher1.matches(input1, pattern1)) { //if (LOG.isTraceEnabled()) { LOG.trace(" - invalid '" + url + "'"); } continue; } - - url = urls.group(); - if (url.startsWith("www.")) { url = "http://" + url; } else { Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java?view=diff&rev=517015&r1=517014&r2=517015 
============================================================================== --- lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java Sun Mar 11 14:18:23 2007 @@ -1,4 +1,4 @@ -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,62 +14,60 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.nutch.net.urlnormalizer.basic; import java.net.URL; import java.net.MalformedURLException; -import java.util.regex.Matcher; -import java.util.regex.Pattern; +// Commons Logging imports import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +// Nutch imports import org.apache.nutch.net.URLNormalizer; +import org.apache.nutch.util.LogUtil; import org.apache.hadoop.conf.Configuration; +import org.apache.oro.text.regex.*; -/** - * Converts URLs to a normal form. - * <p> - * All substitutions will be done step by step, to ensure that certain - * constellations will be normalized, too. - * </p> - * <p> - * For example: "/aa/bb/../../cc/../foo.html will be normalized in the following - * manner: "/aa/bb/../../cc/../foo.html" "/aa/../cc/../foo.html" - * "/cc/../foo.html" "/foo.html". - * </p> - * <p> - * The normalization also takes care of leading "/../", which will be replaced - * by "/", because this is a rather a sign of bad webserver configuration than - * of a wanted link. For example, urls like "http://www.foo.com/../" should - * return a http 404 error instead of redirecting to "http://www.foo.com". - * </p> - */ +/** Converts URLs to a normal form . 
*/ public class BasicURLNormalizer implements URLNormalizer { public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class); - /** - * This pattern tries to find spots like "/xx/../" in the url, which could - * be replaced by "/" xx consists of chars, different then "/" (slash) and - * needs to have at least one char different from ".". - */ - private static final Pattern RELATIVE_PATH_PATTERN = Pattern.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)"); - - private static final String RELATIVE_PATH_SUBSTITUTION="/"; - - /** - * This pattern tries to find spots like leading "/../" in the url, which - * could be replaced by "/". - */ - private static final Pattern LEADING_RELATIVE_PATH_PATTERN = Pattern.compile("^(/\\.\\./)+"); - - private static final String LEADING_RELATIVE_PATH_SUBSTITUTION="/"; + private Perl5Compiler compiler = new Perl5Compiler(); + private ThreadLocal matchers = new ThreadLocal() { + protected synchronized Object initialValue() { + return new Perl5Matcher(); + } + }; + private Rule relativePathRule = null; + private Rule leadingRelativePathRule = null; private Configuration conf; - public BasicURLNormalizer() { + try { + // this pattern tries to find spots like "/xx/../" in the url, which + // could be replaced by "/" xx consists of chars, different then "/" + // (slash) and needs to have at least one char different from "." 
+ relativePathRule = new Rule(); + relativePathRule.pattern = (Perl5Pattern) + compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)", + Perl5Compiler.READ_ONLY_MASK); + relativePathRule.substitution = new Perl5Substitution("/"); + + // this pattern tries to find spots like leading "/../" in the url, + // which could be replaced by "/" + leadingRelativePathRule = new Rule(); + leadingRelativePathRule.pattern = (Perl5Pattern) + compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK); + leadingRelativePathRule.substitution = new Perl5Substitution("/"); + + } catch (MalformedPatternException e) { + e.printStackTrace(LogUtil.getWarnStream(LOG)); + throw new RuntimeException(e); + } } public String normalize(String urlString, String scope) @@ -131,25 +129,56 @@ return urlString; } - private String substituteUnnecessaryRelativePaths(String file) { - String fileWorkCopy = file; - int oldLen = file.length(); - int newLen = oldLen - 1; - Matcher m; - - while (oldLen != newLen) { - oldLen = fileWorkCopy.length(); - m = RELATIVE_PATH_PATTERN.matcher(fileWorkCopy); - // substitue first occurence of "/xx/../" by "/" - fileWorkCopy = m.replaceFirst(RELATIVE_PATH_SUBSTITUTION); - m = LEADING_RELATIVE_PATH_PATTERN.matcher(fileWorkCopy); - // remove leading "/../" - fileWorkCopy = m.replaceFirst(LEADING_RELATIVE_PATH_SUBSTITUTION); - newLen = fileWorkCopy.length(); + private String substituteUnnecessaryRelativePaths(String file) { + String fileWorkCopy = file; + int oldLen = file.length(); + int newLen = oldLen - 1; + + // All substitutions will be done step by step, to ensure that certain + // constellations will be normalized, too + // + // For example: "/aa/bb/../../cc/../foo.html will be normalized in the + // following manner: + // "/aa/bb/../../cc/../foo.html" + // "/aa/../cc/../foo.html" + // "/cc/../foo.html" + // "/foo.html" + // + // The normalization also takes care of leading "/../", which will be + // replaced by "/", because this is a rather a sign of bad webserver + 
// configuration than of a wanted link. For example, urls like + // "http://www.foo.com/../" should return a http 404 error instead of + // redirecting to "http://www.foo.com". + // + Perl5Matcher matcher = (Perl5Matcher)matchers.get(); + + while (oldLen != newLen) { + // substitue first occurence of "/xx/../" by "/" + oldLen = fileWorkCopy.length(); + fileWorkCopy = Util.substitute + (matcher, relativePathRule.pattern, + relativePathRule.substitution, fileWorkCopy, 1); + + // remove leading "/../" + fileWorkCopy = Util.substitute + (matcher, leadingRelativePathRule.pattern, + leadingRelativePathRule.substitution, fileWorkCopy, 1); + newLen = fileWorkCopy.length(); + } + + return fileWorkCopy; + } + + + /** + * Class which holds a compiled pattern and its corresponding substition + * string. + */ + private static class Rule { + public Perl5Pattern pattern; + public Perl5Substitution substitution; } - return fileWorkCopy; - } public void setConf(Configuration conf) { this.conf = conf; @@ -160,3 +189,4 @@ } } + Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=517015&r1=517014&r2=517015 ============================================================================== --- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original) +++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Sun Mar 11 14:18:23 2007 @@ -1,4 +1,4 @@ -/* +/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. 
@@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.nutch.net.urlnormalizer.regex; import java.net.URL; @@ -27,7 +28,6 @@ import java.util.List; import java.util.ArrayList; import java.util.Iterator; -import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -40,6 +40,7 @@ import javax.xml.parsers.*; import org.w3c.dom.*; +import org.apache.oro.text.regex.*; /** * Allows users to do regex substitutions on all/any URLs that are encountered, @@ -64,14 +65,16 @@ * string. */ private static class Rule { - public Pattern pattern; + public Perl5Pattern pattern; public String substitution; } - private HashMap<String, List<Rule>> scopedRules; + private HashMap scopedRules; - private static final List<Rule> EMPTY_RULES = Collections.EMPTY_LIST; + private static final List EMPTY_RULES = Collections.EMPTY_LIST; + + private PatternMatcher matcher = new Perl5Matcher(); /** * The default constructor which is called from UrlNormalizerFactory @@ -90,9 +93,9 @@ * configuration files for it. */ public RegexURLNormalizer(Configuration conf, String filename) - throws IOException { + throws IOException, MalformedPatternException { super(conf); - List<Rule> rules = readConfigurationFile(filename); + List rules = readConfigurationFile(filename); if (rules != null) scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); } @@ -103,9 +106,9 @@ // the default constructor was called if (this.scopedRules == null) { String filename = getConf().get("urlnormalizer.regex.file"); - scopedRules = new HashMap<String, List<Rule>>(); + scopedRules = new HashMap(); URL url = getConf().getResource(filename); - List<Rule> rules = null; + List rules = null; if (url == null) { LOG.warn("Can't load the default config file! " + filename); rules = EMPTY_RULES; @@ -123,7 +126,7 @@ // used in JUnit test. 
void setConfiguration(InputStream is, String scope) { - List<Rule> rules = readConfiguration(is); + List rules = readConfiguration(is); scopedRules.put(scope, rules); LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); } @@ -133,7 +136,7 @@ * patterns. It accepts a string url as input and returns the altered string. */ public synchronized String regexNormalize(String urlString, String scope) { - List<Rule> curRules = scopedRules.get(scope); + List curRules = (List)scopedRules.get(scope); if (curRules == null) { // try to populate String configFile = getConf().get("urlnormalizer.regex.file." + scope); @@ -144,6 +147,7 @@ LOG.warn("Can't load resource for config file: " + configFile); } else { try { + InputStream is = resource.openStream(); curRules = readConfiguration(resource.openStream()); scopedRules.put(scope, curRules); } catch (Exception e) { @@ -158,11 +162,14 @@ } if (curRules == EMPTY_RULES || curRules == null) { // use global rules - curRules = scopedRules.get(URLNormalizers.SCOPE_DEFAULT); + curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT); } - - for (Rule rule: curRules) { - urlString = rule.pattern.matcher(urlString).replaceAll(rule.substitution); + Iterator i = curRules.iterator(); + while (i.hasNext()) { + Rule r = (Rule) i.next(); + urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution( + r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual + // substitution } return urlString; } @@ -173,7 +180,7 @@ } /** Reads the configuration file and populates a List of Rules. 
*/ - private List<Rule> readConfigurationFile(String filename) { + private List readConfigurationFile(String filename) { if (LOG.isInfoEnabled()) { LOG.info("loading " + filename); } @@ -186,8 +193,9 @@ } } - private List<Rule> readConfiguration(InputStream is) { - List<Rule> rules = new ArrayList<Rule>(); + private List readConfiguration(InputStream is) { + Perl5Compiler compiler = new Perl5Compiler(); + List rules = new ArrayList(); try { // borrowed heavily from code in Configuration.java @@ -225,7 +233,7 @@ } if (patternValue != null && subValue != null) { Rule rule = new Rule(); - rule.pattern = Pattern.compile(patternValue); + rule.pattern = (Perl5Pattern) compiler.compile(patternValue); rule.substitution = subValue; rules.add(rule); } @@ -241,14 +249,15 @@ } /** Spits out patterns and substitutions that are in the configuration file. */ - public static void main(String args[]) throws IOException { + public static void main(String args[]) throws MalformedPatternException, + IOException { RegexURLNormalizer normalizer = new RegexURLNormalizer(); normalizer.setConf(NutchConfiguration.create()); Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator(); System.out.println("* Rules for 'DEFAULT' scope:"); while (i.hasNext()) { Rule r = (Rule) i.next(); - System.out.print(" " + r.pattern.pattern() + " -> "); + System.out.print(" " + r.pattern.getPattern() + " -> "); System.out.println(r.substitution); } // load the scope @@ -264,7 +273,7 @@ i = ((List)normalizer.scopedRules.get(scope)).iterator(); while (i.hasNext()) { Rule r = (Rule) i.next(); - System.out.print(" " + r.pattern.pattern() + " -> "); + System.out.print(" " + r.pattern.getPattern() + " -> "); System.out.println(r.substitution); } } ------------------------------------------------------------------------- Take Surveys. Earn Cash. 
Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs