Author: siren Date: Sat Mar 10 09:30:04 2007 New Revision: 516754 URL: http://svn.apache.org/viewvc?view=rev&rev=516754 Log: Change OutlinkExtractor to use Regular Expressions from JRE, get rid of ORO dependency
Removed: lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516754&r1=516753&r2=516754 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Sat Mar 10 09:30:04 2007 @@ -158,6 +158,9 @@ 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins framework to operate properly (Heiko Dietze via mattmann) +54. Change OutlinkExtractor to use Regular Expressions from JRE, get rid + of ORO dependency (siren) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=516754&r1=516753&r2=516754 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Sat Mar 10 09:30:04 2007 @@ -1,4 +1,4 @@ -/** +/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. @@ -14,28 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.parse; import java.net.MalformedURLException; import java.util.ArrayList; -import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.oro.text.regex.MatchResult; -import org.apache.oro.text.regex.Pattern; -import org.apache.oro.text.regex.PatternCompiler; -import org.apache.oro.text.regex.PatternMatcher; -import org.apache.oro.text.regex.PatternMatcherInput; -import org.apache.oro.text.regex.Perl5Compiler; -import org.apache.oro.text.regex.Perl5Matcher; /** - * Extractor to extract {@link org.apache.nutch.parse.Outlink}s - * / URLs from plain text using Regular Expressions. + * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from + * plain text using Regular Expressions. * * @see <a * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison @@ -44,12 +37,14 @@ * </a> * * @author Stephan Strittmatter - http://www.sybit.de - * @version 1.0 + * * @since 0.7 */ public class OutlinkExtractor { private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class); + private static final Outlink[] NO_LINKS = new Outlink[0]; + /** * Regex pattern to get URLs within a plain text. 
* @@ -57,190 +52,63 @@ * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html * </a> */ - private static final String URL_PATTERN = - "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + + static final Pattern urlPattern = Pattern.compile(URL_PATTERN); /** - * Extracts <code>Outlink</code> from given plain text. - * Applying this method to non-plain-text can result in extremely lengthy - * runtimes for parasitic cases (postscript is a known example). - * @param plainText the plain text from wich URLs should be extracted. + * Extracts outlinks from a plain text. + * </p> + * @param plainText * - * @return Array of <code>Outlink</code>s within found in plainText + * @return Array of <code>Outlink</code> s within found in plainText */ - public static Outlink[] getOutlinks(final String plainText, Configuration conf) { - return OutlinkExtractor.getOutlinks(plainText, "", conf); + public static Outlink[] getOutlinks(final String plainText, Configuration conf){ + return getOutlinks(plainText, null, conf); } + /** - * Extracts <code>Outlink</code> from given plain text and adds anchor - * to the extracted <code>Outlink</code>s - * - * @param plainText the plain text from wich URLs should be extracted. - * @param anchor the anchor of the url + * Extracts outlinks from a plain text. 
+ * </p> + * @param plainText text to extract urls from * - * @return Array of <code>Outlink</code>s within found in plainText + * @return Array of <code>Outlink</code> s found in plainText */ - public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) { - long start = System.currentTimeMillis(); - final List outlinks = new ArrayList(); - - try { - final PatternCompiler cp = new Perl5Compiler(); - final Pattern pattern = cp.compile(URL_PATTERN, - Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK - | Perl5Compiler.MULTILINE_MASK); - final PatternMatcher matcher = new Perl5Matcher(); - - final PatternMatcherInput input = new PatternMatcherInput(plainText); - - MatchResult result; - String url; - - //loop the matches - while (matcher.contains(input, pattern)) { - // if this is taking too long, stop matching - // (SHOULD really check cpu time used so that heavily loaded systems - // do not unnecessarily hit this limit.) - if (System.currentTimeMillis() - start >= 60000L) { - if (LOG.isWarnEnabled()) { - LOG.warn("Time limit exceeded for getOutLinks"); - } - break; - } - result = matcher.getMatch(); - url = result.group(0); - try { - Outlink outlink = new Outlink(url, anchor, conf); - outlinks.add(new Outlink(url, anchor, conf)); - } catch (MalformedURLException mue) { - LOG.warn("Invalid url: '" + url + "', skipping."); + public static Outlink[] getOutlinks(final String plainText, String anchor, + Configuration conf) { + + if(plainText == null){ + return NO_LINKS; + } + + final ArrayList<Outlink> outlinks = new ArrayList<Outlink>(); + Outlink[] retval; + Outlink link; + + Matcher m = urlPattern.matcher(plainText); + while (m.find()) { + + try { + link = new Outlink(m.toMatchResult().group(), anchor, conf); + outlinks.add(link); + } catch (MalformedURLException ex) { + // if it is a malformed URL we just throw it away and continue with + // extraction. 
+ if (LOG.isDebugEnabled()) { + LOG.debug("extracted malformed url:" + m.toMatchResult().group(), ex); } } - } catch (Exception ex) { - // if the matcher fails (perhaps a malformed URL) we just log it and move on - if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } - } - final Outlink[] retval; + } - //create array of the Outlinks - if (outlinks != null && outlinks.size() > 0) { - retval = (Outlink[]) outlinks.toArray(new Outlink[0]); + if (outlinks.size() > 0) { + retval = outlinks.toArray(new Outlink[outlinks.size()]); } else { - retval = new Outlink[0]; + retval = NO_LINKS; } return retval; } - - /** - * Extracts outlinks from a plain text. <br /> - * This Method takes the Jakarta Regexp API. - * - * @param plainText - * - * @return Array of <code>Outlink</code> s within found in plainText - * @deprecated only for tests - */ - private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) { - - throw new UnsupportedOperationException( - "Implementation commented out. Please uncomment to use it."); - - // final List outlinks = new ArrayList(); - // String url; - // Outlink link; - // - // RE re = new RE(URL_PATTERN); - // - // int pos = 0; - // - // while (re.match(plainText, pos)) { - // - // url = re.getParen(0); - // - // if (LOG.isTraceEnabled()) { - // LOG.trace("Extracted url: " + url); - // } - // - // try { - // - // link = new Outlink(url, null); - // outlinks.add(link); - // - // } catch (MalformedURLException ex) { - // // if it is a malformed URL we just throw it away and continue with - // // extraction. - // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } - // } - // - // pos = re.getParenEnd(0); - // } - // - // final Outlink[] retval; - // - // if (pos > 0) { - // retval = (Outlink[]) outlinks.toArray(new Outlink[0]); - // } else { - // retval = new Outlink[0]; - // } - // - // return retval; - - } - - /** - * Extracts outlinks from a plain text. - * </p> - * This Method takes the JDK5 Regexp API. 
- * - * @param plainText - * - * @return Array of <code>Outlink</code> s within found in plainText - * @deprecated only for tests - */ - private Outlink[] getOutlinksJDK5Impl(final String plainText) { - - throw new UnsupportedOperationException( - "Implementation commented out. Please uncomment to use it."); - - // final List outlinks = new ArrayList(); - // String url; - // Outlink link; - // - // final Pattern urlPattern = Pattern.compile(URL_PATTERN); - // final RE re = new RE(urlPattern); - // - // int pos = 0; - // - // while (re.match(plainText, pos)) { - // - // url = re.getParen(0); - // - // try { - // - // link = new Outlink(url, null); - // outlinks.add(link); - // } catch (MalformedURLException ex) { - // // if it is a malformed URL we just throw it away and continue with - // // extraction. - // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } - // } - // - // pos = re.getParenEnd(0); - // } - // - // final Outlink[] retval; - // - // if (pos > 0) { - // retval = (Outlink[]) outlinks.toArray(new Outlink[0]); - // } else { - // retval = new Outlink[0]; - // } - // - // return retval; - } - } ------------------------------------------------------------------------- Take Surveys. Earn Cash. Influence the Future of IT Join SourceForge.net's Techsay panel and you'll get the chance to share your opinions on IT & business topics through brief surveys-and earn cash http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs