OutlinkExtractor.java

siren Sat, 10 Mar 2007 09:30:31 -0800

Author: siren
Date: Sat Mar 10 09:30:04 2007
New Revision: 516754

URL: http://svn.apache.org/viewvc?view=rev&rev=516754
Log:
Change OutlinkExtractor to use Regular Expressions from JRE, get rid of ORO 
dependency


Removed:
    lucene/nutch/trunk/lib/jakarta-oro-2.0.7.jar
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=516754&r1=516753&r2=516754
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Sat Mar 10 09:30:04 2007
@@ -158,6 +158,9 @@
 53. NUTCH-384 - Protocol-file plugin does not allow the parse plugins
     framework to operate properly (Heiko Dietze via mattmann)
 
+54. Change OutlinkExtractor to use Regular Expressions from JRE, get rid
+    of ORO dependency (siren)
+
 
 Release 0.8 - 2006-07-25
 

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=516754&r1=516753&r2=516754
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Sat Mar 10 09:30:04 2007
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,28 +14,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.Pattern;
-import org.apache.oro.text.regex.PatternCompiler;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.PatternMatcherInput;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
 
 /**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
- * / URLs from plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
+ * plain text using Regular Expressions.
  * 
  * @see <a
  *      
href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
@@ -44,12 +37,14 @@
  *      </a>
  * 
  * @author Stephan Strittmatter - http://www.sybit.de
- * @version 1.0
+ *
  * @since 0.7
  */
 public class OutlinkExtractor {
   private static final Log LOG = LogFactory.getLog(OutlinkExtractor.class);
 
+  private static final Outlink[] NO_LINKS = new Outlink[0];
+
   /**
    * Regex pattern to get URLs within a plain text.
    * 
@@ -57,190 +52,63 @@
    *      
href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
    *      </a>
    */
-  private static final String URL_PATTERN = 
-    
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+  private static final String URL_PATTERN = 
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+
+  static final Pattern urlPattern = Pattern.compile(URL_PATTERN);
 
   /**
-   * Extracts <code>Outlink</code> from given plain text.
-   * Applying this method to non-plain-text can result in extremely lengthy
-   * runtimes for parasitic cases (postscript is a known example).
-   * @param plainText  the plain text from wich URLs should be extracted.
+   * Extracts outlinks from a plain text.
+   * </p>
+   * @param plainText
    * 
-   * @return Array of <code>Outlink</code>s within found in plainText
+   * @return Array of <code>Outlink</code> s within found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, Configuration 
conf) {
-    return OutlinkExtractor.getOutlinks(plainText, "", conf);
+  public static Outlink[] getOutlinks(final String plainText, Configuration 
conf){
+    return getOutlinks(plainText, null, conf);
   }
 
+  
   /**
-   * Extracts <code>Outlink</code> from given plain text and adds anchor
-   * to the extracted <code>Outlink</code>s
-   * 
-   * @param plainText the plain text from wich URLs should be extracted.
-   * @param anchor    the anchor of the url
+   * Extracts outlinks from a plain text.
+   * </p>
+   * @param plainText text to extract urls from
    * 
-   * @return Array of <code>Outlink</code>s within found in plainText
+   * @return Array of <code>Outlink</code> s found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, String anchor, 
Configuration conf) {
-    long start = System.currentTimeMillis();
-    final List outlinks = new ArrayList();
-
-    try {
-      final PatternCompiler cp = new Perl5Compiler();
-      final Pattern pattern = cp.compile(URL_PATTERN,
-          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
-              | Perl5Compiler.MULTILINE_MASK);
-      final PatternMatcher matcher = new Perl5Matcher();
-
-      final PatternMatcherInput input = new PatternMatcherInput(plainText);
-
-      MatchResult result;
-      String url;
-
-      //loop the matches
-      while (matcher.contains(input, pattern)) {
-        // if this is taking too long, stop matching
-        //   (SHOULD really check cpu time used so that heavily loaded systems
-        //   do not unnecessarily hit this limit.)
-        if (System.currentTimeMillis() - start >= 60000L) {
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("Time limit exceeded for getOutLinks");
-          }
-          break;
-        }
-        result = matcher.getMatch();
-        url = result.group(0);
-        try {
-          Outlink outlink = new Outlink(url, anchor, conf);
-          outlinks.add(new Outlink(url, anchor, conf));
-        } catch (MalformedURLException mue) {
-          LOG.warn("Invalid url: '" + url + "', skipping.");
+  public static Outlink[] getOutlinks(final String plainText, String anchor,
+      Configuration conf) {
+    
+    if(plainText == null){
+      return NO_LINKS;
+    }
+
+    final ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
+    Outlink[] retval;
+    Outlink link;
+
+    Matcher m = urlPattern.matcher(plainText);
+    while (m.find()) {
+
+      try {
+        link = new Outlink(m.toMatchResult().group(), anchor, conf);
+        outlinks.add(link);
+      } catch (MalformedURLException ex) {
+        // if it is a malformed URL we just throw it away and continue with
+        // extraction.
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("extracted malformed url:" + m.toMatchResult().group(), 
ex);
         }
       }
-    } catch (Exception ex) {
-      // if the matcher fails (perhaps a malformed URL) we just log it and 
move on
-      if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    }
 
-    final Outlink[] retval;
+    }
 
-    //create array of the Outlinks
-    if (outlinks != null && outlinks.size() > 0) {
-      retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
+    if (outlinks.size() > 0) {
+      retval = outlinks.toArray(new Outlink[outlinks.size()]);
     } else {
-      retval = new Outlink[0];
+      retval = NO_LINKS;
     }
 
     return retval;
   }
-  
 
-  /**
-   * Extracts outlinks from a plain text. <br />
-   * This Method takes the Jakarta Regexp API.
-   * 
-   * @param plainText
-   * 
-   * @return Array of <code>Outlink</code> s within found in plainText
-   * @deprecated only for tests
-   */
-  private Outlink[] getOutlinksJakartaRegexpImpl(final String plainText) {
-
-    throw new UnsupportedOperationException(
-        "Implementation commented out. Please uncomment to use it.");
-
-    // final List outlinks = new ArrayList();
-    // String url;
-    // Outlink link;
-    //
-    // RE re = new RE(URL_PATTERN);
-    //
-    // int pos = 0;
-    //
-    // while (re.match(plainText, pos)) {
-    //
-    // url = re.getParen(0);
-    //
-    // if (LOG.isTraceEnabled()) {
-    //   LOG.trace("Extracted url: " + url);
-    // }
-    //
-    // try {
-    //
-    // link = new Outlink(url, null);
-    // outlinks.add(link);
-    //
-    // } catch (MalformedURLException ex) {
-    // // if it is a malformed URL we just throw it away and continue with
-    // // extraction.
-    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    // }
-    //
-    // pos = re.getParenEnd(0);
-    // }
-    //
-    // final Outlink[] retval;
-    //
-    // if (pos > 0) {
-    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    // } else {
-    // retval = new Outlink[0];
-    // }
-    //
-    // return retval;
-
-  }
-
-  /**
-   * Extracts outlinks from a plain text.
-   * </p>
-   * This Method takes the JDK5 Regexp API.
-   * 
-   * @param plainText
-   * 
-   * @return Array of <code>Outlink</code> s within found in plainText
-   * @deprecated only for tests
-   */
-  private Outlink[] getOutlinksJDK5Impl(final String plainText) {
-
-    throw new UnsupportedOperationException(
-        "Implementation commented out. Please uncomment to use it.");
-
-    // final List outlinks = new ArrayList();
-    // String url;
-    // Outlink link;
-    //
-    // final Pattern urlPattern = Pattern.compile(URL_PATTERN);
-    // final RE re = new RE(urlPattern);
-    //
-    // int pos = 0;
-    //
-    // while (re.match(plainText, pos)) {
-    //
-    // url = re.getParen(0);
-    //
-    // try {
-    //
-    // link = new Outlink(url, null);
-    // outlinks.add(link);
-    // } catch (MalformedURLException ex) {
-    // // if it is a malformed URL we just throw it away and continue with
-    // // extraction.
-    // if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
-    // }
-    //
-    // pos = re.getParenEnd(0);
-    // }
-    //
-    // final Outlink[] retval;
-    //
-    // if (pos > 0) {
-    // retval = (Outlink[]) outlinks.toArray(new Outlink[0]);
-    // } else {
-    // retval = new Outlink[0];
-    // }
-    //
-    // return retval;
-  }
- 
 }



-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-cvs mailing list
Nutch-cvs@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

[Nutch-cvs] svn commit: r516754 - in /lucene/nutch/trunk: CHANGES.txt lib/jakarta-oro-2.0.7.jar src/java/org/apache/nutch/parse/OutlinkExtractor.java

Reply via email to