Author: ab
Date: Mon Sep 25 10:36:36 2006
New Revision: 449752

URL: http://svn.apache.org/viewvc?view=rev&rev=449752
Log:
Catch exception on invalid urls, and continue collecting valid ones.

Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=449752&r1=449751&r2=449752
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Mon Sep 25 10:36:36 2006
@@ -16,6 +16,7 @@
 
 package org.apache.nutch.parse;
 
+import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -108,7 +109,12 @@
         }
         result = matcher.getMatch();
         url = result.group(0);
-        outlinks.add(new Outlink(url, anchor, conf));
+        try {
+          Outlink outlink = new Outlink(url, anchor, conf);
+          outlinks.add(new Outlink(url, anchor, conf));
+        } catch (MalformedURLException mue) {
+          LOG.warn("Invalid url: '" + url + "', skipping.");
+        }
       }
     } catch (Exception ex) {
       // if the matcher fails (perhaps a malformed URL) we just log it and move on


Reply via email to