Author: ab
Date: Mon Sep 25 10:36:36 2006
New Revision: 449752
URL: http://svn.apache.org/viewvc?view=rev&rev=449752

Log:
Catch exception on invalid urls, and continue collecting valid ones.

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?view=diff&rev=449752&r1=449751&r2=449752 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Mon Sep 25 10:36:36 2006 @@ -16,6 +16,7 @@ package org.apache.nutch.parse; +import java.net.MalformedURLException; import java.util.ArrayList; import java.util.List; @@ -108,7 +109,12 @@ } result = matcher.getMatch(); url = result.group(0); - outlinks.add(new Outlink(url, anchor, conf)); + try { + Outlink outlink = new Outlink(url, anchor, conf); + outlinks.add(new Outlink(url, anchor, conf)); + } catch (MalformedURLException mue) { + LOG.warn("Invalid url: '" + url + "', skipping."); + } } } catch (Exception ex) { // if the matcher fails (perhaps a malformed URL) we just log it and move on