Author: dogacan
Date: Mon Jun 18 11:13:15 2007
New Revision: 548429

URL: http://svn.apache.org/viewvc?view=rev&rev=548429
Log:
NUTCH-489 - URLFilter-suffix management of the url path when the url contains some query parameters.

Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/conf/suffix-urlfilter.txt lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=548429&r1=548428&r2=548429 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Mon Jun 18 11:13:15 2007 @@ -41,6 +41,8 @@ 13. NUTCH-485 - Change HtmlParseFilter 's to return ParseResult object instead of Parse object. (Gal Nitzan via dogacan) +14. NUTCH-489 - URLFilter-suffix management of the url path when the url contains some query parameters. (Emmanuel Joke via dogacan) + Release 0.9 - 2007-04-02 1. Changed log4j confiquration to log to stdout on commandline Modified: lucene/nutch/trunk/conf/suffix-urlfilter.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/suffix-urlfilter.txt?view=diff&rev=548429&r1=548428&r2=548429 ============================================================================== --- lucene/nutch/trunk/conf/suffix-urlfilter.txt (original) +++ lucene/nutch/trunk/conf/suffix-urlfilter.txt Mon Jun 18 11:13:15 2007 @@ -2,6 +2,8 @@ # case-insensitive, allow unknown suffixes +I +# uncomment the line below to filter on url path +#+P ### prohibit these # pictures Modified: lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?view=diff&rev=548429&r1=548428&r2=548429 ============================================================================== --- 
lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Mon Jun 18 11:13:15 2007 @@ -22,7 +22,6 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.SuffixStringMatcher; -import org.apache.nutch.util.TrieStringMatcher; import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.PluginRepository; @@ -39,6 +38,9 @@ import java.util.List; import java.util.ArrayList; +import java.net.URL; +import java.net.MalformedURLException; + /** * Filters URLs based on a file of URL suffixes. The file is named by * <ol> @@ -127,7 +129,7 @@ private SuffixStringMatcher suffixes; private boolean modeAccept = false; - + private boolean filterFromPath = false; private boolean ignoreCase = false; private Configuration conf; @@ -146,6 +148,15 @@ if (ignoreCase) _url = url.toLowerCase(); else _url = url; + if (filterFromPath) { + try { + URL pUrl = new URL(_url); + _url = pUrl.getPath(); + } catch (MalformedURLException e) { + // don't care + } + } + String a = suffixes.shortestMatch(_url); if (a == null) { if (modeAccept) return url; @@ -185,12 +196,16 @@ break; case '-': allow = false; - if (line.length() > 1 && line.charAt(1) == 'I') + if(line.contains("P")) + filterFromPath = true; + if(line.contains("I")) ignore = true; break; case '+': allow = true; - if (line.length() > 1 && line.charAt(1) == 'I') + if(line.contains("P")) + filterFromPath = true; + if(line.contains("I")) ignore = true; break; default: @@ -284,5 +299,9 @@ public void setIgnoreCase(boolean ignoreCase) { this.ignoreCase = ignoreCase; + } + + public void setFilterFromPath(boolean filterFromPath) { + this.filterFromPath = filterFromPath; } } Modified: lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?view=diff&rev=548429&r1=548428&r2=548429 ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Mon Jun 18 11:13:15 2007 @@ -35,7 +35,8 @@ "# this is a comment\n" + "\n" + ".gif\n" + - ".jpg\n"; + ".jpg\n" + + ".js\n"; private static final String[] urls = new String[] { "http://www.example.com/test.gif", @@ -44,6 +45,8 @@ "http://www.example.com/test.JPG", "http://www.example.com/test.html", "http://www.example.com/test.HTML", + "http://www.example.com/test.html?q=abc.js", + "http://www.example.com/test.js?foo=bar&baz=bar#12333", }; private static String[] urlsModeAccept = new String[] { @@ -52,7 +55,9 @@ null, urls[3], urls[4], - urls[5] + urls[5], + null, + urls[7] }; private static String[] urlsModeReject = new String[] { @@ -61,6 +66,8 @@ urls[2], null, null, + null, + urls[6], null }; @@ -70,18 +77,44 @@ null, null, urls[4], - urls[5] + urls[5], + null, + urls[7] }; - + private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0], urls[1], urls[2], urls[3], null, + null, + urls[6], + null + }; + + private static String[] urlsModeAcceptAndPathFilter = new String[] { + null, + urls[1], + null, + urls[3], + urls[4], + urls[5], + urls[6], null }; + private static String[] urlsModeAcceptAndNonPathFilter = new String[] { + null, + urls[1], + null, + urls[3], + urls[4], + urls[5], + null, + urls[7] + }; + private SuffixURLFilter filter = null; public TestSuffixURLFilter(String testName) { @@ -129,6 +162,22 @@ filter.setModeAccept(false); for (int i = 0; i < urls.length; i++) { assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i])); + 
} + } + + public void testModeAcceptAndNonPathFilter() { + filter.setModeAccept(true); + filter.setFilterFromPath(false); + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i])); + } + } + + public void testModeAcceptAndPathFilter() { + filter.setModeAccept(true); + filter.setFilterFromPath(true); + for (int i = 0; i < urls.length; i++) { + assertTrue(urlsModeAcceptAndPathFilter[i] == filter.filter(urls[i])); } } ------------------------------------------------------------------------- This SF.net email is sponsored by DB2 Express Download DB2 Express C - the FREE version of DB2 express and take control of your XML. No limits. Just data. Click to get it now. http://sourceforge.net/powerbar/db2/ _______________________________________________ Nutch-cvs mailing list Nutch-cvs@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/nutch-cvs