svn commit: r387650 - in /lucene/nutch/trunk/src/plugin/urlfilter-regex: ./ sample/ src/java/org/apache/nutch/net/ src/test/ src/test/org/ src/test/org/apache/ src/test/org/apache/nutch/ src/test/org/apache/nutch/net/

jerome Tue, 21 Mar 2006 14:27:21 -0800

Author: jerome
Date: Tue Mar 21 14:26:56 2006
New Revision: 387650

URL: http://svn.apache.org/viewcvs?rev=387650&view=rev
Log:
urlfilter-regex now uses the lib-regex-filter framework.
Add some unit tests.


Added:
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
    lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/
    
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java
   (with props)
Modified:
    lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml
    
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java

Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml?rev=387650&r1=387649&r2=387650&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Tue Mar 21 14:26:56 
2006
@@ -4,4 +4,29 @@
 
   <import file="../build-plugin.xml"/>
 
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
 </project>

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules?rev=387650&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules Tue 
Mar 21 14:26:56 2006
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip .fr .org and .net domains
+-^.*//.*\.fr/
+-^.*//.*\.org/
+-^.*//.*\.net/
+
+# accept everything else
++.

Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls?rev=387650&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls (added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls Tue 
Mar 21 14:26:56 2006
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules?rev=387650&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules 
Tue Mar 21 14:26:56 2006
@@ -0,0 +1,27 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept hosts in MY.DOMAIN.NAME
++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
+
+# skip everything else
+-.

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls?rev=387650&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls 
Tue Mar 21 14:26:56 2006
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:[EMAIL PROTECTED]
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules?rev=387650&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules 
Tue Mar 21 14:26:56 2006
@@ -0,0 +1,22 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+
+# skip URLs containing certain characters as probable queries, etc.
+-[?*!@=]
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+-.*(/.+?)/.*?\1/.*?\1/
+
+# accept anything else
++.

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls?rev=387650&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls 
(added)
+++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls 
Tue Mar 21 14:26:56 2006
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:[EMAIL PROTECTED]
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
+-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
+-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

Modified: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=387650&r1=387649&r2=387650&view=diff
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
 (original)
+++ 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java
 Tue Mar 21 14:26:56 2006
@@ -13,185 +13,75 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.util.LogFormatter;
-
-import org.apache.nutch.plugin.Extension;
-import org.apache.nutch.plugin.PluginRepository;
-
+// JDK imports
 import java.io.Reader;
-import java.io.FileReader;
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
 import java.io.IOException;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
 
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.logging.Logger;
-import java.util.regex.*;
 
 /**
- * Filters URLs based on a file of regular expressions. The file is named by
- * (1) property "urlfilter.regex.file" in ./conf/nutch-default.xml, and
- * (2) attribute "file" in plugin.xml of this plugin
- * Attribute "file" has higher precedence if defined.
- *
- * <p>The format of this file is:
- * <pre>
- * [+-]<regex>
- * </pre>
- * where plus means go ahead and index it and minus means no.
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
  */
-
-public class RegexURLFilter implements URLFilter {
-
-  private static final Logger LOG =
-    LogFormatter.getLogger(RegexURLFilter.class.getName());
-
-  // read in attribute "file" of this plugin.
-  private static String attributeFile = null;
-
-  private static class Rule {
-    public Pattern pattern;
-    public boolean sign;
-    public String regex;
-  }
-
-  private List rules;
-
-  private Configuration conf;
+public class RegexURLFilter extends RegexURLFilterBase {
 
   public RegexURLFilter() {
+    super();
   }
 
   public RegexURLFilter(String filename)
     throws IOException, PatternSyntaxException {
-    rules = readConfigurationFile(new FileReader(filename));
+    super(filename);
   }
 
-  public synchronized String filter(String url) {
-    Iterator i=rules.iterator();
-    while(i.hasNext()) {
-      Rule r=(Rule) i.next();
-      Matcher matcher = r.pattern.matcher(url);
-
-      if (matcher.find()) {
-        //System.out.println("Matched " + r.regex);
-        return r.sign ? url : null;
-      }
-    };
-        
-    return null;   // assume no go
-  }
-
-  //
-  // Format of configuration file is
-  //    
-  // [+-]<regex>
-  //
-  // where plus means go ahead and index it and minus means no.
-  // 
+  RegexURLFilter(Reader reader)
+    throws IOException, IllegalArgumentException {
+    super(reader);
+  }
 
-  private static List readConfigurationFile(Reader reader)
-    throws IOException, PatternSyntaxException {
+  
+  /* ----------------------------------- *
+   * <implementation:RegexURLFilterBase> *
+   * ----------------------------------- */
+  
+  // Inherited Javadoc
+  protected String getRulesFile(Configuration conf) {
+    return conf.get("urlfilter.regex.file");
+  }
 
-    BufferedReader in=new BufferedReader(reader);
-    List rules=new ArrayList();
-    String line;
-       
-    while((line=in.readLine())!=null) {
-      if (line.length() == 0)
-        continue;
-      char first=line.charAt(0);
-      boolean sign=false;
-      switch (first) {
-      case '+' : 
-        sign=true;
-        break;
-      case '-' :
-        sign=false;
-        break;
-      case ' ' : case '\n' : case '#' :           // skip blank & comment lines
-        continue;
-      default :
-        throw new IOException("Invalid first character: "+line);
-      }
-
-      String regex=line.substring(1);
-
-      Rule rule=new Rule();
-      rule.pattern=Pattern.compile(regex);
-      rule.sign=sign;
-      rule.regex=regex;
-      rules.add(rule);
-    }
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+  
+  /* ------------------------------------ *
+   * </implementation:RegexURLFilterBase> *
+   * ------------------------------------ */
 
-    return rules;
+  
+  public static void main(String args[]) throws IOException {
+    main(new RegexURLFilter(), args);
   }
 
-  public static void main(String args[])
-    throws IOException, PatternSyntaxException {
 
-    RegexURLFilter filter=new RegexURLFilter();
-    BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
-    String line;
-    while((line=in.readLine())!=null) {
-      String out=filter.filter(line);
-      if(out!=null) {
-        System.out.print("+");
-        System.out.println(out);
-      } else {
-        System.out.print("-");
-        System.out.println(line);
-      }
+  private class Rule extends RegexRule {
+    
+    private Pattern pattern;
+    
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      pattern = Pattern.compile(regex);
     }
-  }
 
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    String pluginName = "urlfilter-regex";
-    Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
-        URLFilter.class.getName()).getExtensions();
-    for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
-      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
-        attributeFile = extension.getAttribute("file");
-        break;
-      }
-    }
-    if (attributeFile != null && attributeFile.trim().equals(""))
-      attributeFile = null;
-    if (attributeFile != null) {
-      LOG.info("Attribute \"file\" is defined for plugin " + pluginName
-          + " as " + attributeFile);
-    } else {
-      //LOG.warning("Attribute \"file\" is not defined in plugin.xml for plugin "+pluginName);
-    }
-    String file = conf.get("urlfilter.regex.file");
-    // attribute "file" takes precedence if defined
-    if (attributeFile != null)
-      file = attributeFile;
-    Reader reader = conf.getConfResourceAsReader(file);
-
-    if (reader == null) {
-      LOG.severe("Can't find resource: " + file);
-    } else {
-      try {
-        rules = readConfigurationFile(reader);
-      } catch (IOException e) {
-        LOG.severe(e.getMessage());
-        //TODO [EMAIL PROTECTED]: throw Exception? Because broken api.
-        throw new RuntimeException(e.getMessage(), e);
-      }
+    protected boolean match(String url) {
+      return pattern.matcher(url).find();
     }
   }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
+  
 }

Added: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java?rev=387650&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java
 (added)
+++ 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java
 Tue Mar 21 14:26:56 2006
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+// JUnit imports
+import junit.framework.Test;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ *
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+  
+  public TestRegexURLFilter(String testName) {
+    super(testName);
+  }
+  
+  public static Test suite() {
+    return new TestSuite(TestRegexURLFilter.class);
+  }
+  
+  public static void main(String[] args) {
+    TestRunner.run(suite());
+  }
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new RegexURLFilter(rules);
+    } catch (IOException e) {
+      fail(e.toString());
+      return null;
+    }
+  }
+  
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+
+}

Propchange: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

svn commit: r387650 - in /lucene/nutch/trunk/src/plugin/urlfilter-regex: ./ sample/ src/java/org/apache/nutch/net/ src/test/ src/test/org/ src/test/org/apache/ src/test/org/apache/nutch/ src/test/org/apache/nutch/net/

Reply via email to