Author: jerome Date: Tue Mar 21 14:26:56 2006 New Revision: 387650 URL: http://svn.apache.org/viewcvs?rev=387650&view=rev Log: urlfilter-regex now uses the lib-regex-filter framework. Add some unit tests.
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java (with props) Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml?rev=387650&r1=387649&r2=387650&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/build.xml Tue Mar 21 14:26:56 2006 @@ -4,4 +4,29 @@ <import file="../build-plugin.xml"/> + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="jar" inheritall="false" dir="../lib-regex-filter"/> + <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- Add compilation dependencies to classpath --> + <path id="plugin.deps"> + <fileset dir="${nutch.root}/build"> + <include name="**/lib-regex-filter/*.jar" /> + </fileset> 
+ <pathelement location="${nutch.root}/build/lib-regex-filter/test"/> + </path> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/> + </target> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.rules, **/*.urls"/> + </copy> + </project> Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.rules Tue Mar 21 14:26:56 2006 @@ -0,0 +1,26 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip .fr .org and .net domains +-^.*//.*\.fr/ +-^.*//.*\.org/ +-^.*//.*\.net/ + +# skip everything else ++. 
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/Benchmarks.urls Tue Mar 21 14:26:56 2006 @@ -0,0 +1,297 @@ ++http://www.hostip.info/ +-http://www.elanceur.org/Articles/OntologieSurfaite.html ++http://www.opensymphony.com/quartz/ +-http://www.portletbridge.org/saxbenchmark/index.html ++http://www.lesmotsdelinfo.com/ ++http://usefulinc.com/doap/ ++http://www.codezoo.com/ ++http://search.infocious.com/ +-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html ++http://www.brics.dk/%7Eamoeller/automaton/ ++http://jazzz.com/wp.html ++http://www.maxkiesler.com/index.php ++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html ++http://www.alias-i.com/lingpipe/ +-http://johnny.ihackstuff.com/index.php?module=prodreviews +-http://www.spurl.net/ ++http://www.dropload.com/ ++http://vivisimo.com/ ++http://www.marumushi.com/apps/newsmap/newsmap.cfm ++http://www.ixquick.com/ +-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html ++http://www.mail-archive.com/ ++http://www.spymac.com/ +-http://browsers.evolt.org/ +-http://www.oswd.org/ ++http://www.stayinvisible.com/index.pl ++http://java.sun.com/j2se/1.4.2/docs/api/index.html ++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx ++http://www.bloglines.com/ +-http://www.fckeditor.net/ ++http://search.msn.com/ +-http://www.grub.org/ ++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html +-http://www.mnot.net/cache_docs/ +-http://www.furl.net/ ++http://www.blogpulse.com/ ++http://www.googlefight.com/ ++http://www.rokulabs.com/ 
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php +-http://www.batbox.org/wrt54g-linux.html +-http://en.wikipedia.org/wiki/%s ++http://www.sipcenter.com/ ++http://www.merriampark.com/ld.htm ++http://anon.inf.tu-dresden.de/index_en.html ++http://www.pluck.com/ ++http://www.tiddlywiki.com/ ++http://www.jux2.com/ ++http://clusty.com/ +-http://findability.org/ ++http://www.searchengineshowdown.com/ ++http://www.nhacks.com/email/index.php ++http://www.koders.com/ ++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf ++http://www.gmailwiki.com/index.php/Main_Page ++http://www.tadalist.com/ ++http://www.net2ftp.com/ ++http://www.streamload.com/ ++http://www.lucazappa.com/brilliantMaker/buttonImage.php ++http://www.hybernaut.com/bdv/delicious-import.html ++http://www.gtmcknight.com/buttons/ ++http://amb.vis.ne.jp/mozilla/scrapbook/ ++http://g-metrics.com/index.php +-http://tor.eff.org/ ++http://www.search-this.com/search_engine_decoder.asp ++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html ++http://www.adaptivepath.com/publications/essays/archives/000385.php +-http://isnoop.net/gmail/ +-http://openweb.eu.org/ ++http://www.mistergooddeal.com/ ++http://javatoolbox.com/ +-http://www.freenews.fr/ ++http://www.wikiwax.com/ +-http://today.java.net/pub/a/today/2005/04/21/farm.html ++http://users.skynet.be/J.Beever/pave.htm ++http://www.lundi8h.com/ ++http://www.snap.com/ ++http://www.goosee.com/puppy/index.shtml +-http://www.softwarefreedom.org/index.html +-http://y.20q.net/ ++http://www.bitty.com/ ++http://www.lafraise.com/ +-http://www.liquidinformation.org/ ++http://www.searchtools.com/ ++http://www.martinfowler.com/articles/injection.html ++http://pdos.csail.mit.edu/scigen/ +-http://developer.yahoo.net/blog/ ++http://blogger-templates.blogspot.com/ ++http://phpadsnew.com/two/ ++http://www.langreiter.com/exec/yahoo-vs-google.html +-http://www.dataparksearch.org/ +-http://www.yubnub.org/ +-http://www.fing.org/ +-http://www.swish-e.org/ 
+-http://www.openajax.net/wordpress/ ++http://crypto.stanford.edu/PwdHash/ ++http://www.html-kit.com/favicon/ +-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1 ++http://www.durhamtownship.com/ ++http://jiwire.com/ ++http://www.insilmaril.de/vym/ +-http://www.spreadshirt.net/ ++http://www.goffice.com/ ++http://www.writely.com/ ++http://www.milindparikh.com/ ++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html ++http://www.wikyblog.com/Map/Guest/Home +-http://www.kottke.org/05/08/googleos-webos ++http://www.rollyo.com/ ++http://www.meebo.com/ ++http://www.factbites.com/ ++http://www.placeopedia.com/ ++http://swoogle.umbc.edu/ ++http://www.viaduc.com/ +-http://demo.wikiwyg.net/wikiwyg/demo/standalone/ ++http://podcasts.yahoo.com/ +-http://beaglewiki.org/Main_Page ++http://yq.search.yahoo.com/ +-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1 ++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html ++http://socialight.com/ ++http://www.lexxe.com/ ++http://www.xom.nu/ ++http://www.turboprint.de/ ++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27 ++http://www.wi-fiplanet.com/tutorials/article.php/3562391 ++http://particletree.com/features/10-tips-to-a-better-form/ ++http://www.songbirdnest.com/ +-http://www.w3.org/Talks/Tools/Slidy/ +-http://www.compassframework.org/display/SITE/Home ++http://motrech.blogspot.com/ ++http://www.moteurzine.com/ ++http://www.mex-search.com/ +-http://beta.previewseek.com/?mdc=y&twin=n&ilang=french ++http://www.goshme.com/ ++http://rialto.application-servers.com/ ++http://www.multe-pass.com/ ++http://www.tailrank.com/ ++http://www.vandertramp.com/INTERNETDOWN/ ++http://www.letterjames.de/index.html ++http://code.google.com/index.html ++http://www.kritx.com/ ++http://performancing.com/firefox ++http://www.mywebsearch.com/ +-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1 ++http://www.lukew.com/resources/articles/blogs2.asp 
+-http://www.hyperwords.net/ ++http://ajax.parish.ath.cx/translator/ ++http://www.maplandia.com/ +-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages ++http://onefeed.com/index.php ++http://www.file-swap.com/ +-http://opennlp.org/ ++http://mindprod.com/jgloss/encoding.html ++http://code.google.com/webstats/index.html ++http://www.freeweb-hosting.com/google_pagerank_pr_checker/ +-http://www.framakey.org/ +-http://microformats.org/wiki/hreview +-http://www.ashesandsnow.org/index2.html +-http://uima-framework.sourceforge.net/ ++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html +-http://www.anandtech.com/IT/showdoc.aspx?i=2523&p=2 ++http://fr.techcrunch.com/ +-http://developer.yahoo.net/yui/ ++http://www.fredrikodman.com/ ++http://www.mpirical.com/companion/mpirical_companion.html ++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html +-http://k9copy.free.fr/ +-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 +-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design +-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2 ++http://blogokat.canalblog.com/archives/2005/11/02/882454.html ++http://robur.slu.se/jensl/xmlclitools/ +-http://www.internetactu.net/?p=6291 +-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1 ++http://www.memodata.com/2004/fr/alexandria/ +-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave ++http://www.randomerror.com/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/ +-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395 +-http://interstices.info/display.jsp?id=c_15918 ++http://www.tech-invite.com/ ++http://www.croczilla.com/zap +-http://www.libervis.com/modules/wordpress/?p=13 ++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/ +-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm 
++http://www.influo.com/ ++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html +-http://www.addnb.org/fr/docs/webinvisible.htm +-http://manhack.net/ +-http://www.jibaku.net/ ++http://www.pipologie.com/ ++http://christophenoel.blogspot.com/ +-http://www.seekport.fr/seekbot/ ++http://beta.exalead.com/ +-http://www.boolgum.fr/index.html ++http://www.kesako.canalblog.com/ ++http://loran.blogspot.com/ ++http://outils-recherche.blogspot.com/ ++http://www.art-dept.com/artists/giacobbe/ ++http://www.meggould.netfirms.com/site_seeingIII.htm ++http://www.freedpi.com/ ++http://www.frenchfred.com/ ++http://www.photoways.com/ +-http://freco.free.fr/index.htm +-http://triturages.free.fr/index.htm +-http://www.qsos.org/ ++http://www.alvis.info/alvis/ ++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/ +-http://www.shinux.org/ ++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml ++http://www.kurobox.com/online/tiki-index.php +-http://news.gmane.org/gmane.comp.misc.linkstation.linux ++http://www.imsbook.com/SIP-IMS-Standards-List.html +-http://incubator.apache.org/directory/subprojects/snickers/ +-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html +-http://sourceforge.net/projects/cryptix-asn1/ +-http://sourceforge.net/projects/basn/ +-http://asn1.elibel.tm.fr/fr/index.htm +-http://sourceforge.net/projects/a2j/ ++http://www.degrouptest.com/ ++http://interstices.info/ ++http://louvre-boite.viabloga.com/news/18.shtml +-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html ++http://poiplace.oabsoftware.nl/ +-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759 +-http://www.yoono.com/favorites.jsp?user-id=lquerel +-http://www.librecours.org/cgi-bin/main +-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1 +-http://limo.sourceforge.net/ ++http://www-scf.usc.edu/%7Emattmann/ 
++http://spaces.msn.com/members/famillezen/ +-http://photos.joune.org/ +-http://www.canon.fr/paperart/ ++http://flash.eastweb.ru/files/20051024092150.swf ++http://www.xsltwiki.com/index.php/Main_Page ++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/ +-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31 ++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html +-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/ ++http://www.aeliosfinance.com/ ++http://www.capital-it.com/ +-http://www.tradedoubler.fr/pan/public/solutions/publisher +-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm ++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/ ++http://wanabo.com/ +-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1 +-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam ++http://aeliosfinance.com/ ++http://www.centreincubation.com/ ++http://www.franceincubation.com/ +-http://www.oseo.fr/ ++http://www.i18nfaq.com/chardet.html +-http://cpdetector.sourceforge.net/ ++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles ++http://chezlorry.ca/Accueil.htm ++http://cetnia.blogs.com/d_lires/ +-http://www.directwine.fr/ ++http://www.new-phenix.com/ +-http://upnp.sourceforge.net/ +-http://www.pixmania.fr/ +-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3 ++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/ ++http://www.stepnewz.com/sn/default.asp ++http://opquast.com/ +-http://www.freeplayer.org/ +-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie +-http://atomcomputer.free.fr/fbox/ +-http://www.internetactu.net/index.php?p=6100 +-http://mammouthland.free.fr/cours/css/genecss.php 
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1 ++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html +-http://xml.apache.org/xalan-j/extensions.html ++http://developers.sun.com/foryourbusiness/jcc/ ++http://blogs.sun.com/roller/page/roumen/Weblog +-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1 +-http://blog.developpez.com/index.php?blog=51&p=1389&more=1&c=1&tb=1&pb=1 ++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/ ++http://odur.let.rug.nl/%7Evannoord/ +-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html +-http://artist.inist.fr/ ++http://www.elra.info/ +-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO ++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability ++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval ++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/ ++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/ ++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/ ++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/ ++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/ ++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html +-http://www.lexique.org/ ++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/ ++http://www.streamium.com/products/mx6000i/ +-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&scy=FR&slg=fr +-http://store.interact-tv.com/store/product_info.php?cPath=9&products_id=73 ++http://www.tversity.com/ +-http://www.aspseek.org/index.php \ No newline at end of file Added: 
lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules Tue Mar 21 14:26:56 2006 @@ -0,0 +1,27 @@ +# The url filter file used by the crawl command. + +# Better for intranet crawling. +# Be sure to change MY.DOMAIN.NAME to your domain name. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file:, ftp:, & mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +-.*(/.+?)/.*?\1/.*?\1/ + +# accept hosts in MY.DOMAIN.NAME ++^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/ + +# skip everything else +-. 
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/IntranetCrawling.urls Tue Mar 21 14:26:56 2006 @@ -0,0 +1,8 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[EMAIL PROTECTED] +-news://any.news.server/comp.lang.java +-whois:/nutch.org ++http://MY.DOMAIN.NAME/ ++http://MY.DOMAIN.NAME/nutch ++http://www.MY.DOMAIN.NAME/ Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules Tue Mar 21 14:26:56 2006 @@ -0,0 +1,22 @@ +# The default url filter. +# Better for whole-internet crawling. + +# Each non-comment, non-blank line contains a regular expression +# prefixed by '+' or '-'. The first matching pattern in the file +# determines whether a URL is included or ignored. If no pattern +# matches, the URL is ignored. + +# skip file: ftp: and mailto: urls +-^(file|ftp|mailto): + +# skip image and other suffixes we can't yet parse +-\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$ + +# skip URLs containing certain characters as probable queries, etc. +-[?*!@=] + +# skip URLs with slash-delimited segment that repeats 3+ times, to break loops +-.*(/.+?)/.*?\1/.*?\1/ + +# accept anything else ++. 
Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/sample/WholeWebCrawling.urls Tue Mar 21 14:26:56 2006 @@ -0,0 +1,11 @@ +-file://home/jc/nutch/index.html +-ftp://ftp.apache.org/nutch.html +-mailto:[EMAIL PROTECTED] ++news://any.news.server/comp.lang.java ++whois:/nutch.org +-http://www.nutch.org/nutch.gif +-http://www.nutch.org/nutch.eps +-http://www.nutch.org/nutch?q=nutch ++http://www.nutch.org/ +-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/ +-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/ Modified: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java?rev=387650&r1=387649&r2=387650&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java (original) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/net/RegexURLFilter.java Tue Mar 21 14:26:56 2006 @@ -13,185 +13,75 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.nutch.net; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.util.LogFormatter; - -import org.apache.nutch.plugin.Extension; -import org.apache.nutch.plugin.PluginRepository; - +// JDK imports import java.io.Reader; -import java.io.FileReader; -import java.io.BufferedReader; -import java.io.InputStreamReader; import java.io.IOException; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.logging.Logger; -import java.util.regex.*; /** - * Filters URLs based on a file of regular expressions. The file is named by - * (1) property "urlfilter.regex.file" in ./conf/nutch-default.xml, and - * (2) attribute "file" in plugin.xml of this plugin - * Attribute "file" has higher precedence if defined. - * - * <p>The format of this file is: - * <pre> - * [+-]<regex> - * </pre> - * where plus means go ahead and index it and minus means no. + * Filters URLs based on a file of regular expressions using the + * {@link java.util.regex Java Regex implementation}. */ - -public class RegexURLFilter implements URLFilter { - - private static final Logger LOG = - LogFormatter.getLogger(RegexURLFilter.class.getName()); - - // read in attribute "file" of this plugin. 
- private static String attributeFile = null; - - private static class Rule { - public Pattern pattern; - public boolean sign; - public String regex; - } - - private List rules; - - private Configuration conf; +public class RegexURLFilter extends RegexURLFilterBase { public RegexURLFilter() { + super(); } public RegexURLFilter(String filename) throws IOException, PatternSyntaxException { - rules = readConfigurationFile(new FileReader(filename)); + super(filename); } - public synchronized String filter(String url) { - Iterator i=rules.iterator(); - while(i.hasNext()) { - Rule r=(Rule) i.next(); - Matcher matcher = r.pattern.matcher(url); - - if (matcher.find()) { - //System.out.println("Matched " + r.regex); - return r.sign ? url : null; - } - }; - - return null; // assume no go - } - - // - // Format of configuration file is - // - // [+-]<regex> - // - // where plus means go ahead and index it and minus means no. - // + RegexURLFilter(Reader reader) + throws IOException, IllegalArgumentException { + super(reader); + } - private static List readConfigurationFile(Reader reader) - throws IOException, PatternSyntaxException { + + /* ----------------------------------- * + * <implementation:RegexURLFilterBase> * + * ----------------------------------- */ + + // Inherited Javadoc + protected String getRulesFile(Configuration conf) { + return conf.get("urlfilter.regex.file"); + } - BufferedReader in=new BufferedReader(reader); - List rules=new ArrayList(); - String line; - - while((line=in.readLine())!=null) { - if (line.length() == 0) - continue; - char first=line.charAt(0); - boolean sign=false; - switch (first) { - case '+' : - sign=true; - break; - case '-' : - sign=false; - break; - case ' ' : case '\n' : case '#' : // skip blank & comment lines - continue; - default : - throw new IOException("Invalid first character: "+line); - } - - String regex=line.substring(1); - - Rule rule=new Rule(); - rule.pattern=Pattern.compile(regex); - rule.sign=sign; - 
rule.regex=regex; - rules.add(rule); - } + // Inherited Javadoc + protected RegexRule createRule(boolean sign, String regex) { + return new Rule(sign, regex); + } + + /* ------------------------------------ * + * </implementation:RegexURLFilterBase> * + * ------------------------------------ */ - return rules; + + public static void main(String args[]) throws IOException { + main(new RegexURLFilter(), args); } - public static void main(String args[]) - throws IOException, PatternSyntaxException { - RegexURLFilter filter=new RegexURLFilter(); - BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); - String line; - while((line=in.readLine())!=null) { - String out=filter.filter(line); - if(out!=null) { - System.out.print("+"); - System.out.println(out); - } else { - System.out.print("-"); - System.out.println(line); - } + private class Rule extends RegexRule { + + private Pattern pattern; + + Rule(boolean sign, String regex) { + super(sign, regex); + pattern = Pattern.compile(regex); } - } - public void setConf(Configuration conf) { - this.conf = conf; - String pluginName = "urlfilter-regex"; - Extension[] extensions = PluginRepository.get(conf).getExtensionPoint( - URLFilter.class.getName()).getExtensions(); - for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - if (extension.getDescriptor().getPluginId().equals(pluginName)) { - attributeFile = extension.getAttribute("file"); - break; - } - } - if (attributeFile != null && attributeFile.trim().equals("")) - attributeFile = null; - if (attributeFile != null) { - LOG.info("Attribute \"file\" is defined for plugin " + pluginName - + " as " + attributeFile); - } else { - //LOG.warning("Attribute \"file\" is not defined in plugin.xml for plugin "+pluginName); - } - String file = conf.get("urlfilter.regex.file"); - // attribute "file" takes precedence if defined - if (attributeFile != null) - file = attributeFile; - Reader reader = conf.getConfResourceAsReader(file); - - 
if (reader == null) { - LOG.severe("Can't find resource: " + file); - } else { - try { - rules = readConfigurationFile(reader); - } catch (IOException e) { - LOG.severe(e.getMessage()); - //TODO [EMAIL PROTECTED]: throw Exception? Because broken api. - throw new RuntimeException(e.getMessage(), e); - } + protected boolean match(String url) { + return pattern.matcher(url).find(); } } - - public Configuration getConf() { - return this.conf; - } - + } Added: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java?rev=387650&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java (added) +++ lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java Tue Mar 21 14:26:56 2006 @@ -0,0 +1,66 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +// JDK imports +import java.io.IOException; +import java.io.Reader; + +// JUnit imports +import junit.framework.Test; +import junit.framework.TestSuite; +import junit.textui.TestRunner; + + +/** + * JUnit based test of class <code>RegexURLFilter</code>. 
+ * + * @author Jérôme Charron + */ +public class TestRegexURLFilter extends RegexURLFilterBaseTest { + + public TestRegexURLFilter(String testName) { + super(testName); + } + + public static Test suite() { + return new TestSuite(TestRegexURLFilter.class); + } + + public static void main(String[] args) { + TestRunner.run(suite()); + } + + protected URLFilter getURLFilter(Reader rules) { + try { + return new RegexURLFilter(rules); + } catch (IOException e) { + fail(e.toString()); + return null; + } + } + + public void test() { + test("WholeWebCrawling"); + test("IntranetCrawling"); + bench(50, "Benchmarks"); + bench(100, "Benchmarks"); + bench(200, "Benchmarks"); + bench(400, "Benchmarks"); + bench(800, "Benchmarks"); + } + +} Propchange: lucene/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/net/TestRegexURLFilter.java ------------------------------------------------------------------------------ svn:eol-style = native