Author: siren Date: Sat Jun 10 12:30:34 2006 New Revision: 413356 URL: http://svn.apache.org/viewvc?rev=413356&view=rev Log: fixed plugin.xml and a bug in Subcollection.java, added testcase to verify functionality
Added: lucene/nutch/trunk/src/plugin/subcollection/src/test/org/ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Modified: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Modified: lucene/nutch/trunk/src/plugin/subcollection/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/plugin.xml?rev=413356&r1=413355&r2=413356&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/subcollection/plugin.xml Sat Jun 10 12:30:34 2006 @@ -10,15 +10,19 @@ </requires> <runtime> - <library name="subcollection.jar"/> + <library name="subcollection.jar"> + <export name="*"/> + </library> </runtime> - + <extension id="org.apache.nutch.searcher.subcollection.query" name="Subcollection Query Filter" point="org.apache.nutch.searcher.QueryFilter"> <implementation id="SubcollectionQueryFilter" - class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter" - raw-fields="subcollection"/> + class="org.apache.nutch.searcher.subcollection.SubcollectionQueryFilter"> + <parameter name="raw-fields" value="subcollection"/> + </implementation> + </extension> <extension id="org.apache.nutch.indexer.subcollection.indexing" Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=413356&r1=413355&r2=413356&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Sat Jun 10 12:30:34 2006 @@ -30,6 +30,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.nutch.util.DomUtil; +import org.apache.nutch.util.NutchConfiguration; import org.apache.xerces.dom.DocumentImpl; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -50,6 +51,13 @@ super(conf); init(); } + + /** + * Used for testing + */ + protected CollectionManager(){ + super(NutchConfiguration.create()); + } protected void init(){ try { @@ -60,26 +68,30 @@ InputStream input = getConf().getConfResourceAsInputStream( getConf().get("subcollections.config", DEFAULT_FILE_NAME)); - Element collections = DomUtil.getDom(input); - - if (collections != null) { - NodeList nodeList = collections - .getElementsByTagName(Subcollection.TAG_COLLECTION); - - LOG.info("file has" + nodeList.getLength() + " elements"); - - for (int i = 0; i < nodeList.getLength(); i++) { - Element scElem = (Element) nodeList.item(i); - Subcollection subCol = new Subcollection(getConf()); - subCol.initialize(scElem); - collectionMap.put(subCol.name, subCol); - } - } else { - LOG.info("Cannot find collections"); - } + parse(input); } catch (Exception e) { LOG.info("Error occured:" + e); e.printStackTrace(System.out); + } + } + + protected void parse(InputStream input) { + Element collections = DomUtil.getDom(input); + + if (collections != null) { + NodeList nodeList = collections + .getElementsByTagName(Subcollection.TAG_COLLECTION); + + LOG.info("file has" + 
nodeList.getLength() + " elements"); + + for (int i = 0; i < nodeList.getLength(); i++) { + Element scElem = (Element) nodeList.item(i); + Subcollection subCol = new Subcollection(getConf()); + subCol.initialize(scElem); + collectionMap.put(subCol.name, subCol); + } + } else { + LOG.info("Cannot find collections"); } } Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=413356&r1=413355&r2=413356&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original) +++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Sat Jun 10 12:30:34 2006 @@ -158,11 +158,13 @@ } /** - * Initialize SubCollection from dom element + * Initialize Subcollection from dom element * * @param collection */ public void initialize(Element collection) { + this.id = DOMUtil.getChildText( + collection.getElementsByTagName(TAG_ID).item(0)).trim(); this.name = DOMUtil.getChildText( collection.getElementsByTagName(TAG_NAME).item(0)).trim(); this.wlString = DOMUtil.getChildText( Added: lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=413356&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (added) +++ lucene/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Sat Jun 10 12:30:34 2006 @@ -0,0 +1,104 @@ +/* + * Copyright 2006 The Apache 
Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.collection; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.util.Collection; + +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestSubcollection extends TestCase { + + /**Test filtering logic + * + * @throws Exception + */ + public void testFilter() throws Exception { + Subcollection sc=new Subcollection(NutchConfiguration.create()); + sc.setWhiteList("www.nutch.org\nwww.apache.org"); + sc.setBlackList("jpg\nwww.apache.org/zecret/"); + + //matches whitelist + assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html")); + + //matches blacklist + assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html")); + assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg")); + + //no match + assertEquals(null, sc.filter("http://www.google.com/")); + } + + public void testInput(){ + StringBuffer xml=new StringBuffer(); + xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); + xml.append("<subcollections>"); + xml.append("<subcollection>"); + xml.append("<name>nutch collection</name>"); + xml.append("<id>nutch</id>"); + xml.append("<whitelist>"); + xml.append("http://lucene.apache.org/nutch/\n"); + xml.append("http://wiki.apache.org/nutch/\n"); + xml.append("</whitelist>"); + xml.append("<blacklist>"); + 
xml.append("http://www.xxx.yyy\n"); + xml.append("</blacklist>"); + xml.append("</subcollection>"); + xml.append("</subcollections>"); + + InputStream is=new ByteArrayInputStream(xml.toString().getBytes()); + + CollectionManager cm=new CollectionManager(); + cm.parse(is); + + Collection c=cm.getAll(); + + // test that size matches + assertEquals(1,c.size()); + + Subcollection collection=(Subcollection)c.toArray()[0]; + + //test collection id + assertEquals("nutch", collection.getId()); + + //test collection name + assertEquals("nutch collection", collection.getName()); + + //test whitelist + assertEquals(2,collection.whiteList.size()); + + String wlUrl=(String)collection.whiteList.get(0); + assertEquals("http://lucene.apache.org/nutch/", wlUrl); + + wlUrl=(String)collection.whiteList.get(1); + assertEquals("http://wiki.apache.org/nutch/", wlUrl); + + //matches whitelist + assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/")); + + //test blacklist + assertEquals(1,collection.blackList.size()); + + String blUrl=(String)collection.blackList.get(0); + assertEquals("http://www.xxx.yyy", blUrl); + + //no match + assertEquals(null, collection.filter("http://www.google.com/")); + } +}