Author: siren
Date: Wed Nov 15 11:42:22 2006
New Revision: 475378

URL: http://svn.apache.org/viewvc?view=rev&rev=475378
Log:
added more junit tests
Modified:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
    lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?view=diff&rev=475378&r1=475377&r2=475378
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Wed Nov 15 11:42:22 2006
@@ -44,6 +44,10 @@
 /** Generates a subset of a crawl db to fetch. */
 public class Generator extends ToolBase {
 
+  public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
+  public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
+  public static final String CRAWL_TOP_N = "crawl.topN";
+  public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
   public static final Log LOG = LogFactory.getLog(Generator.class);
 
   public static class SelectorEntry implements Writable {

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=475378&r1=475377&r2=475378
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Wed Nov 15 11:42:22 2006
@@ -31,12 +31,10 @@
 import junit.framework.TestCase;
 
 /**
- * Basic generator test:
- * 1. Insert entries in crawldb
- * 2. Generates entries to fetch
- * 3. Verifies that number of generated urls match
- * 4. Verifies that highest scoring urls are generated
-
+ * Basic generator test. 1. Insert entries in crawldb 2. Generates entries to
+ * fetch 3. Verifies that number of generated urls match 4. Verifies that
+ * highest scoring urls are generated
+ *
  * @author nutch-dev <nutch-dev at lucene.apache.org>
  *
  */
@@ -50,11 +48,11 @@
 
   FileSystem fs;
 
-  final static Path testdir=new Path("build/test/generator-test");
+  final static Path testdir = new Path("build/test/generator-test");
 
   protected void setUp() throws Exception {
     conf = CrawlDBTestUtil.createConfiguration();
-    fs=FileSystem.get(conf);
+    fs = FileSystem.get(conf);
     fs.delete(testdir);
   }
 
@@ -70,81 +68,243 @@
   }
 
   /**
-   * Test that generator generates fetchlist ordered by score (desc)
-
+   * Test that generator generates fetchlist ordered by score (desc).
+   *
    * @throws Exception
    */
   public void testGenerateHighest() throws Exception {
-    int NUM_RESULTS=2;
-
+    final int NUM_RESULTS = 2;
+
     ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
-
-    for(int i=0;i<=100;i++){
-      list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/" + pad(i)),
-          new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
+
+    for (int i = 0; i <= 100; i++) {
+      list.add(createURLCrawlDatum("http://aaa/" + pad(i),
+          1, i));
     }
-
-    dbDir = new Path(testdir, "crawldb");
-    segmentsDir = new Path(testdir, "segments");
-    fs.mkdirs(dbDir);
-    fs.mkdirs(segmentsDir);
-
-    // create crawldb
-    CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
-
-    // generate segment
-    Generator g=new Generator(conf);
-    Path generatedSegment=g.generate(dbDir, segmentsDir, -1, NUM_RESULTS, Long.MAX_VALUE);
-
-    Path fetchlist=new Path(new Path(generatedSegment, CrawlDatum.GENERATE_DIR_NAME), "part-00000");
-
-    // verify results
-    SequenceFile.Reader reader=new SequenceFile.Reader(fs, fetchlist, conf);
-
-    ArrayList<URLCrawlDatum> l=new ArrayList<URLCrawlDatum>();
-
-    READ:
-      do {
-      Text key=new Text();
-      CrawlDatum value=new CrawlDatum();
-      if(!reader.next(key, value)) break READ;
-      l.add(new URLCrawlDatum(key, value));
-    } while(true);
-    reader.close();
+    createCrawlDB(list);
+    Path generatedSegment = generateFetchlist(NUM_RESULTS, conf);
+
+    Path fetchlist = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> l = readContents(fetchlist);
+
     // sort urls by score desc
     Collections.sort(l, new ScoreComparator());
 
-    //verify we got right amount of records
+    // verify we got right amount of records
     assertEquals(NUM_RESULTS, l.size());
 
-    //verify we have the highest scoring urls
+    // verify we have the highest scoring urls
     assertEquals("http://aaa/100", (l.get(0).url.toString()));
     assertEquals("http://aaa/099", (l.get(1).url.toString()));
   }
 
   private String pad(int i) {
-    String s=Integer.toString(i);
-    while(s.length()<3)
-      s="0" + s;
+    String s = Integer.toString(i);
+    while (s.length() < 3) {
+      s = "0" + s;
+    }
     return s;
   }
 
   /**
-   * Comparator that sorts by score desc
+   * Comparator that sorts by score desc.
    */
   public class ScoreComparator implements Comparator<URLCrawlDatum> {
     public int compare(URLCrawlDatum tuple1, URLCrawlDatum tuple2) {
-
-      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0)
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() < 0) {
         return -1;
-      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0)
+      }
+      if (tuple2.datum.getScore() - tuple1.datum.getScore() > 0) {
        return 1;
-
+      }
       return 0;
    }
+  }
+
+  /**
+   * Test that generator obeys the property "generate.max.per.host".
+   * @throws Exception
+   */
+  public void testGenerateHostLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index1.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index2.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.com/index3.html",
+        1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Test that generator obeys the property "generate.max.per.host" when
+   * "generate.max.per.host.by.ip" is set to true.
+   * @throws Exception
+   */
+  public void testGenerateHostIPLimit() throws Exception {
+    ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
+
+    list.add(createURLCrawlDatum("http://www.example.com/index.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.net/index.html",
+        1, 1));
+    list.add(createURLCrawlDatum("http://www.example.org/index.html",
+        1, 1));
+
+    createCrawlDB(list);
+
+    Configuration myConfiguration = new Configuration(conf);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 1);
+    myConfiguration.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);
+
+    Path generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    Path fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    ArrayList<URLCrawlDatum> fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(1, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 2);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(2, fetchList.size());
+
+    myConfiguration = new Configuration(myConfiguration);
+    myConfiguration.setInt(Generator.GENERATE_MAX_PER_HOST, 3);
+    generatedSegment = generateFetchlist(Integer.MAX_VALUE, myConfiguration);
+
+    fetchlistPath = new Path(new Path(generatedSegment,
+        CrawlDatum.GENERATE_DIR_NAME), "part-00000");
+
+    fetchList = readContents(fetchlistPath);
+
+    // verify we got right amount of records
+    assertEquals(3, fetchList.size());
+  }
+
+  /**
+   * Read contents of fetchlist.
+   * @param fetchlist path to generated fetchlist
+   * @return generated {@link URLCrawlDatum} objects
+   * @throws IOException
+   */
+  private ArrayList<URLCrawlDatum> readContents(Path fetchlist) throws IOException {
+    // verify results
+    SequenceFile.Reader reader = new SequenceFile.Reader(fs, fetchlist, conf);
+
+    ArrayList<URLCrawlDatum> l = new ArrayList<URLCrawlDatum>();
+
+    READ: do {
+      Text key = new Text();
+      CrawlDatum value = new CrawlDatum();
+      if (!reader.next(key, value)) {
+        break READ;
+      }
+      l.add(new URLCrawlDatum(key, value));
+    } while (true);
+
+    reader.close();
+    return l;
+  }
+
+  /**
+   * Generate fetchlist.
+   * @param numResults number of results to generate
+   * @param config Configuration to use
+   * @return path to generated segment
+   * @throws IOException
+   */
+  private Path generateFetchlist(int numResults, Configuration config) throws IOException {
+    // generate segment
+    Generator g = new Generator(config);
+    Path generatedSegment = g.generate(dbDir, segmentsDir, -1, numResults,
+        Long.MAX_VALUE);
+    return generatedSegment;
+  }
+
+  /**
+   * Creates CrawlDB.
+   *
+   * @param list database contents
+   * @throws IOException
+   * @throws Exception
+   */
+  private void createCrawlDB(ArrayList<URLCrawlDatum> list) throws IOException,
+      Exception {
+    dbDir = new Path(testdir, "crawldb");
+    segmentsDir = new Path(testdir, "segments");
+    fs.mkdirs(dbDir);
+    fs.mkdirs(segmentsDir);
+
+    // create crawldb
+    CrawlDBTestUtil.createCrawlDb(fs, dbDir, list);
+  }
+
+  /**
+   * Constructs new {@link URLCrawlDatum} from submitted parameters.
+   * @param url url to use
+   * @param fetchInterval {@link CrawlDatum#setFetchInterval(float)}
+   * @param score {@link CrawlDatum#setScore(float)}
+   * @return Constructed object
+   */
+  private URLCrawlDatum createURLCrawlDatum(final String url,
+      final float fetchInterval, final float score) {
+    return new CrawlDBTestUtil.URLCrawlDatum(new Text(url), new CrawlDatum(
+        CrawlDatum.STATUS_DB_UNFETCHED, fetchInterval, score));
   }
 }
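
For reference, a minimal sketch (not part of this commit) of how a caller might use the new Generator constants. NutchConfiguration.create() and the crawldb/segments paths are illustrative assumptions; the generate(...) arguments mirror the ones used by the tests above.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.crawl.Generator;
    import org.apache.nutch.util.NutchConfiguration;

    public class GenerateWithHostLimit {
      public static void main(String[] args) throws Exception {
        // Hypothetical driver; the crawldb/segments paths are placeholders.
        Configuration conf = NutchConfiguration.create();

        // Allow at most 10 URLs per host in the generated fetchlist
        // ("generate.max.per.host").
        conf.setInt(Generator.GENERATE_MAX_PER_HOST, 10);

        // Count hosts by resolved IP address rather than by hostname
        // ("generate.max.per.host.by.ip").
        conf.setBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, true);

        // Same call shape as the tests above: dbDir, segmentsDir, numLists
        // (-1 for the default), topN (effectively unlimited here), and
        // curTime (Long.MAX_VALUE, i.e. no fetch-time cutoff).
        Generator g = new Generator(conf);
        Path segment = g.generate(new Path("crawl/crawldb"),
            new Path("crawl/segments"), -1, Integer.MAX_VALUE, Long.MAX_VALUE);
        System.out.println("Generated segment: " + segment);
      }
    }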