Update of /cvsroot/nutch/playground/src/test/net/nutch/fetcher
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/test/net/nutch/fetcher

Added Files:
        TestFetcherContent.java TestRobotRulesParser.java 
        TestFetcherText.java TestFetcher.java TestFetcherOutput.java 
Log Message:
initial commit

--- NEW FILE: TestFetcherContent.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import java.io.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import junit.framework.TestCase;

/** Unit tests for FetcherContent. */

public class TestFetcherContent extends TestCase {
  public TestFetcherContent(String name) { super(name); }

  public void testFetcherContent() throws Exception {

    String page = "<HTML><BODY><H1>Hello World</H1><P>The Quick Brown Fox Jumped Over 
the Lazy Fox.</BODY></HTML>";

    FetcherContent r = new FetcherContent(page.getBytes("UTF8"));
                        
    TestWritable.testWritable(r);
  }
        
}

--- NEW FILE: TestRobotRulesParser.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import net.nutch.fetcher.RobotRulesParser.RobotRuleSet;

import junit.framework.TestCase;

/**
 * Unit tests for RobotRulesParser.
 *
 * <p>Each robots.txt body in {@link #ROBOTS_STRINGS} is parsed on behalf of
 * one or two agent names from {@link #AGENT_STRINGS}, and the resulting rule
 * set is queried with every path in {@link #TEST_PATHS}. The expected answers
 * live in {@link #ALLOWED}, indexed as [robots string][agent][path].
 */
public class TestRobotRulesParser extends TestCase {
  private static final String LF= "\n";
  private static final String CR= "\r";
  private static final String CRLF= "\r\n";
  

  /** robots.txt bodies under test; lines are separated by bare CR. */
  private static final String[] ROBOTS_STRINGS= new String[] {
    "User-Agent: Agent1 #foo" + CR 
    + "Disallow: /a" + CR 
    + "Disallow: /b/a" + CR 
    + "#Disallow: /c" + CR 
    + "" + CR 
    + "" + CR 
    + "User-Agent: Agent2 Agent3#foo" + CR 
    + "User-Agent: Agent4" + CR 
    + "Disallow: /d" + CR 
    + "Disallow: /e/d/" + CR
    + "" + CR 
    + "User-Agent: *" + CR 
    + "Disallow: /foo/bar/" + CR,
  };

  /** Agent names handed to the parser; the second index of ALLOWED follows this order. */
  private static final String[] AGENT_STRINGS= new String[] {
    "Agent1",
    "Agent2",
    "Agent3",
    "Agent4",
    "Agent5",
  };

  /**
   * Indexed as [robots string][agent]: true when that agent is not named in
   * that robots string (Agent5 only appears via the "*" record).
   */
  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {
    { 
      false, 
      false,
      false,
      false,
      true,
    }
  };

  /** Paths queried against every parsed rule set; the third index of ALLOWED follows this order. */
  private static final String[] TEST_PATHS= new String[] {
    "/a",
    "/a/",
    "/a/bloh/foo.html",
    "/b",
    "/b/a",
    "/b/a/index.html",
    "/b/b/foo.html",
    "/c",
    "/c/a",
    "/c/a/index.html",
    "/c/b/foo.html",
    "/d",
    "/d/a",
    "/e/a/index.html",
    "/e/d",
    "/e/d/foo.html",
    "/e/doh.html",
    "/f/index.html",
    "/foo/bar/baz.html",  
    "/f/",
  };

  /** Expected isAllowed results, indexed as [robots string][agent][path]. */
  private static final boolean[][][] ALLOWED= new boolean[][][] {
    { // ROBOTS_STRINGS[0]
      { // Agent1
        false,  // "/a",
        false,  // "/a/",
        false,  // "/a/bloh/foo.html",
        true,   // "/b",
        false,  // "/b/a",
        false,  // "/b/a/index.html",
        true,   // "/b/b/foo.html",
        true,   // "/c",
        true,   // "/c/a",
        true,   // "/c/a/index.html",
        true,   // "/c/b/foo.html",
        true,   // "/d",
        true,   // "/d/a",
        true,   // "/e/a/index.html",
        true,   // "/e/d",
        true,   // "/e/d/foo.html",
        true,   // "/e/doh.html",
        true,   // "/f/index.html",
        true,   // "/foo/bar/baz.html",
        true,   // "/f/",
      }, 
      { // Agent2
        true,   // "/a",
        true,   // "/a/",
        true,   // "/a/bloh/foo.html",
        true,   // "/b",
        true,   // "/b/a",
        true,   // "/b/a/index.html",
        true,   // "/b/b/foo.html",
        true,   // "/c",
        true,   // "/c/a",
        true,   // "/c/a/index.html",
        true,   // "/c/b/foo.html",
        false,  // "/d",
        false,  // "/d/a",
        true,   // "/e/a/index.html",
        true,   // "/e/d",
        false,  // "/e/d/foo.html",
        true,   // "/e/doh.html",
        true,   // "/f/index.html",
        true,   // "/foo/bar/baz.html",
        true,   // "/f/",
      },
      { // Agent3
        true,   // "/a",
        true,   // "/a/",
        true,   // "/a/bloh/foo.html",
        true,   // "/b",
        true,   // "/b/a",
        true,   // "/b/a/index.html",
        true,   // "/b/b/foo.html",
        true,   // "/c",
        true,   // "/c/a",
        true,   // "/c/a/index.html",
        true,   // "/c/b/foo.html",
        false,  // "/d",
        false,  // "/d/a",
        true,   // "/e/a/index.html",
        true,   // "/e/d",
        false,  // "/e/d/foo.html",
        true,   // "/e/doh.html",
        true,   // "/f/index.html",
        true,   // "/foo/bar/baz.html",
        true,   // "/f/",
      },
      { // Agent4
        true,   // "/a",
        true,   // "/a/",
        true,   // "/a/bloh/foo.html",
        true,   // "/b",
        true,   // "/b/a",
        true,   // "/b/a/index.html",
        true,   // "/b/b/foo.html",
        true,   // "/c",
        true,   // "/c/a",
        true,   // "/c/a/index.html",
        true,   // "/c/b/foo.html",
        false,  // "/d",
        false,  // "/d/a",
        true,   // "/e/a/index.html",
        true,   // "/e/d",
        false,  // "/e/d/foo.html",
        true,   // "/e/doh.html",
        true,   // "/f/index.html",
        true,   // "/foo/bar/baz.html",
        true,   // "/f/",
      },
      { // Agent5/"*"
        true,   // "/a",
        true,   // "/a/",
        true,   // "/a/bloh/foo.html",
        true,   // "/b",
        true,   // "/b/a",
        true,   // "/b/a/index.html",
        true,   // "/b/b/foo.html",
        true,   // "/c",
        true,   // "/c/a",
        true,   // "/c/a/index.html",
        true,   // "/c/b/foo.html",
        true,   // "/d",
        true,   // "/d/a",
        true,   // "/e/a/index.html",
        true,   // "/e/d",
        true,   // "/e/d/foo.html",
        true,   // "/e/doh.html",
        true,   // "/f/index.html",
        false,  // "/foo/bar/baz.html",
        true,   // "/f/",
      }
    }
  };
 
  public TestRobotRulesParser(String name) {
    super(name);
  }

  /** Checks every robots string against each agent name on its own. */
  public void testRobotsOneAgent() {
    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j= 0; j < AGENT_STRINGS.length; j++) {
        testRobots(i, new String[] { AGENT_STRINGS[j] },
                   TEST_PATHS, ALLOWED[i][j]);
      }
    }
  }

  /**
   * Checks every robots string against each ordered pair of agent names.
   * The first agent's expectations apply unless NOT_IN_ROBOTS_STRING marks
   * it as absent from the robots string, in which case the second agent's
   * expectations are used.
   */
  public void testRobotsTwoAgents() {
    for (int i= 0; i < ROBOTS_STRINGS.length; i++) {
      for (int j= 0; j < AGENT_STRINGS.length; j++) {
        for (int k= 0; k < AGENT_STRINGS.length; k++) {
          int key= j;
          if (NOT_IN_ROBOTS_STRING[i][j])
            key= k;
          testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },
                     TEST_PATHS, ALLOWED[i][key]);
        }
      }
    }
  }

  // helper

  /**
   * Parses one robots string for the given agents and asserts the result of
   * isAllowed for each supplied path. Called by the two test methods above.
   *
   * @param robotsString index into ROBOTS_STRINGS of the body to parse
   * @param agents agent names passed to the RobotRulesParser constructor;
   *               must contain at least one element
   * @param paths paths to query against the parsed rules
   * @param allowed expected isAllowed result for each entry of paths
   */
  public void testRobots(int robotsString, String[] agents, String[] paths, 
                         boolean[] allowed) {
    String agentsString= agents[0];
    for (int i= 1; i < agents.length; i++)
      agentsString= agentsString + "," + agents[i];
    RobotRulesParser p= new RobotRulesParser(agents);
    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString].getBytes());
    for (int i= 0; i < paths.length; i++) {
      // Query the paths that were passed in (not the global TEST_PATHS), and
      // only once, so the reported value is the one that was compared.
      boolean actual= rules.isAllowed(paths[i]);
      assertTrue("testing robots file "+robotsString+", on agents ("
                 + agentsString + "), and path " + paths[i] + "; got " 
                 + actual + ", rules are: " + LF
                                   + rules,
                 actual == allowed[i]);
    }
  }

}

--- NEW FILE: TestFetcherText.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import java.io.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import junit.framework.TestCase;

/** Unit tests for FetcherText. */
public class TestFetcherText extends TestCase {

  public TestFetcherText(String name) {
    super(name);
  }

  /**
   * Wraps a plain sentence in a FetcherText and passes it to
   * TestWritable.testWritable.
   *
   * @throws Exception if the writable check fails
   */
  public void testFetcherText() throws Exception {
    final String text = "Hello World The Quick Brown Fox Jumped Over the Lazy Fox";
    final FetcherText fetcherText = new FetcherText(text);
    TestWritable.testWritable(fetcherText);
  }

}

--- NEW FILE: TestFetcher.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.pagedb.*;

import java.io.*;
import java.util.logging.Level;
import junit.framework.TestCase;


/**
 * Test that writes a five-entry fetch list, runs a Fetcher over it, and then
 * scans the FetcherText output for expected words.
 *
 * <p>NOTE(review): the fetch list points at live external sites, so this
 * presumably needs network access and depends on their current content --
 * confirm before relying on it in automated builds.
 */
public class TestFetcher extends TestCase {

  public TestFetcher(String name) { super(name); }


  /**
   * Writes the fetch list under the directory named by the
   * "test.build.data" system property (default "."), runs the fetcher, and
   * asserts that some FetcherText record contains "Yahoo!" and some record
   * contains "Nutch".
   *
   * @throws Exception if writing the fetch list, fetching, or reading the
   *                   output fails
   */
  public void testFetcher() throws Exception {

    String directory = System.getProperty("test.build.data",".");

    String fetchListFilename = directory + "/" + FetchListEntry.DIR_NAME;

    MD5Hash id1 = new MD5Hash(new byte[]{0,0,0,0, 0,0,0,0, 0,0,0,0, 1,2,3,4});
    MD5Hash id2 = new MD5Hash(new byte[]{0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0});

    String url1 = "http://sourceforge.net/projects/nutch/";
    String url2 = "http://www.yahoo.com/";
    String url3 = "http://jakarta.apache.org/lucene/";
    String url4 = "http://nutch.org/docs/index.html";
    String url5 = "ftp://ftp.redhat.com/";

    Page page1 = new Page(url1, id1);
    Page page2 = new Page(url2, id2);
    Page page3 = new Page(url3, id2);
    Page page4 = new Page(url4, id2);
    Page page5 = new Page(url5, id2);

    String[] anchors = new String[] {"foo", "bar"};

    FetchListEntry fe1 = new FetchListEntry(true, page1, anchors);
    FetchListEntry fe2 = new FetchListEntry(true, page2, anchors);
    FetchListEntry fe3 = new FetchListEntry(true, page3, anchors);
    FetchListEntry fe4 = new FetchListEntry(true, page4, anchors);
    FetchListEntry fe5 = new FetchListEntry(false, page5, anchors);

    // Close the writer even when an append fails so the file handle is not
    // leaked by a failing test.
    ArrayFile.Writer testFetchList =
      new ArrayFile.Writer(fetchListFilename, FetchListEntry.class);
    try {
      testFetchList.append(fe1);
      testFetchList.append(fe2);
      testFetchList.append(fe3);
      testFetchList.append(fe4);
      testFetchList.append(fe5);
    } finally {
      testFetchList.close();
    }

    Fetcher fetcher = new Fetcher(directory);
    fetcher.setLogLevel(Level.FINE);
    //fetcher.getHttp().setMaxContentLength(4096);
    fetcher.getHttp().setAgentString("NutchCVS");

    fetcher.run();

    String stripped = directory + "/" + FetcherText.DIR_NAME;
    FetcherText s = new FetcherText();

    boolean yahoo = false;
    boolean nutch = false;

    // Same guarantee for the reader: release it whether or not the scan
    // completes.
    ArrayFile.Reader fetcher_stripped = new ArrayFile.Reader(stripped);
    try {
      while (fetcher_stripped.next(s) != null) {
        String text = s.toString();

        if (text.indexOf("Yahoo!") >= 0)
          yahoo = true;

        if (text.indexOf("Nutch") >= 0)
          nutch = true;
      }
    } finally {
      fetcher_stripped.close();
    }

    assertTrue("no fetched text contained \"Yahoo!\"", yahoo);
    assertTrue("no fetched text contained \"Nutch\"", nutch);
  }

}






--- NEW FILE: TestFetcherOutput.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import java.io.*;
import net.nutch.io.*;
import net.nutch.pagedb.*;
import junit.framework.TestCase;

/** Unit tests for FetcherOutput. */
public class TestFetcherOutput extends TestCase {
  public TestFetcherOutput(String name) { super(name); }

  public void testFetcherOutput() throws Exception {

    String[] anchors = new String[] {"foo", "bar"};

    Outlink[] outlinks = new Outlink[] {
      new Outlink("http://foo.com/";, "Foo"),
      new Outlink("http://bar.com/";, "Bar")
    };

    FetcherOutput o =
      new FetcherOutput(new FetchListEntry(true, TestPage.getTestPage(),
                                           anchors),
                        TestMD5Hash.getTestHash(), true, "Foo",
                        outlinks);
                        
    TestWritable.testWritable(o);

  }
        
}



-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to