util TestFibonacciHeap.java,NONE,1.1 TestRobotsMetaProcessor.java,NONE,1.1 TestDOMContentUtils.java,NONE,1.1 TestPrefixStringMatcher.java,NONE,1.1 TestStringUtil.java,NONE,1.1 TestSuffixStringMatcher.java,NONE,1.1 TestSoftHashMap.java,NONE,1.1 TestGZIPUtils.java,NONE,1.1

joa23 Thu, 29 Jan 2004 08:09:39 -0800

Update of /cvsroot/nutch/playground/src/test/net/nutch/util
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv10313/src/test/net/nutch/util


Added Files:
        TestFibonacciHeap.java TestRobotsMetaProcessor.java 
        TestDOMContentUtils.java TestPrefixStringMatcher.java 
        TestStringUtil.java TestSuffixStringMatcher.java 
        TestSoftHashMap.java TestGZIPUtils.java 
Log Message:
initial commit

--- NEW FILE: TestFibonacciHeap.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

import java.util.Arrays;

/**
 * Unit tests for FibonacciHeap.
 *
 * <p>Drives the heap with a long random mix of add, decreaseKey, popMin and
 * peekMin calls while mirroring the expected contents in a plain array, and
 * checks the heap against that mirror as it goes.
 */
public class TestFibonacciHeap extends TestCase {
  public TestFibonacciHeap(String name) { 
    super(name); 
  }

  /** Heap payload: a fixed id plus the priority it was last given. */
  private static class TestItem implements Comparable {
    int id;
    int priority;

    public TestItem(int id, int priority) {
      this.id= id;
      this.priority= priority;
    }

    public String toString() {
      return "<"+id+","+priority+">";
    }

    /** Orders items by priority only; ids are not consulted. */
    public int compareTo(Object other) {
      TestItem o= (TestItem) other;
      if (this.priority < o.priority)
        return -1;
      else if (this.priority == o.priority)
        return 0;
      else return 1;
    }
  }

  private final static int NUM_TEST_ITEMS= 200;

  private final static int NUM_TEST_OPERATIONS= 10000;

  // likelihood of doing any of these operations
  private final static double ADD_PROB= .35;
  private final static double DECREASEKEY_PROB= .25;
  private final static double POP_PROB= .30;
  // peek is whatever probability is left over; kept for documentation
  private final static double PEEK_PROB= .10;

  /**
   * Sorts the in-heap prefix vals[0..numInVal-1] by priority and looks for
   * candidate among the leading items that share candidate's priority.
   * Fails with the given message if candidate is null or is not one of the
   * lowest-priority items.
   *
   * @param message   failure message to report
   * @param vals      model array; its first numInVal entries are in the heap
   * @param numInVal  number of items currently in the heap (must be > 0)
   * @param candidate the item the heap returned as its minimum
   * @return the index of candidate within the (now sorted) prefix of vals
   */
  private static int indexOfMinimum(String message, TestItem[] vals,
                                    int numInVal, TestItem candidate) {
    assertNotNull(message, candidate);
    Arrays.sort(vals, 0, numInVal);

    // walk over the items tied with the candidate until we hit it
    int i= 0;
    while ( (i < numInVal) 
            && (candidate.priority == vals[i].priority)
            && (candidate.id != vals[i].id) )
      i++;

    // test the bound first so that running off the end of the in-heap
    // prefix is reported as a failure instead of indexing a stale slot
    // (or past the array when the heap holds every item)
    assertTrue(message, (i < numInVal) && (candidate == vals[i]));
    return i;
  }

  /**
   * Performs NUM_TEST_OPERATIONS random operations on a FibonacciHeap and
   * checks size, peekMin and popMin against the array model after each one.
   */
  public void testFibHeap() {
    FibonacciHeap h= new FibonacciHeap();

    // vals[0 .. numInVal-1] are in the heap, the rest are outside it
    TestItem[] vals= new TestItem[NUM_TEST_ITEMS];
    for (int i= 0; i < NUM_TEST_ITEMS; i++) 
      vals[i]= new TestItem(i,i);

    // the number of vals in the heap
    int numInVal= 0;
    // the number of vals that are not in the heap
    int numOutVal= NUM_TEST_ITEMS;

    // thresholds
    double addMaxP= ADD_PROB;
    double decreaseKeyMaxP= ADD_PROB + DECREASEKEY_PROB;
    double popMaxP= ADD_PROB + DECREASEKEY_PROB + POP_PROB;

    // number of operations we've done
    int numOps= 0;

    // test add/peek/pop/decreaseKey
    while (numOps < NUM_TEST_OPERATIONS) {

      numOps++;

      assertTrue("heap reports wrong size!", numInVal == h.size());

      double randVal= Math.random();
      if (randVal < addMaxP) {

        if (numOutVal == 0) // can't add...
          continue;

        // add: pick a random item from the "outside" tail of vals and
        // swap it into the first free slot of the "inside" prefix
        int index= ( (NUM_TEST_ITEMS - 1) - 
                     (int) (Math.random() * (double) numOutVal) );
        TestItem tmp= vals[index];
        vals[index]= vals[numInVal];
        vals[numInVal]= tmp;
        numInVal++;
        numOutVal--;

        h.add(tmp, tmp.priority);

      } else if (randVal < decreaseKeyMaxP) {

        // decreaseKey (nothing to do while the heap is empty)
        if (numInVal != 0) {
          int index= (int) (Math.random() * (double) numInVal);
          TestItem tmp= vals[index];

          // compound assignment narrows the double result back to int
          tmp.priority-=  Math.random() * 5.0;

          h.decreaseKey(tmp, tmp.priority);
        }

      } else if (randVal < popMaxP) {

        // pop
        if (numInVal == 0) {
          // These checks used to sit behind "h.size() != 0", which can
          // never be true here (size was just asserted equal to
          // numInVal), so they never ran.
          assertNull("heap empty, but peekMin() did not return null!",
                     h.peekMin());
          assertNull("heap empty, but popMin() did not return null!",
                     h.popMin());
        } else {
          TestItem tmp= (TestItem) h.popMin();
          int i= indexOfMinimum("popMin did not return lowest-priority item!",
                                vals, numInVal, tmp);

          // move the popped item to the "outside" tail of vals
          vals[i]= vals[numInVal - 1];
          vals[numInVal - 1]= tmp;
          numInVal--;
          numOutVal++;
        }                               

      } else {

        // peek 
        if (numInVal == 0) {
          assertTrue("heap reports non-zero size when empty", h.size() == 0);
          assertNull("heap.peekMin() returns item when empty", 
                     h.peekMin());
          assertNull("heap.popMin() returns item when empty",
                     h.popMin());
        } else {
          TestItem tmp= (TestItem) h.peekMin();
          indexOfMinimum("heap.peekMin() returns wrong item",
                         vals, numInVal, tmp);
        }                               
      }
    }

  }

}

--- NEW FILE: TestRobotsMetaProcessor.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

import net.nutch.util.RobotsMetaProcessor.*;

import java.io.ByteArrayInputStream;
import java.net.URL;

import org.cyberneko.html.parsers.*;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.apache.html.dom.*;

/** Unit tests for RobotsMetaProcessor. */
public class TestRobotsMetaProcessor extends TestCase {
  public TestRobotsMetaProcessor(String name) { 
    super(name); 
  }

  /*

  some sample tags:

  <meta name="robots" content="index,follow">
  <meta name="robots" content="noindex,follow">
  <meta name="robots" content="index,nofollow">
  <meta name="robots" content="noindex,nofollow">

  <META HTTP-EQUIV="Pragma" CONTENT="no-cache">

  */


  public static String[] tests= 
  {
    "<html><head><title>test page</title>"
    + "<META NAME=\"ROBOTS\" CONTENT=\"NONE\"> "
    + "<META HTTP-EQUIV=\"PRAGMA\" CONTENT=\"NO-CACHE\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"all\"> "
    + "<meta http-equiv=\"pragma\" content=\"no-cache\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<MeTa NaMe=\"RoBoTs\" CoNtEnT=\"nOnE\"> "
    + "<MeTa HtTp-EqUiV=\"pRaGmA\" cOnTeNt=\"No-CaChE\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"none\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"noindex,nofollow\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"noindex,follow\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"index,nofollow\"> "
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\" content=\"index,follow\"> "
    + "<base href=\"http://www.nutch.org/\";>"
    + "</head><body>"
    + " some text"
    + "</body></html>",

    "<html><head><title>test page</title>"
    + "<meta name=\"robots\"> "
    + "<base href=\"http://www.nutch.org/base/\";>"
    + "</head><body>"
    + " some text"
    + "</body></html>",

  };

  public static final boolean[][] answers= {
    {true, true, true},     // NONE
    {false, false, true},   // all
    {true, true, true},     // nOnE
    {true, true, false},    // none
    {true, true, false},    // noindex,nofollow
    {true, false, false},   // noindex,follow
    {false, true, false},   // index,nofollow
    {false, false, false},  // index,follow
    {false, false, false},  // missing!
  };

  private URL[][] currURLsAndAnswers;

  public void testRobotsMetaProcessor() {
    DOMFragmentParser parser= new DOMFragmentParser();;

    try { 
      currURLsAndAnswers= new URL[][] {
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org";), null},
        {new URL("http://www.nutch.org/foo/";), 
         new URL("http://www.nutch.org/";)},
        {new URL("http://www.nutch.org";), 
         new URL("http://www.nutch.org/base/";)}
      };
    } catch (Exception e) {
      assertTrue("couldn't make test URLs!", false);
    }

    for (int i= 0; i < tests.length; i++) {
      byte[] bytes= tests[i].getBytes();

      DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();

      try {
        parser.parse(new InputSource(new ByteArrayInputStream(bytes)), node);
      } catch (Exception e) {
        e.printStackTrace();
      }

      RobotsMetaIndicator robotsMeta= new RobotsMetaIndicator();
      RobotsMetaProcessor.getRobotsMetaDirectives(robotsMeta, node, 
                                                  currURLsAndAnswers[i][0]);

      assertTrue("got index wrong on test " + i,
                 robotsMeta.getNoIndex() == answers[i][0]);
      assertTrue("got follow wrong on test " + i,
                 robotsMeta.getNoFollow() == answers[i][1]);
      assertTrue("got cache wrong on test " + i,
                 robotsMeta.getNoCache() == answers[i][2]);
      assertTrue("got base href wrong on test " + i + " (got "
                 + robotsMeta.getBaseHref() + ")",
                 ( (robotsMeta.getBaseHref() == null)
                    && (currURLsAndAnswers[i][1] == null) )
                 || ( (robotsMeta.getBaseHref() != null)
                      && robotsMeta.getBaseHref().equals(
                        currURLsAndAnswers[i][1]) ) );
      
    }
  }

}

--- NEW FILE: TestDOMContentUtils.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

import net.nutch.fetcher.Outlink;

import java.io.ByteArrayInputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.StringTokenizer;

import org.cyberneko.html.parsers.*;
import org.xml.sax.*;
import org.w3c.dom.*;
import org.w3c.dom.html.*;
import org.apache.html.dom.*;

/** 
 * Unit tests for DOMContentUtils.
 */
public class TestDOMContentUtils extends TestCase {

  private static final String[] testPages= { 
    new String("<html><head><title> title </title><script> script </script>"
               + "</head><body> body <a href=\"http://www.nutch.org\";>"
               + " anchor </a><!--comment-->"
               + "</body></html>"),
    new String("<html><head><title> title </title><script> script </script>"
               + "</head><body> body <a href=\"/\">"
               + " home </a><!--comment-->"
               + "<style> style </style>"
               + " <a href=\"bot.html\">"
               + " bots </a>"
               + "</body></html>"),
    new String("<html><head><title> </title>"
               + "</head><body> "
               + "<a href=\"/\"> separate this "
               + "<a href=\"ok\"> from this"
               + "</a></a>"
               + "</body></html>"),
    // this one relies on certain neko fixup behavior, possibly
    // distributing the anchors into the LI's-but not the other
    // anchors (outside of them, instead)!  So you get a tree that
    // looks like:
    // ... <li> <a href=/> home </a> </li>
    //     <li> <a href=/> <a href="1"> 1 </a> </a> </li>
    //     <li> <a href=/> <a href="1"> <a href="2"> 2 </a> </a> </a> </li>
    new String("<html><head><title> my title </title>"
               + "</head><body> body "
               + "<ul>"
               + "<li> <a href=\"/\"> home"
               + "<li> <a href=\"1\"> 1"
               + "<li> <a href=\"2\"> 2"
               + "</ul>"
               + "</body></html>"),
  };

  private static String[] testBaseHrefs= {
    "http://www.nutch.org";,     
    "http://www.nutch.org/docs/foo.html";,     
    "http://www.nutch.org/docs/";,     
    "http://www.nutch.org/docs/";,     
  };
  
  private static final DocumentFragment testDOMs[]=
    new DocumentFragment[testPages.length];

  private static URL[] testBaseHrefURLs= 
    new URL[testPages.length];


  private static final String[] answerText= {
    "title body anchor",
    "title body home bots",
    "separate this from this",
    "my title body home 1 2",
  };

  private static final String[] answerTitle= {
    "title",
    "title",
    "",
    "my title",
  };

  // note: should be in page-order
  private static final Outlink[][] answerOutlinks= { 
    {
      new Outlink("http://www.nutch.org";, "anchor"),
    },
    {
      new Outlink("http://www.nutch.org/";, "home"),
      new Outlink("http://www.nutch.org/docs/bot.html";, "bots"),
    },
    {
      new Outlink("http://www.nutch.org/";, "separate this"),
      new Outlink("http://www.nutch.org/docs/ok";, "from this"),
    },
    {
      new Outlink("http://www.nutch.org/";, "home"),
      new Outlink("http://www.nutch.org/docs/1";, "1"),
      new Outlink("http://www.nutch.org/docs/2";, "2"),
    },
  };

  public TestDOMContentUtils(String name) { 
    super(name); 
  }

  private static void setup() {
    DOMFragmentParser parser= new DOMFragmentParser();
    for (int i= 0; i < testPages.length; i++) {
        DocumentFragment node= 
          new HTMLDocumentImpl().createDocumentFragment();
        try {
          parser.parse(
            new InputSource( 
              new ByteArrayInputStream(testPages[i].getBytes()) ),
            node);
          testBaseHrefURLs[i]= new URL(testBaseHrefs[i]);
        } catch (Exception e) {
          assertTrue("caught exception: " + e, false);
        } 
      testDOMs[i]= node;
    }
  }

  private static boolean equalsIgnoreWhitespace(String s1, String s2) {
    StringTokenizer st1= new StringTokenizer(s1);
    StringTokenizer st2= new StringTokenizer(s2);

    while (st1.hasMoreTokens()) {
      if (!st2.hasMoreTokens()) 
        return false;
      if ( ! st1.nextToken().equals(st2.nextToken()) )
        return false;
    }
    if (st2.hasMoreTokens()) 
      return false;
    return true;
  }

  public void testGetText() {
    if (testDOMs[0] == null) 
      setup();
    for (int i= 0; i < testPages.length; i++) {
      StringBuffer sb= new StringBuffer();
      DOMContentUtils.getText(sb, testDOMs[i]);
      String text= sb.toString();
      assertTrue("expecting text: " + answerText[i] 
                 + System.getProperty("line.separator") 
                 + System.getProperty("line.separator") 
                 + "got text: "+ text, 
                 equalsIgnoreWhitespace(answerText[i], text));
    }
  }

  public void testGetTitle() {
    if (testDOMs[0] == null) 
      setup();
    for (int i= 0; i < testPages.length; i++) {
      StringBuffer sb= new StringBuffer();
      DOMContentUtils.getTitle(sb, testDOMs[i]);
      String text= sb.toString();
      assertTrue("expecting text: " + answerText[i] 
                 + System.getProperty("line.separator") 
                 + System.getProperty("line.separator") 
                 + "got text: "+ text, 
                 equalsIgnoreWhitespace(answerTitle[i], text));
    }
  }

  public void testGetOutlinks() {
    if (testDOMs[0] == null) 
      setup();
    for (int i= 0; i < testPages.length; i++) {
      ArrayList outlinks= new ArrayList();
      DOMContentUtils.getOutlinks(testBaseHrefURLs[i], outlinks, testDOMs[i]);
      Outlink[] outlinkArr= new Outlink[outlinks.size()];
      outlinkArr= (Outlink[]) outlinks.toArray(outlinkArr);
      compareOutlinks(answerOutlinks[i], outlinkArr);
    }
  }

  private static final void appendOutlinks(StringBuffer sb, Outlink[] o) {
    for (int i= 0; i < o.length; i++) {
      sb.append(o[i].toString());
      sb.append(System.getProperty("line.separator"));
    }
  }

  private static final String outlinksString(Outlink[] o) {
    StringBuffer sb= new StringBuffer();
    appendOutlinks(sb, o);
    return sb.toString();
  }

  private static final void compareOutlinks(Outlink[] o1, Outlink[] o2) {
    if (o1.length != o2.length) {
      assertTrue("got wrong number of outlinks (expecting " + o1.length 
                 + ", got " + o2.length + ")" 
                 + System.getProperty("line.separator") 
                 + "answer: " + System.getProperty("line.separator") 
                 + outlinksString(o1) 
                 + System.getProperty("line.separator") 
                 + "got: " + System.getProperty("line.separator") 
                 + outlinksString(o2)
                 + System.getProperty("line.separator"),
                 false
        );
    }

    for (int i= 0; i < o1.length; i++) {
      if (!o1[i].equals(o2[i])) {
        assertTrue("got wrong outlinks at position " + i
                   + System.getProperty("line.separator") 
                   + "answer: " + System.getProperty("line.separator") 
                   + o1[i].toString()
                   + System.getProperty("line.separator") 
                   + "got: " + System.getProperty("line.separator") 
                   + o2[i].toString(),
                   false
          );
        
      }
    }
  }
}

--- NEW FILE: TestPrefixStringMatcher.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

/**
 * Unit tests for PrefixStringMatcher.
 *
 * <p>Random prefix sets and random inputs are generated over a tiny
 * alphabet; the matcher's answers are compared with a brute-force scan
 * using String.startsWith().
 */
public class TestPrefixStringMatcher extends TestCase {
  public TestPrefixStringMatcher(String name) { 
    super(name); 
  }

  private final static int NUM_TEST_ROUNDS= 20;
  private final static int MAX_TEST_PREFIXES= 100;
  private final static int MAX_PREFIX_LEN= 10;
  private final static int NUM_TEST_INPUTS_PER_ROUND= 100;
  private final static int MAX_INPUT_LEN= 20;

  // only the first four letters are active; the rest are left disabled
  private final static char[] alphabet= 
    new char[] {
      'a', 'b', 'c', 'd',
//      'e', 'f', 'g', 'h', 'i', 'j',
//      'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
//      'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
//      '5', '6', '7', '8', '9', '0'
    };

  /**
   * Builds a random string over {@link #alphabet} whose length is at least
   * minLen and strictly less than maxLen (when maxLen > minLen).
   */
  private String makeRandString(int minLen, int maxLen) {
    int targetLen= minLen + (int) (Math.random() * (maxLen - minLen));
    StringBuffer buf= new StringBuffer(targetLen);

    while (buf.length() < targetLen) {
      int pick= (int) (Math.random() * alphabet.length);
      buf.append(alphabet[pick]);
    }

    return buf.toString();
  }
  
  /**
   * For each round, builds a random prefix set and checks matches(),
   * shortestMatch() and longestMatch() on random inputs against the
   * brute-force expectation.  Zero-length prefixes are never counted as
   * matches.
   */
  public void testPrefixMatcher() {
    int matchCount= 0;
    int inputCount= 0;

    for (int round= 0; round < NUM_TEST_ROUNDS; round++) {

      // random number of random prefixes for this round
      String[] prefixes= new String[(int) (Math.random() * MAX_TEST_PREFIXES)];
      for (int p= 0; p < prefixes.length; p++) {
        prefixes[p]= makeRandString(0, MAX_PREFIX_LEN);
      }

      PrefixStringMatcher prematcher= new PrefixStringMatcher(prefixes);

      for (int n= 0; n < NUM_TEST_INPUTS_PER_ROUND; n++) {
        String input= makeRandString(0, MAX_INPUT_LEN);

        // brute force: lengths of the shortest / longest non-empty prefix
        // that input starts with, or -1 if there is none
        int longest= -1;
        int shortest= -1;
        for (int p= 0; p < prefixes.length; p++) {
          int size= prefixes[p].length();
          if ( (size <= 0) || !input.startsWith(prefixes[p]) )
            continue;

          if (size > longest) 
            longest= size;
          if ( (shortest == -1) || (size < shortest) )
            shortest= size;
        }

        // any hit has size > 0, so longest stays -1 only without a hit
        boolean expected= (longest != -1);
        if (expected) 
          matchCount++;
        inputCount++;

        assertTrue( "'" + input + "' should " + (expected ? "" : "not ") 
                    + "match!",
                    prematcher.matches(input) == expected );

        if (!expected) 
          continue;

        String expectedShortest= input.substring(0, shortest);
        assertTrue( prematcher.shortestMatch(input).length() == shortest );
        assertTrue( expectedShortest.equals(prematcher.shortestMatch(input)) );

        String expectedLongest= input.substring(0, longest);
        assertTrue( prematcher.longestMatch(input).length() == longest );
        assertTrue( expectedLongest.equals(prematcher.longestMatch(input)) );
      }
    }

    System.out.println("got " + matchCount + " matches out of " 
                       + inputCount + " tests");
  }

}

--- NEW FILE: TestStringUtil.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

/**
 * Unit tests for StringUtil methods.
 *
 * <p>Uses assertEquals rather than assertTrue(a.equals(b)) so that a failure
 * reports both the expected and the actual string.
 */
public class TestStringUtil extends TestCase {
  public TestStringUtil(String name) { 
    super(name); 
  }

  /**
   * rightPad() must leave a string alone when the requested width is not
   * larger than its length, and otherwise append spaces up to that width.
   */
  public void testRightPad() {
    String s= "my string";   // 9 characters

    String ps= StringUtil.rightPad(s, 0);
    assertEquals(s, ps);

    ps= StringUtil.rightPad(s, 9);
    assertEquals(s, ps);

    ps= StringUtil.rightPad(s, 10);
    assertEquals(s+" ", ps);

    ps= StringUtil.rightPad(s, 15);
    assertEquals(s+"      ", ps);

  }

  /**
   * leftPad() must leave a string alone when the requested width is not
   * larger than its length, and otherwise prepend spaces up to that width.
   */
  public void testLeftPad() {
    String s= "my string";   // 9 characters

    String ps= StringUtil.leftPad(s, 0);
    assertEquals(s, ps);

    ps= StringUtil.leftPad(s, 9);
    assertEquals(s, ps);

    ps= StringUtil.leftPad(s, 10);
    assertEquals(" "+s, ps);

    ps= StringUtil.leftPad(s, 15);
    assertEquals("      "+s, ps);

  }

}

--- NEW FILE: TestSuffixStringMatcher.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

/**
 * Unit tests for SuffixStringMatcher.
 *
 * <p>Random suffix sets and random inputs are generated over a tiny
 * alphabet; the matcher's answers are compared with a brute-force scan
 * using String.endsWith().
 */
public class TestSuffixStringMatcher extends TestCase {
  public TestSuffixStringMatcher(String name) { 
    super(name); 
  }

  private final static int NUM_TEST_ROUNDS= 20;
  private final static int MAX_TEST_SUFFIXES= 100;
  private final static int MAX_SUFFIX_LEN= 10;
  private final static int NUM_TEST_INPUTS_PER_ROUND= 100;
  private final static int MAX_INPUT_LEN= 20;

  // only the first four letters are active; the rest are left disabled
  private final static char[] alphabet= 
    new char[] {
      'a', 'b', 'c', 'd',
//      'e', 'f', 'g', 'h', 'i', 'j',
//      'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
//      'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4',
//      '5', '6', '7', '8', '9', '0'
    };

  /**
   * Builds a random string over {@link #alphabet} whose length is at least
   * minLen and strictly less than maxLen (when maxLen > minLen).
   */
  private String makeRandString(int minLen, int maxLen) {
    int targetLen= minLen + (int) (Math.random() * (maxLen - minLen));
    StringBuffer buf= new StringBuffer(targetLen);

    while (buf.length() < targetLen) {
      int pick= (int) (Math.random() * alphabet.length);
      buf.append(alphabet[pick]);
    }

    return buf.toString();
  }
  
  /**
   * For each round, builds a random suffix set and checks matches(),
   * shortestMatch() and longestMatch() on random inputs against the
   * brute-force expectation.  Zero-length suffixes are never counted as
   * matches.
   */
  public void testSuffixMatcher() {
    int matchCount= 0;
    int inputCount= 0;

    for (int round= 0; round < NUM_TEST_ROUNDS; round++) {

      // random number of random suffixes for this round
      String[] suffixes= new String[(int) (Math.random() * MAX_TEST_SUFFIXES)];
      for (int s= 0; s < suffixes.length; s++) {
        suffixes[s]= makeRandString(0, MAX_SUFFIX_LEN);
      }

      SuffixStringMatcher sufmatcher= new SuffixStringMatcher(suffixes);

      for (int n= 0; n < NUM_TEST_INPUTS_PER_ROUND; n++) {
        String input= makeRandString(0, MAX_INPUT_LEN);

        // brute force: lengths of the shortest / longest non-empty suffix
        // that input ends with, or -1 if there is none
        int longest= -1;
        int shortest= -1;
        for (int s= 0; s < suffixes.length; s++) {
          int size= suffixes[s].length();
          if ( (size <= 0) || !input.endsWith(suffixes[s]) )
            continue;

          if (size > longest) 
            longest= size;
          if ( (shortest == -1) || (size < shortest) )
            shortest= size;
        }

        // any hit has size > 0, so longest stays -1 only without a hit
        boolean expected= (longest != -1);
        if (expected) 
          matchCount++;
        inputCount++;

        assertTrue( "'" + input + "' should " + (expected ? "" : "not ") 
                    + "match!",
                    sufmatcher.matches(input) == expected );

        if (!expected) 
          continue;

        String expectedShortest= input.substring(input.length() - shortest);
        assertTrue( sufmatcher.shortestMatch(input).length() == shortest );
        assertTrue( expectedShortest.equals(sufmatcher.shortestMatch(input)) );

        String expectedLongest= input.substring(input.length() - longest);
        assertTrue( sufmatcher.longestMatch(input).length() == longest );
        assertTrue( expectedLongest.equals(sufmatcher.longestMatch(input)) );
      }
    }

    System.out.println("got " + matchCount + " matches out of " 
                       + inputCount + " tests");
  }

}

--- NEW FILE: TestSoftHashMap.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

import java.util.ArrayList;
import java.util.Iterator;

/** 
 * Unit tests for SoftHashMap.
 *
 * <p>testBasicOps() exercises put/get/containsKey/remove/size/clear while
 * holding strong references to every key and value.  testExpiry() keeps
 * inserting entries until a value and then a key have been finalized; it
 * makes no assertions of its own.
 */
public class TestSoftHashMap extends TestCase {

  // set to true to get flood of status messages on stderr- useful
  // for seeing when JVM is collecting everything.
  private static final boolean verbose= false;

  // 1kB for int[]
  private static final int TEST_VALUE_ARRAY_SIZE= 1024 / 4; 

  private static final int BASIC_OPS_SIZE= 10;

  // Set from finalize() and polled by the loops in testExpiry().  The
  // language does not guarantee which thread runs a finalizer, so these
  // are volatile to make the write visible to the polling test thread.
  private volatile boolean keyHasBeenFinalized;
  private volatile boolean valHasBeenFinalized;

  /**
   * Map key wrapping an Integer; equality and hash code are those of the
   * wrapped Integer.  Optionally reports its own finalization to the test.
   */
  private class TestKey {
    Integer key;
    boolean notify;

    TestKey(Integer key, boolean notify) {
      this.key= key;
      this.notify= notify;
    }

    protected void finalize() {
      if (notify)
        TestSoftHashMap.this.keyFinalized(key);
    }

    public int hashCode() {
      return key.hashCode();
    }

    public boolean equals(Object o) {
      // instanceof is false for null, so this also rejects null
      if ( !(o instanceof TestKey) )
        return false;
      TestKey other= (TestKey) o;
      return (this.key.equals(other.key));
    }

    public String toString() {
      return "Key:"+key;
    }

  }

  /**
   * Map value occupying roughly 1kB, with its key stored in val[0].  On
   * finalization it optionally notifies the test and then calls every
   * registered SoftHashMap.FinalizationListener.
   */
  private class TestValue implements SoftHashMap.FinalizationNotifier {
    int[] val;
    boolean notify;
    ArrayList finalizationListeners;

    TestValue(int key, boolean notify) {
      this.val= new int[TEST_VALUE_ARRAY_SIZE];
      this.val[0]= key;
      this.notify= notify;
      this.finalizationListeners= new ArrayList();
    }

    public void addFinalizationListener(SoftHashMap.FinalizationListener
                                        listener) {
      finalizationListeners.add(listener);
    }

    protected void finalize() {
      if (notify)
        TestSoftHashMap.this.valFinalized(val[0]);
      for (Iterator iter= finalizationListeners.iterator();
           iter.hasNext() ; ) {
        SoftHashMap.FinalizationListener l=
          (SoftHashMap.FinalizationListener) iter.next();
        l.finalizationOccurring();
      }
    }

    boolean isMyKey(int key) {
      return key == val[0];
    }

    public String toString() {
      return "Val:"+val[0];
    }

  }

  public TestSoftHashMap(String name) { 
    super(name); 
  }

  /**
   * Inserts BASIC_OPS_SIZE entries, reads each back through a fresh but
   * equal key, removes the last one and checks it is gone, then checks
   * size() and clear()/isEmpty().
   */
  public void testBasicOps() {
    SoftHashMap shm= new SoftHashMap();

    // cache keys & vals  so they don't go away
    TestKey[] keys= new TestKey[BASIC_OPS_SIZE];
    TestValue[] vals= new TestValue[BASIC_OPS_SIZE];

    for (int i= 0; i < BASIC_OPS_SIZE; i++) {
      keys[i]= new TestKey(new Integer(i), false);
      vals[i]= new TestValue(i, false);
      shm.put(keys[i], vals[i]);
    }

    for (int i= 0; i < BASIC_OPS_SIZE; i++) {
      TestValue v= (TestValue) shm.get(new TestKey(new Integer(i), false));
      assertTrue("got back null, expecting value! (key= "+i+")", v != null);
      assertTrue("got back wrong value (isMyKey())!", v.isMyKey(i));
      assertTrue("got back wrong value (!=)!", v == vals[i]);
      assertTrue("contains key doesn't have " + i, 
                 shm.containsKey(new TestKey(new Integer(i), false)));
      assertTrue("isEmpty returns true when it shouldn't",
                 !shm.isEmpty());
    }

    Object removed= shm.remove(
      new TestKey(new Integer(BASIC_OPS_SIZE - 1), false));
    if (verbose) 
      System.err.println("removed: " + removed);
    // Look up the key that was just removed.  This used to query
    // BASIC_OPS_SIZE, a key that was never inserted, so the check below
    // could not fail.
    TestValue v= (TestValue) 
      shm.get(new TestKey(new Integer(BASIC_OPS_SIZE - 1), false));
    assertTrue("got back val after delete!", v == null);

    int size= shm.size();
    assertTrue("got bad value from size(); returned " + size, 
               size == (BASIC_OPS_SIZE - 1) );

    shm.clear();
    assertTrue("isEmpty returns false when it shouldn't",
               shm.isEmpty());
  }

  /**
   * Adds entries without keeping references to them until first a value
   * and then a key reports being finalized, then sleeps and spins for a
   * while.  Any exception is printed rather than rethrown.
   */
  public void testExpiry() {
    if (verbose) 
      System.err.println("entering testExpiry()");
    SoftHashMap shm= new SoftHashMap();

    valHasBeenFinalized= false;
    keyHasBeenFinalized= false;
    int i= 0;

    try {
      while (!valHasBeenFinalized) {
        if (verbose) 
          System.err.println("(!v) trying to put " + i);
        shm.put(new TestKey(new Integer(i), true), new TestValue(i, true));
        i++;
        if (verbose) 
          System.err.println("after adding " + i
                             + " items, size is " + shm.size());
      }
      while (!keyHasBeenFinalized) {
        if (verbose) 
          System.err.println("(!k) trying to put " + i);
        shm.put(new TestKey(new Integer(i), true), new TestValue(i, true));
        i++;
      }

      // sleep and busy loop to see if JVM goes on collecting stuff...
      if (verbose) 
        System.err.println("sleeping... ");
      Thread.sleep(20 * 1000);
      if (verbose) 
        System.err.println("busy looping...");
      int j;
      for (j= 0; j < 2000000; j++) {
        i+= j;
      }
      if (verbose) 
        System.err.println("done, j=" + j);

    } catch (Exception e)  {
      System.err.println("caught exception");
      e.printStackTrace();
    } finally {
      if (verbose) 
        System.err.println("out of put loops");
    }

  }

  /** Called from TestKey.finalize(); records that a key was finalized. */
  void keyFinalized(Integer key) {
    if (verbose) 
      System.err.println("notified of finalized key: " + key);
    keyHasBeenFinalized= true;
  }

  /** Called from TestValue.finalize(); records that a value was finalized. */
  void valFinalized(int key) {
    if (verbose) 
      System.err.println("notified of finalized value for: " + key);
    valHasBeenFinalized= true;
  }

  /** Runs testExpiry() on its own, outside of a JUnit runner. */
  public static final void main(String[] a) {
    TestSoftHashMap t= new TestSoftHashMap("test");
    t.testExpiry();
  }

}

--- NEW FILE: TestGZIPUtils.java ---
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import junit.framework.TestCase;

import java.io.IOException;

/** Unit tests for GZIPUtils methods. */
public class TestGZIPUtils extends TestCase {
  public TestGZIPUtils(String name) { 
    super(name); 
  }

  /* a short, highly compressable, string */
  String SHORT_TEST_STRING= 
    "aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbcccccccccccccccc";

  /* a short, highly compressable, string */
  String LONGER_TEST_STRING= 
    SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING 
    + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING 
    + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING 
    + SHORT_TEST_STRING + SHORT_TEST_STRING + SHORT_TEST_STRING;

  /* a snapshot of the nutch webpage */
  String WEBPAGE= 
  "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n"
  + "<html>\n"
  + "<head>\n"
  + "  <meta http-equiv=\"content-type\"\n"
  + " content=\"text/html; charset=ISO-8859-1\">\n"
  + "  <title>Nutch</title>\n"
  + "</head>\n"
  + "<body>\n"
  + "<h1\n"
  + " style=\"font-family: helvetica,arial,sans-serif; text-align: center; color: 
rgb(255, 153, 0);\"><a\n"
  + " href=\"http://www.nutch.org/\";><font style=\"color: rgb(255, 153, 
0);\">Nutch</font></a><br>\n"
  + "<small>an open source web-search engine</small></h1>\n"
  + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
  + "<table\n"
  + " style=\"width: 100%; text-align: left; margin-left: auto; margin-right: 
auto;\"\n"
  + " border=\"0\" cellspacing=\"0\" cellpadding=\"0\">\n"
  + "  <tbody>\n"
  + "    <tr>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " 
href=\"http://sourceforge.net/project/showfiles.php?group_id=59548\";>Download</a><br>\n"
  + "      </td>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " href=\"tutorial.html\">Tutorial</a><br>\n"
  + "      </td>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " 
href=\"http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/nutch/nutch/\";>CVS</a><br>\n"
  + "      </td>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " href=\"api/index.html\">Javadoc</a><br>\n"
  + "      </td>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " 
href=\"http://sourceforge.net/tracker/?atid=491356&amp;group_id=59548&amp;func=browse\";>Bugs</a><br>\n"
  + "      </td>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " href=\"http://sourceforge.net/mail/?group_id=59548\";>Lists</a></td>\n"
  + "      <td style=\"vertical-align: top; text-align: center;\"><a\n"
  + " href=\"policies.html\">Policies</a><br>\n"
  + "      </td>\n"
  + "    </tr>\n"
  + "  </tbody>\n"
  + "</table>\n"
  + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\">\n"
  + "<h2>Introduction</h2>\n"
  + "Nutch is a nascent effort to implement an open-source web search\n"
  + "engine. Web search is a basic requirement for internet navigation, yet\n"
  + "the number of web search engines is decreasing. Today's oligopoly could\n"
  + "soon be a monopoly, with a single company controlling nearly all web\n"
  + "search for its commercial gain. &nbsp;That would not be good for the\n"
  + "users of internet. &nbsp;Nutch aims to enable anyone to easily and\n"
  + "cost-effectively deploy a world-class web search engine.<br>\n"
  + "<br>\n"
  + "To succeed, the Nutch software must be able to:<br>\n"
  + "<ul>\n"
  + "  <li> crawl several billion pages per month</li>\n"
  + "  <li>maintain an index of these pages</li>\n"
  + "  <li>search that index up to 1000 times per second</li>\n"
  + "  <li>provide very high quality search results</li>\n"
  + "  <li>operate at minimal cost</li>\n"
  + "</ul>\n"
  + "<h2>Status</h2>\n"
  + "Currently we're just a handful of developers working part-time to put\n"
  + "together a demo. &nbsp;The demo is coded entirely in Java. &nbsp;However\n"
  + "persistent data is written in well-documented formats so that modules\n"
  + "may eventually be re-written in other languages (e.g., Perl, C++) as the\n"
  + "project progresses.<br>\n"
  + "<br>\n"
  + "<hr style=\"width: 100%; height: 1px;\" noshade=\"noshade\"> <a\n"
  + " href=\"http://sourceforge.net\";> </a>\n"
  + "<div style=\"text-align: center;\"><a href=\"http://sourceforge.net\";><img\n"
  + " src=\"http://sourceforge.net/sflogo.php?group_id=59548&amp;type=1\"\n";
  + " style=\"border: 0px solid ; width: 88px; height: 31px;\"\n"
  + " alt=\"SourceForge.net Logo\" title=\"\"></a></div>\n"
  + "</body>\n"
  + "</html>\n";

  // tests

  public void testZipUnzip() {
    byte[] testBytes= SHORT_TEST_STRING.getBytes();
    testZipUnzip(testBytes);
    testBytes= LONGER_TEST_STRING.getBytes();
    testZipUnzip(testBytes);
    testBytes= WEBPAGE.getBytes();
    testZipUnzip(testBytes);
  }

  public void testZipUnzipBestEffort() {
    byte[] testBytes= SHORT_TEST_STRING.getBytes();
    testZipUnzipBestEffort(testBytes);
    testBytes= LONGER_TEST_STRING.getBytes();
    testZipUnzipBestEffort(testBytes);
    testBytes= WEBPAGE.getBytes();
    testZipUnzipBestEffort(testBytes);
  }
  
  public void testTruncation() {
    byte[] testBytes= SHORT_TEST_STRING.getBytes();
    testTruncation(testBytes);
    testBytes= LONGER_TEST_STRING.getBytes();
    testTruncation(testBytes);
    testBytes= WEBPAGE.getBytes();
    testTruncation(testBytes);
  }

  public void testLimit() {
    byte[] testBytes= SHORT_TEST_STRING.getBytes();
    testLimit(testBytes);
    testBytes= LONGER_TEST_STRING.getBytes();
    testLimit(testBytes);
    testBytes= WEBPAGE.getBytes();
    testLimit(testBytes);
  }

  // helpers

  public void testZipUnzip(byte[] origBytes) {
    byte[] compressedBytes= GZIPUtils.zip(origBytes);

    assertTrue("compressed array is not smaller!",
               compressedBytes.length < origBytes.length);

    byte[] uncompressedBytes= null;
    try {
      uncompressedBytes= GZIPUtils.unzip(compressedBytes);
    } catch (IOException e) {
      e.printStackTrace();
      assertTrue("caught exception '" + e + "' during unzip()",
                 false);
    }
    assertTrue("uncompressedBytes is wrong size", 
               uncompressedBytes.length == origBytes.length);

    for (int i= 0; i < origBytes.length; i++) 
      if (origBytes[i] != uncompressedBytes[i])
        assertTrue("uncompressedBytes does not match origBytes", false);
  }

  public void testZipUnzipBestEffort(byte[] origBytes) {
    byte[] compressedBytes= GZIPUtils.zip(origBytes);

    assertTrue("compressed array is not smaller!",
               compressedBytes.length < origBytes.length);

    byte[] uncompressedBytes= GZIPUtils.unzipBestEffort(compressedBytes);
    assertTrue("uncompressedBytes is wrong size", 
               uncompressedBytes.length == origBytes.length);

    for (int i= 0; i < origBytes.length; i++) 
      if (origBytes[i] != uncompressedBytes[i])
        assertTrue("uncompressedBytes does not match origBytes", false);
  }

  public void testTruncation(byte[] origBytes) {
    byte[] compressedBytes= GZIPUtils.zip(origBytes);

    System.out.println("original data has len " + origBytes.length);
    System.out.println("compressed data has len " 
                       + compressedBytes.length);

    for (int i= compressedBytes.length; i >= 0; i--) {

      byte[] truncCompressed= new byte[i];

      for (int j= 0; j < i; j++)
        truncCompressed[j]= compressedBytes[j];

      byte[] trunc= GZIPUtils.unzipBestEffort(truncCompressed);

      if (trunc == null) {
        System.out.println("truncated to len "
                           + i + ", trunc is null");
      } else {
        System.out.println("truncated to len "
                           + i + ", trunc.length=  " 
                           + trunc.length);

        for (int j= 0; j < trunc.length; j++)
          if (trunc[j] != origBytes[j]) 
            assertTrue("truncated/uncompressed array differs at pos "
                       + j + " (compressed data had been truncated to len "
                       + i + ")", false);
      }
    }
  }

  public void testLimit(byte[] origBytes) {
    byte[] compressedBytes= GZIPUtils.zip(origBytes);

    assertTrue("compressed array is not smaller!",
               compressedBytes.length < origBytes.length);

    for (int i= 0; i < origBytes.length; i++) {

      byte[] uncompressedBytes= 
        GZIPUtils.unzipBestEffort(compressedBytes, i);

      assertTrue("uncompressedBytes is wrong size", 
                 uncompressedBytes.length == i);

      for (int j= 0; j < i; j++) 
        if (origBytes[j] != uncompressedBytes[j])
          assertTrue("uncompressedBytes does not match origBytes", false);
    }
  }

}



-------------------------------------------------------
The SF.Net email is sponsored by EclipseCon 2004
Premiere Conference on Open Tools Development and Integration
See the breadth of Eclipse activity. February 3-5 in Anaheim, CA.
http://www.eclipsecon.org/osdn
_______________________________________________
Nutch-cvs mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-cvs

Reply via email to