Here is DumpSegment.java, which I use for debugging.
Its output is sorted by URL.

Doug:
Where should I put it: ./src/java/net/nutch/fetcher or
./src/java/net/nutch/util?

John


------------------------------ DumpSegment.java ------------------------------
/* Copyright (c) 2004 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import net.nutch.pagedb.FetchListEntry;
import net.nutch.io.*;
import net.nutch.util.*;
import net.nutch.protocol.*;
import net.nutch.parse.*;
//import net.nutch.plugin.*;

import java.io.File;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import java.util.Properties;
import java.util.logging.*;

/********************************
 * Dump contents in one segment.
 *
 * @author John Xing
 ********************************/

public class DumpSegment {

  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.fetcher.DumpSegment");

  // for stats
  private long start;                             // start time
  private long bytes;                             // total bytes parsed
  private int pages;                              // total pages parsed
  private int errors;                             // total pages errored

  private String fetcherFile;
  private String contentFile;
  private String parseDataFile;
  private String parseTextFile;

  private String unsortedFile;
  private String sortedFile;

  // ctor
  public DumpSegment(String directory) {
    this.fetcherFile = directory+"/"+FetcherOutput.DIR_NAME;
    this.contentFile = directory+"/"+Content.DIR_NAME;
    this.parseDataFile = directory+"/"+ParseData.DIR_NAME;
    this.parseTextFile = directory+"/"+ParseText.DIR_NAME;

    this.unsortedFile = directory+"/"+FetcherOutput.DIR_NAME+".url.unsorted";
    this.sortedFile = directory+"/"+FetcherOutput.DIR_NAME+".url.sorted";
  }

  // dump
  public void dump() throws IOException {

    ArrayFile.Reader fetcherReader = new ArrayFile.Reader(fetcherFile);
    ArrayFile.Reader contentReader = new ArrayFile.Reader(contentFile);
    ArrayFile.Reader parseDataReader = new ArrayFile.Reader(parseDataFile);
    ArrayFile.Reader parseTextReader = new ArrayFile.Reader(parseTextFile);

    SequenceFile.Reader seqReader = new SequenceFile.Reader(sortedFile);

    UTF8 url = new UTF8();
    LongWritable entry = new LongWritable();

    FetcherOutput fetcherOutput = new FetcherOutput();
    Content content = new Content();
    ParseData parseData = new ParseData();
    ParseText parseText = new ParseText();

    while (seqReader.next(url,entry)) {
      String urlString = url.toString();
      long recno = entry.get();

      if (fetcherReader.get(recno, fetcherOutput) == null
        || contentReader.get(recno, content) == null
        || parseDataReader.get(recno, parseData) == null
        || parseTextReader.get(recno, parseText) == null)
        break;

      //System.out.print("URL:: "+urlString+"\n");
      System.out.print("Recno:: "+recno+"\n");
      System.out.print("FetcherOutput::\n");
      System.out.print(fetcherOutput.toString());
      System.out.print("ParseData::\n";);
      System.out.print(parseData.toString());
      System.out.print("ParseText::\n");
      System.out.print(parseText.toString());
      System.out.print("\n");
    }

    fetcherReader.close();
    contentReader.close();
    parseDataReader.close();
    parseTextReader.close();

    seqReader.close();

    new File(sortedFile).delete();
  }

  // sort
  public void sort() throws IOException {

    // make a SequenceFile
    ArrayFile.Reader fetcherReader = new ArrayFile.Reader(fetcherFile);

    SequenceFile.Writer seqWriter = new SequenceFile.Writer
      (unsortedFile, UTF8.class, LongWritable.class);

    FetchListEntry fle;
    String urlString;
    FetcherOutput fetcherOutput = new FetcherOutput();

    long count = 0;
    while (fetcherReader.next(fetcherOutput) != null) {
      fle = fetcherOutput.getFetchListEntry();
      urlString = fle.getPage().getURL().toString();
      seqWriter.append(new UTF8(urlString), new LongWritable(count));
      count++;
    }

    fetcherReader.close();
    seqWriter.close();

    // sort the SequenceFile
    long start = System.currentTimeMillis();

    SequenceFile.Sorter sorter = new SequenceFile.Sorter
      (new UTF8.Comparator(), LongWritable.class);

    sorter.sort(unsortedFile, sortedFile);

    double localSecs = (System.currentTimeMillis() - start) / 1000.0;
    LOG.info("Sorted: " + count + " entries in " + localSecs + "s, "
      + (count/localSecs) + " entries/s");

    new File(unsortedFile).delete();
  }

  // run it
  public static void main(String[] args) throws Exception {
    boolean dump = false; String which = null;
    String logLevel = "info";
    String directory = null;

    //String usage = "Usage: DumpSegment [-logLevel level] dir";
    String usage = "Usage: DumpSegment dir";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }
      
    // parse command line
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-logLevel")) {
        logLevel = args[++i];
      } else if (i != args.length-1) {
        System.err.println(usage);
        System.exit(-1);
      } else {
        directory = args[i];
      }
    }

    DumpSegment dumpSegment = new DumpSegment(directory);
    LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));

    dumpSegment.sort();
    dumpSegment.dump();
  }
}


-------------------------------------------------------
This SF.Net email is sponsored by BEA Weblogic Workshop
FREE Java Enterprise J2EE developer tools!
Get your free copy of BEA WebLogic Workshop 8.1 today.
http://ads.osdn.com/?ad_id=4721&alloc_id=10040&op=click
_______________________________________________
Nutch-developers mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/nutch-developers

Reply via email to