p...

lewismc Wed, 28 Jan 2015 21:39:30 -0800

Modified: nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java Thu Jan 29 05:38:59 2015
@@ -33,16 +33,20 @@ public class FSUtils {
    * path. If removeOld is set to false then the old path will be set to the
    * name current.old.
    * 
-   * @param fs The FileSystem.
-   * @param current The end path, the one being replaced.
-   * @param replacement The path to replace with.
-   * @param removeOld True if we are removing the current path.
+   * @param fs
+   *          The FileSystem.
+   * @param current
+   *          The end path, the one being replaced.
+   * @param replacement
+   *          The path to replace with.
+   * @param removeOld
+   *          True if we are removing the current path.
    * 
-   * @throws IOException If an error occurs during replacement.
+   * @throws IOException
+   *           If an error occurs during replacement.
    */
   public static void replace(FileSystem fs, Path current, Path replacement,
-    boolean removeOld)
-    throws IOException {
+      boolean removeOld) throws IOException {
 
     // rename any current path to old
     Path old = new Path(current + ".old");
@@ -60,12 +64,14 @@ public class FSUtils {
   /**
    * Closes a group of SequenceFile readers.
    * 
-   * @param readers The SequenceFile readers to close.
-   * @throws IOException If an error occurs while closing a reader.
+   * @param readers
+   *          The SequenceFile readers to close.
+   * @throws IOException
+   *           If an error occurs while closing a reader.
    */
   public static void closeReaders(SequenceFile.Reader[] readers)
-    throws IOException {
-    
+      throws IOException {
+
     // loop through the readers, closing one by one
     if (readers != null) {
       for (int i = 0; i < readers.length; i++) {
@@ -80,12 +86,13 @@ public class FSUtils {
   /**
    * Closes a group of MapFile readers.
    * 
-   * @param readers The MapFile readers to close.
-   * @throws IOException If an error occurs while closing a reader.
+   * @param readers
+   *          The MapFile readers to close.
+   * @throws IOException
+   *           If an error occurs while closing a reader.
    */
-  public static void closeReaders(MapFile.Reader[] readers)
-    throws IOException {
-    
+  public static void closeReaders(MapFile.Reader[] readers) throws IOException {
+
     // loop through the readers closing one by one
     if (readers != null) {
       for (int i = 0; i < readers.length; i++) {


Modified: nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java Thu Jan 29 05:38:59 2015
@@ -28,19 +28,18 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- *  A collection of utility methods for working on GZIPed data.
+ * A collection of utility methods for working on GZIPed data.
  */
 public class GZIPUtils {
-  
+
   private static final Logger LOG = LoggerFactory.getLogger(GZIPUtils.class);
-  private static final int EXPECTED_COMPRESSION_RATIO= 5;
-  private static final int BUF_SIZE= 4096;
+  private static final int EXPECTED_COMPRESSION_RATIO = 5;
+  private static final int BUF_SIZE = 4096;
 
   /**
-   * Returns an gunzipped copy of the input array.  If the gzipped
-   * input has been truncated or corrupted, a best-effort attempt is
-   * made to unzip as much as possible.  If no data can be extracted
-   * <code>null</code> is returned.
+   * Returns an gunzipped copy of the input array. If the gzipped input has been
+   * truncated or corrupted, a best-effort attempt is made to unzip as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
    */
   public static final byte[] unzipBestEffort(byte[] in) {
     return unzipBestEffort(in, Integer.MAX_VALUE);
@@ -48,33 +47,32 @@ public class GZIPUtils {
 
   /**
    * Returns an gunzipped copy of the input array, truncated to
-   * <code>sizeLimit</code> bytes, if necessary.  If the gzipped input
-   * has been truncated or corrupted, a best-effort attempt is made to
-   * unzip as much as possible.  If no data can be extracted
-   * <code>null</code> is returned.
+   * <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
+   * truncated or corrupted, a best-effort attempt is made to unzip as much as
+   * possible. If no data can be extracted <code>null</code> is returned.
    */
   public static final byte[] unzipBestEffort(byte[] in, int sizeLimit) {
     try {
-      // decompress using GZIPInputStream 
-      ByteArrayOutputStream outStream = 
-        new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+      // decompress using GZIPInputStream
+      ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+          EXPECTED_COMPRESSION_RATIO * in.length);
 
-      GZIPInputStream inStream = 
-        new GZIPInputStream ( new ByteArrayInputStream(in) );
+      GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(
+          in));
 
       byte[] buf = new byte[BUF_SIZE];
       int written = 0;
       while (true) {
         try {
           int size = inStream.read(buf);
-          if (size <= 0) 
+          if (size <= 0)
             break;
           if ((written + size) > sizeLimit) {
             outStream.write(buf, 0, sizeLimit - written);
             break;
           }
           outStream.write(buf, 0, size);
-          written+= size;
+          written += size;
         } catch (Exception e) {
           break;
         }
@@ -91,23 +89,23 @@ public class GZIPUtils {
     }
   }
 
-
   /**
-   * Returns an gunzipped copy of the input array.  
-   * @throws IOException if the input cannot be properly decompressed
+   * Returns an gunzipped copy of the input array.
+   * 
+   * @throws IOException
+   *           if the input cannot be properly decompressed
    */
   public static final byte[] unzip(byte[] in) throws IOException {
-    // decompress using GZIPInputStream 
-    ByteArrayOutputStream outStream = 
-      new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+    // decompress using GZIPInputStream
+    ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+        EXPECTED_COMPRESSION_RATIO * in.length);
 
-    GZIPInputStream inStream = 
-      new GZIPInputStream ( new ByteArrayInputStream(in) );
+    GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in));
 
     byte[] buf = new byte[BUF_SIZE];
     while (true) {
       int size = inStream.read(buf);
-      if (size <= 0) 
+      if (size <= 0)
         break;
       outStream.write(buf, 0, size);
     }
@@ -121,11 +119,11 @@ public class GZIPUtils {
    */
   public static final byte[] zip(byte[] in) {
     try {
-      // compress using GZIPOutputStream 
-      ByteArrayOutputStream byteOut= 
-        new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+      // compress using GZIPOutputStream
+      ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+          / EXPECTED_COMPRESSION_RATIO);
 
-      GZIPOutputStream outStream= new GZIPOutputStream(byteOut);
+      GZIPOutputStream outStream = new GZIPOutputStream(byteOut);
 
       try {
         outStream.write(in);
@@ -146,5 +144,5 @@ public class GZIPUtils {
       return null;
     }
   }
-    
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Thu Jan 29 05:38:59 2015
@@ -24,12 +24,15 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.io.GenericWritable;
 import org.apache.hadoop.io.Writable;
 
-/** A generic Writable wrapper that can inject Configuration to {@link 
Configurable}s */ 
-public abstract class GenericWritableConfigurable extends GenericWritable 
-                                                  implements Configurable {
+/**
+ * A generic Writable wrapper that can inject Configuration to
+ * {@link Configurable}s
+ */
+public abstract class GenericWritableConfigurable extends GenericWritable
+    implements Configurable {
 
   private Configuration conf;
-  
+
   public Configuration getConf() {
     return conf;
   }
@@ -37,7 +40,7 @@ public abstract class GenericWritableCon
   public void setConf(Configuration conf) {
     this.conf = conf;
   }
-  
+
   @Override
   public void readFields(DataInput in) throws IOException {
     byte type = in.readByte();
@@ -50,8 +53,8 @@ public abstract class GenericWritableCon
     }
     Writable w = get();
     if (w instanceof Configurable)
-      ((Configurable)w).setConf(conf);
+      ((Configurable) w).setConf(conf);
     w.readFields(in);
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Thu Jan 29 05:38:59 2015
@@ -25,48 +25,48 @@ import org.apache.hadoop.fs.PathFilter;
 
 public class HadoopFSUtil {
 
-    /**
-     * Returns PathFilter that passes all paths through.
-     */
-    public static PathFilter getPassAllFilter() {
-        return new PathFilter() {
-            public boolean accept(Path arg0) {
-                return true;
-            }
-        };
-    }
+  /**
+   * Returns PathFilter that passes all paths through.
+   */
+  public static PathFilter getPassAllFilter() {
+    return new PathFilter() {
+      public boolean accept(Path arg0) {
+        return true;
+      }
+    };
+  }
+
+  /**
+   * Returns PathFilter that passes directories through.
+   */
+  public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
+    return new PathFilter() {
+      public boolean accept(final Path path) {
+        try {
+          return fs.getFileStatus(path).isDir();
+        } catch (IOException ioe) {
+          return false;
+        }
+      }
 
-    /**
-     * Returns PathFilter that passes directories through.
-     */
-    public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
-        return new PathFilter() {
-            public boolean accept(final Path path) {
-                try {
-                    return fs.getFileStatus(path).isDir();
-                } catch (IOException ioe) {
-                    return false;
-                }
-            }
+    };
+  }
 
-        };
+  /**
+   * Turns an array of FileStatus into an array of Paths.
+   */
+  public static Path[] getPaths(FileStatus[] stats) {
+    if (stats == null) {
+      return null;
     }
-    
-    /**
-     * Turns an array of FileStatus into an array of Paths.
-     */
-    public static Path[] getPaths(FileStatus[] stats) {
-      if (stats == null) {
-        return null;
-      }
-      if (stats.length == 0) {
-        return new Path[0];
-      }
-      Path[] res = new Path[stats.length];
-      for (int i = 0; i < stats.length; i++) {
-        res[i] = stats[i].getPath();
-      }
-      return res;
+    if (stats.length == 0) {
+      return new Path[0];
+    }
+    Path[] res = new Path[stats.length];
+    for (int i = 0; i < stats.length; i++) {
+      res[i] = stats[i].getPath();
     }
+    return res;
+  }
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Thu Jan 29 05:38:59 2015
@@ -28,22 +28,29 @@ import org.apache.hadoop.fs.Path;
  * @author Andrzej Bialecki
  */
 public class LockUtil {
-  
+
   /**
    * Create a lock file.
-   * @param fs filesystem
-   * @param lockFile name of the lock file
-   * @param accept if true, and the target file exists, consider it valid. If false
-   * and the target file exists, throw an IOException.
-   * @throws IOException if accept is false, and the target file already exists,
-   * or if it's a directory.
+   * 
+   * @param fs
+   *          filesystem
+   * @param lockFile
+   *          name of the lock file
+   * @param accept
+   *          if true, and the target file exists, consider it valid. If false
+   *          and the target file exists, throw an IOException.
+   * @throws IOException
+   *           if accept is false, and the target file already exists, or if
+   *           it's a directory.
    */
-  public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) throws IOException {
+  public static void createLockFile(FileSystem fs, Path lockFile, boolean accept)
+      throws IOException {
     if (fs.exists(lockFile)) {
-      if(!accept)
+      if (!accept)
         throw new IOException("lock file " + lockFile + " already exists.");
       if (fs.getFileStatus(lockFile).isDir())
-        throw new IOException("lock file " + lockFile + " already exists and is a directory.");
+        throw new IOException("lock file " + lockFile
+            + " already exists and is a directory.");
       // do nothing - the file already exists.
     } else {
       // make sure parents exist
@@ -55,16 +62,23 @@ public class LockUtil {
   /**
    * Remove lock file. NOTE: applications enforce the semantics of this file -
    * this method simply removes any file with a given name.
-   * @param fs filesystem
-   * @param lockFile lock file name
+   * 
+   * @param fs
+   *          filesystem
+   * @param lockFile
+   *          lock file name
    * @return false, if the lock file doesn't exist. True, if it existed and was
-   * successfully removed.
-   * @throws IOException if lock file exists but it is a directory.
+   *         successfully removed.
+   * @throws IOException
+   *           if lock file exists but it is a directory.
    */
-  public static boolean removeLockFile(FileSystem fs, Path lockFile) throws IOException {
-    if (!fs.exists(lockFile)) return false;
+  public static boolean removeLockFile(FileSystem fs, Path lockFile)
+      throws IOException {
+    if (!fs.exists(lockFile))
+      return false;
     if (fs.getFileStatus(lockFile).isDir())
-      throw new IOException("lock file " + lockFile + " exists but is a directory!");
+      throw new IOException("lock file " + lockFile
+          + " exists but is a directory!");
     return fs.delete(lockFile, false);
   }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Thu Jan 29 05:38:59 2015
@@ -45,12 +45,12 @@ import org.apache.nutch.protocol.Protoco
  * @author mattmann
  * @since NUTCH-608
  * 
- * <p>
- * This is a facade class to insulate Nutch from its underlying Mime Type
- * substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>.
- * Any mime handling code should be placed in this utility class, and hidden
- * from the Nutch classes that rely on it.
- * </p>
+ *        <p>
+ *        This is a facade class to insulate Nutch from its underlying Mime Type
+ *        substrate library, <a href="http://incubator.apache.org/tika/">Apache
+ *        Tika</a>. Any mime handling code should be placed in this utility
+ *        class, and hidden from the Nutch classes that rely on it.
+ *        </p>
  */
 public final class MimeUtil {
 
@@ -66,7 +66,8 @@ public final class MimeUtil {
   private boolean mimeMagic;
 
   /* our log stream */
-  private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName());
+  private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class
+      .getName());
 
   public MimeUtil(Configuration conf) {
     tika = new Tika();
@@ -75,25 +76,26 @@ public final class MimeUtil {
         .getName());
     if (mimeTypez == null) {
       try {
-          String customMimeTypeFile = conf.get("mime.types.file");
-          if (customMimeTypeFile!=null && customMimeTypeFile.equals("")==false){
-              try {
-              mimeTypez = MimeTypesFactory.create(conf
-                      .getConfResourceAsInputStream(customMimeTypeFile));
-              }
-              catch (Exception e){
-                  LOG.error("Can't load mime.types.file : "+customMimeTypeFile+" using Tika's default");
-              }
+        String customMimeTypeFile = conf.get("mime.types.file");
+        if (customMimeTypeFile != null
+            && customMimeTypeFile.equals("") == false) {
+          try {
+            mimeTypez = MimeTypesFactory.create(conf
+                .getConfResourceAsInputStream(customMimeTypeFile));
+          } catch (Exception e) {
+            LOG.error("Can't load mime.types.file : " + customMimeTypeFile
+                + " using Tika's default");
           }
-          if (mimeTypez==null)
-              mimeTypez = MimeTypes.getDefaultMimeTypes();
+        }
+        if (mimeTypez == null)
+          mimeTypez = MimeTypes.getDefaultMimeTypes();
       } catch (Exception e) {
-        LOG.error("Exception in MimeUtil "+e.getMessage());
+        LOG.error("Exception in MimeUtil " + e.getMessage());
         throw new RuntimeException(e);
       }
       objectCache.setObject(MimeTypes.class.getName(), mimeTypez);
     }
-    
+
     this.mimeTypes = mimeTypez;
     this.mimeMagic = conf.getBoolean("mime.type.magic", true);
   }
@@ -129,14 +131,13 @@ public final class MimeUtil {
   /**
    * A facade interface to trying all the possible mime type resolution
    * strategies available within Tika. First, the mime type provided in
-   * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}.
-   * Then the cleaned mime type is looked up in the underlying Tika
-   * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType}
-   * is found, then that mime type is used, otherwise URL resolution is
-   * used to try and determine the mime type. However, if
-   * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration},
-   * then mime type magic resolution is used to try and obtain a
-   * better-than-the-default approximation of the {@link MimeType}.
+   * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then
+   * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes}
+   * registry, by its cleaned name. If the {@link MimeType} is found, then that
+   * mime type is used, otherwise URL resolution is used to try and determine
+   * the mime type. However, if <code>mime.type.magic</code> is enabled in
+   * {@link NutchConfiguration}, then mime type magic resolution is used to try
+   * and obtain a better-than-the-default approximation of the {@link MimeType}.
    * 
    * @param typeName
    *          The original mime type, returned from a {@link ProtocolOutput}.
@@ -176,7 +177,7 @@ public final class MimeUtil {
         throw new RuntimeException(e);
       }
     } else {
-        retType = type.getName();
+      retType = type.getName();
     }
 
     // if magic is enabled use mime magic to guess if the mime type returned
@@ -194,14 +195,15 @@ public final class MimeUtil {
         InputStream stream = TikaInputStream.get(data);
         try {
           magicType = tika.detect(stream, tikaMeta);
-       } finally {
-         stream.close();
+        } finally {
+          stream.close();
         }
-      } catch (IOException ignore) {}
+      } catch (IOException ignore) {
+      }
 
       if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM)
-          && !magicType.equals(MimeTypes.PLAIN_TEXT)
-          && retType != null && !retType.equals(magicType)) {
+          && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null
+          && !retType.equals(magicType)) {
 
         // If magic enabled and the current mime type differs from that of the
         // one returned from the magic, take the magic mimeType
@@ -224,12 +226,12 @@ public final class MimeUtil {
   /**
    * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)}
    * method.
-   *
+   * 
    * @param url
    *          A string representation of the document {@link URL} to sense the
    *          {@link MimeType} for.
-   * @return An appropriate {@link MimeType}, identified from the given
-   *         Document url in string form.
+   * @return An appropriate {@link MimeType}, identified from the given Document
+   *         url in string form.
    */
   public String getMimeType(String url) {
     return tika.detect(url);
@@ -238,11 +240,11 @@ public final class MimeUtil {
   /**
    * A facade interface to Tika's underlying {@link MimeTypes#forName(String)}
    * method.
-   *
+   * 
    * @param name
    *          The name of a valid {@link MimeType} in the Tika mime registry.
-   * @return The object representation of the {@link MimeType}, if it exists,
-   *         or null otherwise.
+   * @return The object representation of the {@link MimeType}, if it exists, or
+   *         null otherwise.
    */
   public String forName(String name) {
     try {
@@ -257,7 +259,7 @@ public final class MimeUtil {
   /**
    * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)}
    * method.
-   *
+   * 
    * @param f
    *          The {@link File} to sense the {@link MimeType} for.
    * @return The {@link MimeType} of the given {@link File}, or null if it
@@ -273,5 +275,4 @@ public final class MimeUtil {
     }
   }
 
-
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Thu Jan 29 05:38:59 2015
@@ -22,13 +22,17 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
 /**
- * <p>A utility class that allows the walking of any DOM tree using a stack 
- * instead of recursion.  As the node tree is walked the next node is popped
- * off of the stack and all of its children are automatically added to the 
- * stack to be called in tree order.</p>
+ * <p>
+ * A utility class that allows the walking of any DOM tree using a stack instead
+ * of recursion. As the node tree is walked the next node is popped off of the
+ * stack and all of its children are automatically added to the stack to be
+ * called in tree order.
+ * </p>
  * 
- * <p>Currently this class is not thread safe.  It is assumed that only one
- * thread will be accessing the <code>NodeWalker</code> at any given time.</p>
+ * <p>
+ * Currently this class is not thread safe. It is assumed that only one thread
+ * will be accessing the <code>NodeWalker</code> at any given time.
+ * </p>
  */
 public class NodeWalker {
 
@@ -36,7 +40,7 @@ public class NodeWalker {
   private Node currentNode;
   private NodeList currentChildren;
   private Stack<Node> nodes;
-  
+
   /**
    * Starts the <code>Node</code> tree from the root node.
    * 
@@ -47,68 +51,74 @@ public class NodeWalker {
     nodes = new Stack<Node>();
     nodes.add(rootNode);
   }
-  
+
   /**
-   * <p>Returns the next <code>Node</code> on the stack and pushes all of its
-   * children onto the stack, allowing us to walk the node tree without the
-   * use of recursion.  If there are no more nodes on the stack then null is
-   * returned.</p>
+   * <p>
+   * Returns the next <code>Node</code> on the stack and pushes all of its
+   * children onto the stack, allowing us to walk the node tree without the use
+   * of recursion. If there are no more nodes on the stack then null is
+   * returned.
+   * </p>
    * 
-   * @return Node The next <code>Node</code> on the stack or null if there
-   * isn't a next node.
+   * @return Node The next <code>Node</code> on the stack or null if there isn't
+   *         a next node.
    */
   public Node nextNode() {
-    
+
     // if no next node return null
     if (!hasNext()) {
       return null;
     }
-    
+
     // pop the next node off of the stack and push all of its children onto
     // the stack
     currentNode = nodes.pop();
     currentChildren = currentNode.getChildNodes();
     int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
-    
+
     // put the children node on the stack in first to last order
     for (int i = childLen - 1; i >= 0; i--) {
       nodes.add(currentChildren.item(i));
     }
-    
+
     return currentNode;
   }
-  
+
   /**
-   * <p>Skips over and removes from the node stack the children of the last
-   * node.  When getting a next node from the walker, that node's children 
-   * are automatically added to the stack.  You can call this method to remove
-   * those children from the stack.</p>
-   * 
-   * <p>This is useful when you don't want to process deeper into the 
-   * current path of the node tree but you want to continue processing sibling
-   * nodes.</p>
-   *
+   * <p>
+   * Skips over and removes from the node stack the children of the last node.
+   * When getting a next node from the walker, that node's children are
+   * automatically added to the stack. You can call this method to remove those
+   * children from the stack.
+   * </p>
+   * 
+   * <p>
+   * This is useful when you don't want to process deeper into the current path
+   * of the node tree but you want to continue processing sibling nodes.
+   * </p>
+   * 
    */
   public void skipChildren() {
-    
+
     int childLen = (currentChildren != null) ? currentChildren.getLength() : 0;
-    
-    for (int i = 0 ; i < childLen ; i++) {
+
+    for (int i = 0; i < childLen; i++) {
       Node child = nodes.peek();
       if (child.equals(currentChildren.item(i))) {
         nodes.pop();
       }
     }
   }
-  
+
   /**
    * Return the current node.
+   * 
    * @return Node
    */
   public Node getCurrentNode() {
     return currentNode;
   }
-  
+
   /**
    * @return returns true if there are more nodes on the current stack.
    * 

Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Thu Jan 29 05:38:59 2015
@@ -23,37 +23,42 @@ import java.util.UUID;
 
 import org.apache.hadoop.conf.Configuration;
 
-
-/** Utility to create Hadoop {@link Configuration}s that include Nutch-specific
- * resources.  */
+/**
+ * Utility to create Hadoop {@link Configuration}s that include Nutch-specific
+ * resources.
+ */
 public class NutchConfiguration {
   public static final String UUID_KEY = "nutch.conf.uuid";
-  
-  private NutchConfiguration() {}                 // singleton
-  
+
+  private NutchConfiguration() {
+  } // singleton
+
   /*
-   * Configuration.hashCode() doesn't return values that
-   * correspond to a unique set of parameters. This is a workaround
-   * so that we can track instances of Configuration created by Nutch.
+   * Configuration.hashCode() doesn't return values that correspond to a unique
+   * set of parameters. This is a workaround so that we can track instances of
+   * Configuration created by Nutch.
    */
   private static void setUUID(Configuration conf) {
     UUID uuid = UUID.randomUUID();
     conf.set(UUID_KEY, uuid.toString());
   }
-  
+
   /**
-   * Retrieve a Nutch UUID of this configuration object, or null
-   * if the configuration was created elsewhere.
-   * @param conf configuration instance
+   * Retrieve a Nutch UUID of this configuration object, or null if the
+   * configuration was created elsewhere.
+   * 
+   * @param conf
+   *          configuration instance
    * @return uuid or null
    */
   public static String getUUID(Configuration conf) {
     return conf.get(UUID_KEY);
   }
 
-  /** Create a {@link Configuration} for Nutch. This will load the standard
-   * Nutch resources, <code>nutch-default.xml</code> and
-   * <code>nutch-site.xml</code> overrides.
+  /**
+   * Create a {@link Configuration} for Nutch. This will load the standard Nutch
+   * resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code>
+   * overrides.
    */
   public static Configuration create() {
     Configuration conf = new Configuration();
@@ -61,14 +66,19 @@ public class NutchConfiguration {
     addNutchResources(conf);
     return conf;
   }
-  
-  /** Create a {@link Configuration} from supplied properties.
-   * @param addNutchResources if true, then first <code>nutch-default.xml</code>,
-   * and then <code>nutch-site.xml</code> will be loaded prior to applying the
-   * properties. Otherwise these resources won't be used.
-   * @param nutchProperties a set of properties to define (or override)
+
+  /**
+   * Create a {@link Configuration} from supplied properties.
+   * 
+   * @param addNutchResources
+   *          if true, then first <code>nutch-default.xml</code>, and then
+   *          <code>nutch-site.xml</code> will be loaded prior to applying the
+   *          properties. Otherwise these resources won't be used.
+   * @param nutchProperties
+   *          a set of properties to define (or override)
    */
-  public static Configuration create(boolean addNutchResources, Properties nutchProperties) {
+  public static Configuration create(boolean addNutchResources,
+      Properties nutchProperties) {
     Configuration conf = new Configuration();
     setUUID(conf);
     if (addNutchResources) {
@@ -83,8 +93,8 @@ public class NutchConfiguration {
   /**
    * Add the standard Nutch resources to {@link Configuration}.
    * 
-   * @param conf               Configuration object to which
-   *                           configuration is to be added.
+   * @param conf
+   *          Configuration object to which configuration is to be added.
    */
   private static Configuration addNutchResources(Configuration conf) {
     conf.addResource("nutch-default.xml");
@@ -92,4 +102,3 @@ public class NutchConfiguration {
     return conf;
   }
 }
-

Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Thu Jan 29 05:38:59 2015
@@ -20,7 +20,7 @@ package org.apache.nutch.util;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapred.JobConf;
 
-/** A {@link JobConf} for Nutch jobs.  */
+/** A {@link JobConf} for Nutch jobs. */
 public class NutchJob extends JobConf {
 
   public NutchJob(Configuration conf) {
@@ -28,4 +28,3 @@ public class NutchJob extends JobConf {
   }
 
 }
-

Modified: nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java Thu Jan 29 05:38:59 2015
@@ -24,35 +24,33 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 
 public class ObjectCache {
-  
+
   private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class);
-  
-  private static final WeakHashMap<Configuration, ObjectCache> CACHE = 
-    new WeakHashMap<Configuration, ObjectCache>();
+
+  private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>();
 
   private final HashMap<String, Object> objectMap;
-  
+
   private ObjectCache() {
     objectMap = new HashMap<String, Object>();
   }
-  
+
   public synchronized static ObjectCache get(Configuration conf) {
     ObjectCache objectCache = CACHE.get(conf);
     if (objectCache == null) {
-      LOG.debug("No object cache found for conf=" + conf 
-                  + ", instantiating a new object cache");
+      LOG.debug("No object cache found for conf=" + conf
+          + ", instantiating a new object cache");
       objectCache = new ObjectCache();
       CACHE.put(conf, objectCache);
     }
     return objectCache;
   }
-  
+
   public synchronized Object getObject(String key) {
     return objectMap.get(key);
   }
-  
+
   public synchronized void setObject(String key, Object value) {
     objectMap.put(key, value);
   }
 }
-

Modified: nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java Thu Jan 29 05:38:59 2015
@@ -21,46 +21,47 @@ import java.util.Collection;
 import java.util.Iterator;
 
 /**
- * A class for efficiently matching <code>String</code>s against a set
- * of prefixes.
+ * A class for efficiently matching <code>String</code>s against a set of
+ * prefixes.
  */
 public class PrefixStringMatcher extends TrieStringMatcher {
 
   /**
    * Creates a new <code>PrefixStringMatcher</code> which will match
-   * <code>String</code>s with any prefix in the supplied array.
-   * Zero-length <code>Strings</code> are ignored.
+   * <code>String</code>s with any prefix in the supplied array. Zero-length
+   * <code>Strings</code> are ignored.
    */
   public PrefixStringMatcher(String[] prefixes) {
     super();
-    for (int i= 0; i < prefixes.length; i++)
+    for (int i = 0; i < prefixes.length; i++)
       addPatternForward(prefixes[i]);
   }
 
   /**
    * Creates a new <code>PrefixStringMatcher</code> which will match
-   * <code>String</code>s with any prefix in the supplied    
+   * <code>String</code>s with any prefix in the supplied
    * <code>Collection</code>.
-   *
-   * @throws ClassCastException if any <code>Object</code>s in the
-   * collection are not <code>String</code>s
+   * 
+   * @throws ClassCastException
+   *           if any <code>Object</code>s in the collection are not
+   *           <code>String</code>s
    */
   public PrefixStringMatcher(Collection<String> prefixes) {
     super();
-    Iterator<String> iter= prefixes.iterator();
+    Iterator<String> iter = prefixes.iterator();
     while (iter.hasNext())
       addPatternForward(iter.next());
   }
 
   /**
-   * Returns true if the given <code>String</code> is matched by a
-   * prefix in the trie
+   * Returns true if the given <code>String</code> is matched by a prefix in the
+   * trie
    */
   public boolean matches(String input) {
-    TrieNode node= root;
-    for (int i= 0; i < input.length(); i++) {
-      node= node.getChild(input.charAt(i));
-      if (node == null) 
+    TrieNode node = root;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
         return false;
       if (node.isTerminal())
         return true;
@@ -73,13 +74,13 @@ public class PrefixStringMatcher extends
    * or <code>null<code> if no match exists.
    */
   public String shortestMatch(String input) {
-    TrieNode node= root;
-    for (int i= 0; i < input.length(); i++) {
-      node= node.getChild(input.charAt(i));
-      if (node == null) 
+    TrieNode node = root;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
         return null;
       if (node.isTerminal())
-        return input.substring(0, i+1);
+        return input.substring(0, i + 1);
     }
     return null;
   }
@@ -89,29 +90,26 @@ public class PrefixStringMatcher extends
    * or <code>null<code> if no match exists.
    */
   public String longestMatch(String input) {
-    TrieNode node= root;
-    String result= null;
-    for (int i= 0; i < input.length(); i++) {
-      node= node.getChild(input.charAt(i));
-      if (node == null) 
+    TrieNode node = root;
+    String result = null;
+    for (int i = 0; i < input.length(); i++) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
         break;
       if (node.isTerminal())
-        result= input.substring(0, i+1);
+        result = input.substring(0, i + 1);
     }
     return result;
   }
 
   public static final void main(String[] argv) {
-    PrefixStringMatcher matcher= 
-      new PrefixStringMatcher( 
-        new String[] 
-        {"abcd", "abc", "aac", "baz", "foo", "foobar"} );
-
-    String[] tests= {"a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
-                     "aaccca", "abaz", "baz", "bazooka", "fo", "foobar",
-                     "kite", };
+    PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
+        "abcd", "abc", "aac", "baz", "foo", "foobar" });
 
-    for (int i= 0; i < tests.length; i++) {
+    String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
+        "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
+
+    for (int i = 0; i < tests.length; i++) {
       System.out.println("testing: " + tests[i]);
       System.out.println("   matches: " + matcher.matches(tests[i]));
       System.out.println("  shortest: " + matcher.shortestMatch(tests[i]));

Modified: nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Thu Jan 29 05:38:59 2015
@@ -18,42 +18,42 @@
 package org.apache.nutch.util;
 
 /**
- * A collection of String processing utility methods. 
+ * A collection of String processing utility methods.
  */
 public class StringUtil {
 
   /**
-   * Returns a copy of <code>s</code> padded with trailing spaces so
-   * that it's length is <code>length</code>.  Strings already
-   * <code>length</code> characters long or longer are not altered.
+   * Returns a copy of <code>s</code> padded with trailing spaces so that it's
+   * length is <code>length</code>. Strings already <code>length</code>
+   * characters long or longer are not altered.
    */
   public static String rightPad(String s, int length) {
-    StringBuffer sb= new StringBuffer(s);
-    for (int i= length - s.length(); i > 0; i--) 
+    StringBuffer sb = new StringBuffer(s);
+    for (int i = length - s.length(); i > 0; i--)
       sb.append(" ");
     return sb.toString();
   }
 
   /**
-   * Returns a copy of <code>s</code> padded with leading spaces so
-   * that it's length is <code>length</code>.  Strings already
-   * <code>length</code> characters long or longer are not altered.
+   * Returns a copy of <code>s</code> padded with leading spaces so that it's
+   * length is <code>length</code>. Strings already <code>length</code>
+   * characters long or longer are not altered.
    */
   public static String leftPad(String s, int length) {
-    StringBuffer sb= new StringBuffer();
-    for (int i= length - s.length(); i > 0; i--) 
+    StringBuffer sb = new StringBuffer();
+    for (int i = length - s.length(); i > 0; i--)
       sb.append(" ");
     sb.append(s);
     return sb.toString();
   }
 
-
-  private static final char[] HEX_DIGITS =
-  {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
+  private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
+      '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
 
   /**
    * Convenience call for {@link #toHexString(byte[], String, int)}, where
    * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+   * 
    * @param buf
    */
   public static String toHexString(byte[] buf) {
@@ -63,37 +63,48 @@ public class StringUtil {
   /**
    * Get a text representation of a byte[] as hexadecimal String, where each
    * pair of hexadecimal digits corresponds to consecutive bytes in the array.
-   * @param buf input data
-   * @param sep separate every pair of hexadecimal digits with this separator, or
-   * null if no separation is needed.
-   * @param lineLen break the output String into lines containing output for lineLen
-   * bytes.
+   * 
+   * @param buf
+   *          input data
+   * @param sep
+   *          separate every pair of hexadecimal digits with this separator, or
+   *          null if no separation is needed.
+   * @param lineLen
+   *          break the output String into lines containing output for lineLen
+   *          bytes.
    */
   public static String toHexString(byte[] buf, String sep, int lineLen) {
-    if (buf == null) return null;
-    if (lineLen <= 0) lineLen = Integer.MAX_VALUE;
+    if (buf == null)
+      return null;
+    if (lineLen <= 0)
+      lineLen = Integer.MAX_VALUE;
     StringBuffer res = new StringBuffer(buf.length * 2);
     for (int i = 0; i < buf.length; i++) {
       int b = buf[i];
       res.append(HEX_DIGITS[(b >> 4) & 0xf]);
       res.append(HEX_DIGITS[b & 0xf]);
-      if (i > 0 && (i % lineLen) == 0) res.append('\n');
-      else if (sep != null && i < lineLen - 1) res.append(sep); 
+      if (i > 0 && (i % lineLen) == 0)
+        res.append('\n');
+      else if (sep != null && i < lineLen - 1)
+        res.append(sep);
     }
     return res.toString();
   }
-  
+
   /**
    * Convert a String containing consecutive (no inside whitespace) hexadecimal
-   * digits into a corresponding byte array. If the number of digits is not even,
-   * a '0' will be appended in the front of the String prior to conversion.
-   * Leading and trailing whitespace is ignored.
-   * @param text input text
+   * digits into a corresponding byte array. If the number of digits is not
+   * even, a '0' will be appended in the front of the String prior to
+   * conversion. Leading and trailing whitespace is ignored.
+   * 
+   * @param text
+   *          input text
    * @return converted byte array, or null if unable to convert
    */
   public static byte[] fromHexString(String text) {
     text = text.trim();
-    if (text.length() % 2 != 0) text = "0" + text;
+    if (text.length() % 2 != 0)
+      text = "0" + text;
     int resLen = text.length() / 2;
     int loNibble, hiNibble;
     byte[] res = new byte[resLen];
@@ -101,12 +112,13 @@ public class StringUtil {
       int j = i << 1;
       hiNibble = charToNibble(text.charAt(j));
       loNibble = charToNibble(text.charAt(j + 1));
-      if (loNibble == -1 || hiNibble == -1) return null;
-      res[i] = (byte)(hiNibble << 4 | loNibble);
+      if (loNibble == -1 || hiNibble == -1)
+        return null;
+      res[i] = (byte) (hiNibble << 4 | loNibble);
     }
     return res;
   }
-  
+
   private static final int charToNibble(char c) {
     if (c >= '0' && c <= '9') {
       return c - '0';
@@ -125,7 +137,7 @@ public class StringUtil {
   public static boolean isEmpty(String str) {
     return (str == null) || (str.equals(""));
   }
-  
+
   /**
    * Simple character substitution which cleans all ï¿½ chars from a given 
String.
    */
@@ -136,8 +148,8 @@ public class StringUtil {
   public static void main(String[] args) {
     if (args.length != 1)
       System.out.println("Usage: StringUtil <encoding name>");
-    else 
-      System.out.println(args[0] + " is resolved to " +
-                         EncodingDetector.resolveEncodingAlias(args[0]));
+    else
+      System.out.println(args[0] + " is resolved to "
+          + EncodingDetector.resolveEncodingAlias(args[0]));
   }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java Thu Jan 29 05:38:59 2015
@@ -21,8 +21,8 @@ import java.util.Collection;
 import java.util.Iterator;
 
 /**
- * A class for efficiently matching <code>String</code>s against a set
- * of suffixes.  Zero-length <code>Strings</code> are ignored.
+ * A class for efficiently matching <code>String</code>s against a set of
+ * suffixes. Zero-length <code>Strings</code> are ignored.
  */
 public class SuffixStringMatcher extends TrieStringMatcher {
 
@@ -32,7 +32,7 @@ public class SuffixStringMatcher extends
    */
   public SuffixStringMatcher(String[] suffixes) {
     super();
-    for (int i= 0; i < suffixes.length; i++)
+    for (int i = 0; i < suffixes.length; i++)
       addPatternBackward(suffixes[i]);
   }
 
@@ -43,20 +43,20 @@ public class SuffixStringMatcher extends
    */
   public SuffixStringMatcher(Collection<String> suffixes) {
     super();
-    Iterator<String> iter= suffixes.iterator();
+    Iterator<String> iter = suffixes.iterator();
     while (iter.hasNext())
       addPatternBackward(iter.next());
   }
 
   /**
-   * Returns true if the given <code>String</code> is matched by a
-   * suffix in the trie
+   * Returns true if the given <code>String</code> is matched by a suffix in the
+   * trie
    */
   public boolean matches(String input) {
-    TrieNode node= root;
-    for (int i= input.length() - 1; i >= 0; i--) {
-      node= node.getChild(input.charAt(i));
-      if (node == null) 
+    TrieNode node = root;
+    for (int i = input.length() - 1; i >= 0; i--) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
         return false;
       if (node.isTerminal())
         return true;
@@ -64,16 +64,15 @@ public class SuffixStringMatcher extends
     return false;
   }
 
-
   /**
    * Returns the shortest suffix of <code>input<code> that is matched,
    * or <code>null<code> if no match exists.
    */
   public String shortestMatch(String input) {
-    TrieNode node= root;
-    for (int i= input.length() - 1; i >= 0; i--) {
-      node= node.getChild(input.charAt(i));
-      if (node == null) 
+    TrieNode node = root;
+    for (int i = input.length() - 1; i >= 0; i--) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
         return null;
       if (node.isTerminal())
         return input.substring(i);
@@ -86,29 +85,26 @@ public class SuffixStringMatcher extends
    * or <code>null<code> if no match exists.
    */
   public String longestMatch(String input) {
-    TrieNode node= root;
-    String result= null;
-    for (int i= input.length() - 1; i >= 0; i--) {
-      node= node.getChild(input.charAt(i));
-      if (node == null) 
+    TrieNode node = root;
+    String result = null;
+    for (int i = input.length() - 1; i >= 0; i--) {
+      node = node.getChild(input.charAt(i));
+      if (node == null)
         break;
       if (node.isTerminal())
-        result= input.substring(i);
+        result = input.substring(i);
     }
     return result;
   }
 
   public static final void main(String[] argv) {
-    SuffixStringMatcher matcher= 
-      new SuffixStringMatcher( 
-        new String[] 
-        {"a", "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar"} );
-
-    String[] tests= {"a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
-                    "aaccca", "abaz", "baz", "bazooka", "fo", "foobar",
-                    "kite", };
+    SuffixStringMatcher matcher = new SuffixStringMatcher(new String[] { "a",
+        "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar" });
+
+    String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac",
+        "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
 
-    for (int i= 0; i < tests.length; i++) {
+    for (int i = 0; i < tests.length; i++) {
       System.out.println("testing: " + tests[i]);
       System.out.println("   matches: " + matcher.matches(tests[i]));
       System.out.println("  shortest: " + matcher.shortestMatch(tests[i]));

Modified: nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java Thu Jan 29 05:38:59 2015
@@ -21,35 +21,39 @@ import java.text.NumberFormat;
 
 public class TimingUtil {
 
-    private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 };
+  private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 };
 
-    /**
-     * Calculate the elapsed time between two times specified in milliseconds.
-     * @param start The start of the time period
-     * @param end The end of the time period
-     * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y minutes and Z seconds or null if start > end.
-     */
-    public static String elapsedTime(long start, long end){
-        if (start > end) {
-            return null;
-        }
-
-        long[] elapsedTime = new long[TIME_FACTOR.length];
-
-        for (int i = 0; i < TIME_FACTOR.length; i++) {
-            elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i];
-            start += TIME_FACTOR[i] * elapsedTime[i];
-        }
-
-        NumberFormat nf = NumberFormat.getInstance();
-        nf.setMinimumIntegerDigits(2);
-        StringBuffer buf = new StringBuffer();
-        for (int i = 0; i < elapsedTime.length; i++) {
-            if (i > 0) {
-                buf.append(":");
-            }
-            buf.append(nf.format(elapsedTime[i]));
-        }
-        return buf.toString();
+  /**
+   * Calculate the elapsed time between two times specified in milliseconds.
+   * 
+   * @param start
+   *          The start of the time period
+   * @param end
+   *          The end of the time period
+   * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y
+   *         minutes and Z seconds or null if start > end.
+   */
+  public static String elapsedTime(long start, long end) {
+    if (start > end) {
+      return null;
     }
+
+    long[] elapsedTime = new long[TIME_FACTOR.length];
+
+    for (int i = 0; i < TIME_FACTOR.length; i++) {
+      elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i];
+      start += TIME_FACTOR[i] * elapsedTime[i];
+    }
+
+    NumberFormat nf = NumberFormat.getInstance();
+    nf.setMinimumIntegerDigits(2);
+    StringBuffer buf = new StringBuffer();
+    for (int i = 0; i < elapsedTime.length; i++) {
+      if (i > 0) {
+        buf.append(":");
+      }
+      buf.append(nf.format(elapsedTime[i]));
+    }
+    return buf.toString();
+  }
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java Thu Jan 29 05:38:59 2015
@@ -17,21 +17,19 @@
 
 package org.apache.nutch.util;
 
-
 import java.util.Arrays;
 import java.util.LinkedList;
 import java.util.ListIterator;
 
 /**
- * TrieStringMatcher is a base class for simple tree-based string
- * matching.
- *
+ * TrieStringMatcher is a base class for simple tree-based string matching.
+ * 
  */
 public abstract class TrieStringMatcher {
   protected TrieNode root;
 
   protected TrieStringMatcher() {
-    this.root= new TrieNode('\000', false);
+    this.root = new TrieNode('\000', false);
   }
 
   /**
@@ -44,20 +42,19 @@ public abstract class TrieStringMatcher
     protected boolean terminal;
 
     /**
-     * Creates a new TrieNode, which contains the given
-     * <code>nodeChar</code>.  If <code>isTerminal</code> is
-     * <code>true</code>, the new node is a <em>terminal</em> node in
-     * the trie.
-     */  
+     * Creates a new TrieNode, which contains the given <code>nodeChar</code>.
+     * If <code>isTerminal</code> is <code>true</code>, the new node is a
+     * <em>terminal</em> node in the trie.
+     */
     TrieNode(char nodeChar, boolean isTerminal) {
-      this.nodeChar= nodeChar;
-      this.terminal= isTerminal;
-      this.childrenList= new LinkedList<TrieNode>();
+      this.nodeChar = nodeChar;
+      this.terminal = isTerminal;
+      this.childrenList = new LinkedList<TrieNode>();
     }
 
     /**
-     * Returns <code>true</code> if this node is a <em>terminal</em>
-     * node in the trie.
+     * Returns <code>true</code> if this node is a <em>terminal</em> node in the
+     * trie.
      */
     boolean isTerminal() {
       return terminal;
@@ -65,67 +62,68 @@ public abstract class TrieStringMatcher
 
     /**
      * Returns the child node of this node whose node-character is
-     * <code>nextChar</code>.  If no such node exists, one will be is
-     * added.  If <em>isTerminal</em> is <code>true</code>, the node 
-     * will be a terminal node in the trie.
+     * <code>nextChar</code>. If no such node exists, one will be is added. If
+     * <em>isTerminal</em> is <code>true</code>, the node will be a terminal
+     * node in the trie.
      */
     TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) {
       if (childrenList == null) {
-        childrenList= new LinkedList<TrieNode>();
+        childrenList = new LinkedList<TrieNode>();
         childrenList.addAll(Arrays.asList(children));
-        children= null;
+        children = null;
       }
 
       if (childrenList.size() == 0) {
-        TrieNode newNode= new TrieNode(nextChar, isTerminal);
+        TrieNode newNode = new TrieNode(nextChar, isTerminal);
         childrenList.add(newNode);
         return newNode;
       }
 
-      ListIterator<TrieNode> iter= childrenList.listIterator();
-      TrieNode node= iter.next();
-      while ( (node.nodeChar < nextChar) && iter.hasNext() ) 
-        node= iter.next();
-                        
+      ListIterator<TrieNode> iter = childrenList.listIterator();
+      TrieNode node = iter.next();
+      while ((node.nodeChar < nextChar) && iter.hasNext())
+        node = iter.next();
+
       if (node.nodeChar == nextChar) {
-        node.terminal= node.terminal | isTerminal;
+        node.terminal = node.terminal | isTerminal;
         return node;
       }
 
-      if (node.nodeChar > nextChar) 
+      if (node.nodeChar > nextChar)
         iter.previous();
 
-      TrieNode newNode= new TrieNode(nextChar, isTerminal);
+      TrieNode newNode = new TrieNode(nextChar, isTerminal);
       iter.add(newNode);
-      return newNode;                   
+      return newNode;
     }
 
     /**
      * Returns the child node of this node whose node-character is
-     * <code>nextChar</code>.  If no such node exists,
-     * <code>null</code> is returned.
+     * <code>nextChar</code>. If no such node exists, <code>null</code> is
+     * returned.
      */
     TrieNode getChild(char nextChar) {
       if (children == null) {
-        children= childrenList.toArray(new TrieNode[childrenList.size()]);
-        childrenList= null;
+        children = childrenList.toArray(new TrieNode[childrenList.size()]);
+        childrenList = null;
         Arrays.sort(children);
       }
 
-      int min= 0;
-      int max= children.length - 1;
-      int mid= 0;
+      int min = 0;
+      int max = children.length - 1;
+      int mid = 0;
       while (min < max) {
-        mid= (min + max) / 2;
-        if (children[mid].nodeChar == nextChar) 
+        mid = (min + max) / 2;
+        if (children[mid].nodeChar == nextChar)
           return children[mid];
         if (children[mid].nodeChar < nextChar)
-          min= mid + 1;
-        else // if (children[mid].nodeChar > nextChar)
-          max= mid - 1;
+          min = mid + 1;
+        else
+          // if (children[mid].nodeChar > nextChar)
+          max = mid - 1;
       }
 
-      if (min == max) 
+      if (min == max)
         if (children[min].nodeChar == nextChar)
           return children[min];
 
@@ -133,59 +131,57 @@ public abstract class TrieStringMatcher
     }
 
     public int compareTo(TrieNode other) {
-      if (this.nodeChar < other.nodeChar) 
+      if (this.nodeChar < other.nodeChar)
         return -1;
-      if (this.nodeChar == other.nodeChar) 
+      if (this.nodeChar == other.nodeChar)
         return 0;
-//    if (this.nodeChar > other.nodeChar) 
+      // if (this.nodeChar > other.nodeChar)
       return 1;
     }
   }
 
   /**
    * Returns the next {@link TrieNode} visited, given that you are at
-   * <code>node</code>, and the the next character in the input is 
-   * the <code>idx</code>'th character of <code>s</code>.
+   * <code>node</code>, and the the next character in the input is the
+   * <code>idx</code>'th character of <code>s</code>.
    */
   protected final TrieNode matchChar(TrieNode node, String s, int idx) {
     return node.getChild(s.charAt(idx));
   }
 
   /**
-   * Adds any necessary nodes to the trie so that the given
-   * <code>String</code> can be decoded and the last character is
-   * represented by a terminal node.  Zero-length <code>Strings</code>
-   * are ignored.
+   * Adds any necessary nodes to the trie so that the given <code>String</code>
+   * can be decoded and the last character is represented by a terminal node.
+   * Zero-length <code>Strings</code> are ignored.
    */
   protected final void addPatternForward(String s) {
-    TrieNode node= root;
-    int stop= s.length() - 1;
+    TrieNode node = root;
+    int stop = s.length() - 1;
     int i;
     if (s.length() > 0) {
-      for (i= 0; i < stop; i++)
-        node= node.getChildAddIfNotPresent(s.charAt(i), false);
-      node= node.getChildAddIfNotPresent(s.charAt(i), true);
+      for (i = 0; i < stop; i++)
+        node = node.getChildAddIfNotPresent(s.charAt(i), false);
+      node = node.getChildAddIfNotPresent(s.charAt(i), true);
     }
   }
 
   /**
-   * Adds any necessary nodes to the trie so that the given
-   * <code>String</code> can be decoded <em>in reverse</em> and the
-   * first character is represented by a terminal node.  Zero-length
-   * <code>Strings</code> are ignored.
+   * Adds any necessary nodes to the trie so that the given <code>String</code>
+   * can be decoded <em>in reverse</em> and the first character is represented
+   * by a terminal node. Zero-length <code>Strings</code> are ignored.
    */
   protected final void addPatternBackward(String s) {
-    TrieNode node= root;
+    TrieNode node = root;
     if (s.length() > 0) {
-      for (int i= s.length()-1; i > 0; i--) 
-        node= node.getChildAddIfNotPresent(s.charAt(i), false);
-      node= node.getChildAddIfNotPresent(s.charAt(0), true);
+      for (int i = s.length() - 1; i > 0; i--)
+        node = node.getChildAddIfNotPresent(s.charAt(i), false);
+      node = node.getChildAddIfNotPresent(s.charAt(0), true);
     }
   }
 
   /**
-   * Returns true if the given <code>String</code> is matched by a
-   * pattern in the trie
+   * Returns true if the given <code>String</code> is matched by a pattern in
+   * the trie
    */
   public abstract boolean matches(String input);
 

Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Jan 29 05:38:59 2015
@@ -26,17 +26,20 @@ import org.apache.nutch.util.domain.Doma
 
 /** Utility class for URL analysis */
 public class URLUtil {
-  
+
   /**
-   * Resolve relative URL-s and fix a java.net.URL error
-   * in handling of URLs with pure query targets.
-   * @param base base url
-   * @param target target url (may be relative)
+   * Resolve relative URL-s and fix a java.net.URL error in handling of URLs
+   * with pure query targets.
+   * 
+   * @param base
+   *          base url
+   * @param target
+   *          target url (may be relative)
    * @return resolved absolute url.
    * @throws MalformedURLException
    */
   public static URL resolveURL(URL base, String target)
-          throws MalformedURLException {
+      throws MalformedURLException {
     target = target.trim();
 
     // handle the case that there is a target that is a pure query,
@@ -58,9 +61,10 @@ public class URLUtil {
   }
 
   /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */
-   static URL fixPureQueryTargets(URL base, String target)
-          throws MalformedURLException {
-    if (!target.startsWith("?")) return new URL(base, target);
+  static URL fixPureQueryTargets(URL base, String target)
+      throws MalformedURLException {
+    if (!target.startsWith("?"))
+      return new URL(base, target);
 
     String basePath = base.getPath();
     String baseRightMost = "";
@@ -69,63 +73,75 @@ public class URLUtil {
       baseRightMost = basePath.substring(baseRightMostIdx + 1);
     }
 
-    if (target.startsWith("?")) target = baseRightMost + target;
+    if (target.startsWith("?"))
+      target = baseRightMost + target;
 
     return new URL(base, target);
   }
 
-  private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
+  private static Pattern IP_PATTERN = Pattern
+      .compile("(\\d{1,3}\\.){3}(\\d{1,3})");
 
-  /** Returns the domain name of the url. The domain name of a url is
-   *  the substring of the url's hostname, w/o subdomain names. As an
-   *  example <br><code>
+  /**
+   * Returns the domain name of the url. The domain name of a url is the
+   * substring of the url's hostname, w/o subdomain names. As an example <br>
+   * <code>
    *  getDomainName(conf, new URL(http://lucene.apache.org/))
    *  </code><br>
-   *  will return <br><code> apache.org</code>
-   *   */
+   * will return <br>
+   * <code> apache.org</code>
+   * */
   public static String getDomainName(URL url) {
     DomainSuffixes tlds = DomainSuffixes.getInstance();
     String host = url.getHost();
-    //it seems that java returns hostnames ending with .
-    if(host.endsWith("."))
+    // it seems that java returns hostnames ending with .
+    if (host.endsWith("."))
       host = host.substring(0, host.length() - 1);
-    if(IP_PATTERN.matcher(host).matches())
+    if (IP_PATTERN.matcher(host).matches())
       return host;
-    
+
     int index = 0;
     String candidate = host;
-    for(;index >= 0;) {
+    for (; index >= 0;) {
       index = candidate.indexOf('.');
-      String subCandidate = candidate.substring(index+1); 
-      if(tlds.isDomainSuffix(subCandidate)) {
-        return candidate; 
+      String subCandidate = candidate.substring(index + 1);
+      if (tlds.isDomainSuffix(subCandidate)) {
+        return candidate;
       }
       candidate = subCandidate;
     }
     return candidate;
   }
 
-  /** Returns the domain name of the url. The domain name of a url is
-   *  the substring of the url's hostname, w/o subdomain names. As an
-   *  example <br><code>
+  /**
+   * Returns the domain name of the url. The domain name of a url is the
+   * substring of the url's hostname, w/o subdomain names. As an example <br>
+   * <code>
    *  getDomainName(conf, new http://lucene.apache.org/)
    *  </code><br>
-   *  will return <br><code> apache.org</code>
+   * will return <br>
+   * <code> apache.org</code>
+   * 
    * @throws MalformedURLException
    */
   public static String getDomainName(String url) throws MalformedURLException {
     return getDomainName(new URL(url));
   }
 
-  /** Returns the top level domain name of the url. The top level domain name
-   *  of a url is the substring of the url's hostname, w/o subdomain names.
-   *  As an example <br><code>
+  /**
+   * Returns the top level domain name of the url. The top level domain name of
+   * a url is the substring of the url's hostname, w/o subdomain names. As an
+   * example <br>
+   * <code>
    *  getTopLevelDomainName(conf, new http://lucene.apache.org/)
    *  </code><br>
-   *  will return <br><code> org</code>
+   * will return <br>
+   * <code> org</code>
+   * 
    * @throws MalformedURLException
    */
-  public static String getTopLevelDomainName(URL url) throws MalformedURLException {
+  public static String getTopLevelDomainName(URL url)
+      throws MalformedURLException {
     String suffix = getDomainSuffix(url).toString();
     int idx = suffix.lastIndexOf(".");
     if (idx != -1) {
@@ -135,94 +151,110 @@ public class URLUtil {
     }
   }
 
-  /** Returns the top level domain name of the url. The top level domain name
-   *  of a url is the substring of the url's hostname, w/o subdomain names.
-   *  As an example <br><code>
+  /**
+   * Returns the top level domain name of the url. The top level domain name of
+   * a url is the substring of the url's hostname, w/o subdomain names. As an
+   * example <br>
+   * <code>
    *  getTopLevelDomainName(conf, new http://lucene.apache.org/)
    *  </code><br>
-   *  will return <br><code> org</code>
+   * will return <br>
+   * <code> org</code>
+   * 
    * @throws MalformedURLException
    */
-  public static String getTopLevelDomainName(String url) throws MalformedURLException {
+  public static String getTopLevelDomainName(String url)
+      throws MalformedURLException {
     return getTopLevelDomainName(new URL(url));
   }
 
-  /** Returns whether the given urls have the same domain name.
-   * As an example, <br>
+  /**
+   * Returns whether the given urls have the same domain name. As an example, <br>
   * <code> isSameDomain(new URL("http://lucene.apache.org")
   * , new URL("http://people.apache.org/"))
    * <br> will return true. </code>
-   *
+   * 
    * @return true if the domain names are equal
    */
   public static boolean isSameDomainName(URL url1, URL url2) {
     return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
   }
 
-  /**Returns whether the given urls have the same domain name.
-  * As an example, <br>
-  * <code> isSameDomain("http://lucene.apache.org"
-  * ,"http://people.apache.org/")
-  * <br> will return true. </code>
-  * @return true if the domain names are equal
-  * @throws MalformedURLException
-  */
+  /**
+   * Returns whether the given urls have the same domain name. As an example, <br>
+   * <code> isSameDomain("http://lucene.apache.org"
+   * ,"http://people.apache.org/")
+   * <br> will return true. </code>
+   * 
+   * @return true if the domain names are equal
+   * @throws MalformedURLException
+   */
   public static boolean isSameDomainName(String url1, String url2)
-    throws MalformedURLException {
+      throws MalformedURLException {
     return isSameDomainName(new URL(url1), new URL(url2));
   }
 
-  /** Returns the {@link DomainSuffix} corresponding to the
-   * last public part of the hostname
+  /**
+   * Returns the {@link DomainSuffix} corresponding to the last public part of
+   * the hostname
    */
   public static DomainSuffix getDomainSuffix(URL url) {
     DomainSuffixes tlds = DomainSuffixes.getInstance();
     String host = url.getHost();
-    if(IP_PATTERN.matcher(host).matches())
+    if (IP_PATTERN.matcher(host).matches())
       return null;
-    
+
     int index = 0;
     String candidate = host;
-    for(;index >= 0;) {
+    for (; index >= 0;) {
       index = candidate.indexOf('.');
-      String subCandidate = candidate.substring(index+1);
+      String subCandidate = candidate.substring(index + 1);
       DomainSuffix d = tlds.get(subCandidate);
-      if(d != null) {
-        return d; 
+      if (d != null) {
+        return d;
       }
       candidate = subCandidate;
     }
     return null;
   }
 
-  /** Returns the {@link DomainSuffix} corresponding to the
-   * last public part of the hostname
+  /**
+   * Returns the {@link DomainSuffix} corresponding to the last public part of
+   * the hostname
    */
-  public static DomainSuffix getDomainSuffix(String url) throws MalformedURLException {
+  public static DomainSuffix getDomainSuffix(String url)
+      throws MalformedURLException {
     return getDomainSuffix(new URL(url));
   }
 
-  /** Partitions of the hostname of the url by "."  */
+  /** Partitions of the hostname of the url by "." */
   public static String[] getHostSegments(URL url) {
     String host = url.getHost();
-    //return whole hostname, if it is an ipv4
-    //TODO : handle ipv6
-    if(IP_PATTERN.matcher(host).matches())
-      return new String[] {host};
+    // return whole hostname, if it is an ipv4
+    // TODO : handle ipv6
+    if (IP_PATTERN.matcher(host).matches())
+      return new String[] { host };
     return host.split("\\.");
   }
 
-  /** Partitions of the hostname of the url by "."
-   * @throws MalformedURLException */
-  public static String[] getHostSegments(String url) throws MalformedURLException {
-   return getHostSegments(new URL(url));
+  /**
+   * Partitions of the hostname of the url by "."
+   * 
+   * @throws MalformedURLException
+   */
+  public static String[] getHostSegments(String url)
+      throws MalformedURLException {
+    return getHostSegments(new URL(url));
   }
 
   /**
-   * <p>Given two urls, a src and a destination of a redirect, it returns the 
-   * representative url.<p>
+   * <p>
+   * Given two urls, a src and a destination of a redirect, it returns the
+   * representative url.
+   * <p>
    * 
-   * <p>This method implements an extended version of the algorithm used by the
+   * <p>
+   * This method implements an extended version of the algorithm used by the
    * Yahoo! Slurp crawler described here:<br>
    * <a href=
    * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How
@@ -230,46 +262,63 @@ public class URLUtil {
    * <br>
    * <ol>
    * <li>Choose target url if either url is malformed.</li>
-   * <li>If different domains the keep the destination whether or not the 
+   * <li>If different domains the keep the destination whether or not the
    * redirect is temp or perm</li>
-   * <ul><li>a.com -> b.com*</li></ul>
+   * <ul>
+   * <li>a.com -> b.com*</li>
+   * </ul>
    * <li>If the redirect is permanent and the source is root, keep the source.</li>
-   * <ul><li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li></ul>
-   * <li>If the redirect is permanent and the source is not root and the 
+   * <ul>
+   * <li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li>
+   * </ul>
+   * <li>If the redirect is permanent and the source is not root and the
    * destination is root, keep the destination</li>
-   * <ul><li>a.com/xyz/index.html -> a.com*</li></ul>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com*</li>
+   * </ul>
    * <li>If the redirect is permanent and neither the source nor the destination
    * is root, then keep the destination</li>
-   * <ul><li>a.com/xyz/index.html -> a.com/abc/page.html*</li></ul>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li>
+   * </ul>
    * <li>If the redirect is temporary and source is root and destination is not
    * root, then keep the source</li>
-   * <ul><li>*a.com -> a.com/xyz/index.html</li></ul>
+   * <ul>
+   * <li>*a.com -> a.com/xyz/index.html</li>
+   * </ul>
    * <li>If the redirect is temporary and source is not root and destination is
    * root, then keep the destination</li>
-   * <ul><li>a.com/xyz/index.html -> a.com*</li></ul>
+   * <ul>
+   * <li>a.com/xyz/index.html -> a.com*</li>
+   * </ul>
    * <li>If the redirect is temporary and neither the source or the destination
-   * is root, then keep the shortest url.  First check for the shortest host,
-   * and if both are equal then check by path.  Path is first by length then by
-   * the number of / path separators.</li>
+   * is root, then keep the shortest url. First check for the shortest host, and
+   * if both are equal then check by path. Path is first by length then by the
+   * number of / path separators.</li>
    * <ul>
    * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li>
    * <li>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</li>
    * </ul>
    * <li>If the redirect is temporary and both the source and the destination
    * are root, then keep the shortest sub-domain</li>
-   * <ul><li>*www.a.com -> www.news.a.com</li></ul>
+   * <ul>
+   * <li>*www.a.com -> www.news.a.com</li>
+   * </ul>
    * <br>
-   * While not in this logic there is a further piece of representative url 
-   * logic that occurs during indexing and after scoring.  During creation of 
-   * the basic fields before indexing, if a url has a representative url stored
-   * we check both the url and its representative url (which should never be 
-   * the same) against their linkrank scores and the highest scoring one is 
-   * kept as the url and the lower scoring one is held as the orig url inside 
-   * of the index.
-   * 
-   * @param src The source url.
-   * @param dst The destination url.
-   * @param temp Is the redirect a temporary redirect.
+   * While not in this logic there is a further piece of representative url
+   * logic that occurs during indexing and after scoring. During creation of the
+   * basic fields before indexing, if a url has a representative url stored we
+   * check both the url and its representative url (which should never be the
+   * same) against their linkrank scores and the highest scoring one is kept as
+   * the url and the lower scoring one is held as the orig url inside of the
+   * index.
+   * 
+   * @param src
+   *          The source url.
+   * @param dst
+   *          The destination url.
+   * @param temp
+   *          Is the redirect a temporary redirect.
    * 
    * @return String The representative url.
    */
@@ -281,8 +330,7 @@ public class URLUtil {
     try {
       srcUrl = new URL(src);
       dstUrl = new URL(dst);
-    }
-    catch (MalformedURLException e) {
+    } catch (MalformedURLException e) {
       return dst;
     }
 
@@ -300,27 +348,27 @@ public class URLUtil {
 
     // 1) different domain them keep dest, temp or perm
     // a.com -> b.com*
-    //    
+    //
     // 2) permanent and root, keep src
     // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
-    //      
+    //
     // 3) permanent and not root and dest root, keep dest
     // a.com/xyz/index.html -> a.com*
-    //      
+    //
     // 4) permanent and neither root keep dest
     // a.com/xyz/index.html -> a.com/abc/page.html*
-    //      
+    //
     // 5) temp and root and dest not root keep src
     // *a.com -> a.com/xyz/index.html
-    //  
+    //
     // 7) temp and not root and dest root keep dest
     // a.com/xyz/index.html -> a.com*
-    //  
+    //
     // 8) temp and neither root, keep shortest, if hosts equal by path else by
     // hosts. paths are first by length then by number of / separators
     // a.com/xyz/index.html -> a.com/abc/page.html*
     // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
-    //  
+    //
     // 9) temp and both root keep shortest sub domain
     // *www.a.com -> www.news.a.com
 
@@ -332,39 +380,33 @@ public class URLUtil {
 
     // if it is a permanent redirect
     if (!temp) {
-      
+
       // if source is root return source, otherwise destination
       if (srcRoot) {
         return src;
-      }
-      else {
+      } else {
         return dst;
       }
-    }
-    else { // temporary redirect
+    } else { // temporary redirect
 
       // source root and destination not root
       if (srcRoot && !destRoot) {
         return src;
-      }
-      else if (!srcRoot && destRoot) { // destination root and source not
+      } else if (!srcRoot && destRoot) { // destination root and source not
         return dst;
-      }
-      else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) {
+      } else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) {
 
         // source and destination hosts are the same, check paths, host length
         int numSrcPaths = srcFile.split("/").length;
         int numDstPaths = dstFile.split("/").length;
         if (numSrcPaths != numDstPaths) {
           return (numDstPaths < numSrcPaths ? dst : src);
-        }
-        else {
+        } else {
           int srcPathLength = srcFile.length();
           int dstPathLength = dstFile.length();
           return (dstPathLength < srcPathLength ? dst : src);
         }
-      }
-      else {
+      } else {
 
         // different host names and both root take the shortest
         int numSrcSubs = srcHost.split("\\.").length;
@@ -378,24 +420,25 @@ public class URLUtil {
    * Returns the lowercased hostname for the url or null if the url is not well
    * formed.
    * 
-   * @param url The url to check.
+   * @param url
+   *          The url to check.
    * @return String The hostname for the url.
    */
   public static String getHost(String url) {
     try {
       return new URL(url).getHost().toLowerCase();
-    }
-    catch (MalformedURLException e) {
+    } catch (MalformedURLException e) {
       return null;
     }
   }
 
   /**
-   * Returns the page for the url.  The page consists of the protocol, host,
-   * and path, but does not include the query string.  The host is lowercased
-   * but the path is not.
+   * Returns the page for the url. The page consists of the protocol, host, and
+   * path, but does not include the query string. The host is lowercased but the
+   * path is not.
    * 
-   * @param url The url to check.
+   * @param url
+   *          The url to check.
    * @return String The page for the url.
    */
   public static String getPage(String url) {
@@ -404,12 +447,11 @@ public class URLUtil {
       url = url.toLowerCase();
       String queryStr = new URL(url).getQuery();
       return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
-    }
-    catch (MalformedURLException e) {
+    } catch (MalformedURLException e) {
       return null;
     }
   }
-  
+
   public static String getProtocol(String url) {
     try {
       return getProtocol(new URL(url));
@@ -417,7 +459,7 @@ public class URLUtil {
       return null;
     }
   }
-  
+
   public static String getProtocol(URL url) {
     return url.getProtocol();
   }
@@ -431,17 +473,11 @@ public class URLUtil {
         // also do not add additional slashes for file: URLs (NUTCH-1880)
         return url;
       }
-      URI p = new URI(u.getProtocol(),
-        u.getUserInfo(),
-        IDN.toASCII(host),
-        u.getPort(),
-        u.getPath(),
-        u.getQuery(),
-        u.getRef());
+      URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
+          u.getPort(), u.getPath(), u.getQuery(), u.getRef());
 
       return p.toString();
-    }
-    catch (Exception e) {
+    } catch (Exception e) {
       return null;
     }
   }
@@ -474,26 +510,23 @@ public class URLUtil {
       }
 
       return sb.toString();
-    }
-    catch (Exception e) {
+    } catch (Exception e) {
       return null;
     }
   }
 
-
   /** For testing */
-  public static void main(String[] args){
-    
-    if(args.length!=1) {
+  public static void main(String[] args) {
+
+    if (args.length != 1) {
       System.err.println("Usage : URLUtil <url>");
-      return ;
+      return;
     }
-    
+
     String url = args[0];
     try {
       System.out.println(URLUtil.getDomainName(new URL(url)));
-    }
-    catch (MalformedURLException ex) {
+    } catch (MalformedURLException ex) {
       ex.printStackTrace();
     }
   }

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java Thu Jan 29 05:38:59 2015
@@ -48,12 +48,15 @@ import org.apache.nutch.util.URLUtil;
  */
 public class DomainStatistics extends Configured implements Tool {
 
-  private static final Logger LOG = LoggerFactory.getLogger(DomainStatistics.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainStatistics.class);
 
   private static final Text FETCHED_TEXT = new Text("FETCHED");
   private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED");
 
-  public static enum MyCounter {FETCHED, NOT_FETCHED, EMPTY_RESULT};
+  public static enum MyCounter {
+    FETCHED, NOT_FETCHED, EMPTY_RESULT
+  };
 
   private static final int MODE_HOST = 1;
   private static final int MODE_DOMAIN = 2;
@@ -64,7 +67,8 @@ public class DomainStatistics extends Co
 
   public int run(String[] args) throws Exception {
     if (args.length < 3) {
-      System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
+      System.out
+          .println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]");
       return 1;
     }
     String inputDir = args[0];
@@ -81,16 +85,16 @@ public class DomainStatistics extends Co
 
     int mode = 0;
     String jobName = "DomainStatistics";
-    if(args[2].equals("host")) {
+    if (args[2].equals("host")) {
       jobName = "Host statistics";
       mode = MODE_HOST;
-    } else if(args[2].equals("domain")) {
-      jobName  = "Domain statistics";
+    } else if (args[2].equals("domain")) {
+      jobName = "Domain statistics";
       mode = MODE_DOMAIN;
-    } else if(args[2].equals("suffix")) {
+    } else if (args[2].equals("suffix")) {
       jobName = "Suffix statistics";
       mode = MODE_SUFFIX;
-    } else if(args[2].equals("tld")) {
+    } else if (args[2].equals("tld")) {
       jobName = "TLD statistics";
       mode = MODE_TLD;
     }
@@ -128,59 +132,65 @@ public class DomainStatistics extends Co
     }
 
     long end = System.currentTimeMillis();
-    LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+    LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: "
+        + TimingUtil.elapsedTime(start, end));
     return 0;
   }
 
-  static class DomainStatisticsMapper extends Mapper<Text, CrawlDatum, Text, LongWritable> {
+  static class DomainStatisticsMapper extends
+      Mapper<Text, CrawlDatum, Text, LongWritable> {
     int mode = 0;
 
     public void setup(Context context) {
-      mode = context.getConfiguration().getInt("domain.statistics.mode", MODE_DOMAIN);
+      mode = context.getConfiguration().getInt("domain.statistics.mode",
+          MODE_DOMAIN);
     }
 
-    public void map(Text urlText, CrawlDatum datum, Context context) throws IOException, InterruptedException {
+    public void map(Text urlText, CrawlDatum datum, Context context)
+        throws IOException, InterruptedException {
 
-      if(datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
+      if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED
           || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
 
         try {
           URL url = new URL(urlText.toString());
           String out = null;
           switch (mode) {
-            case MODE_HOST:
-              out = url.getHost();
-              break;
-            case MODE_DOMAIN:
-              out = URLUtil.getDomainName(url);
-              break;
-            case MODE_SUFFIX:
-              out = URLUtil.getDomainSuffix(url).getDomain();
-              break;
-            case MODE_TLD:
-              out = URLUtil.getTopLevelDomainName(url);
-              break;
+          case MODE_HOST:
+            out = url.getHost();
+            break;
+          case MODE_DOMAIN:
+            out = URLUtil.getDomainName(url);
+            break;
+          case MODE_SUFFIX:
+            out = URLUtil.getDomainSuffix(url).getDomain();
+            break;
+          case MODE_TLD:
+            out = URLUtil.getTopLevelDomainName(url);
+            break;
           }
-          if(out.trim().equals("")) {
+          if (out.trim().equals("")) {
             LOG.info("url : " + url);
             context.getCounter(MyCounter.EMPTY_RESULT).increment(1);
           }
 
           context.write(new Text(out), new LongWritable(1));
-        } catch (Exception ex) { }
+        } catch (Exception ex) {
+        }
 
         context.getCounter(MyCounter.FETCHED).increment(1);
         context.write(FETCHED_TEXT, new LongWritable(1));
-      }
-      else {
+      } else {
         context.getCounter(MyCounter.NOT_FETCHED).increment(1);
         context.write(NOT_FETCHED_TEXT, new LongWritable(1));
       }
     }
   }
 
-  static class DomainStatisticsReducer extends Reducer <Text, LongWritable, LongWritable, Text> {
-    public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
+  static class DomainStatisticsReducer extends
+      Reducer<Text, LongWritable, LongWritable, Text> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
       long total = 0;
 
       for (LongWritable val : values) {
@@ -191,8 +201,10 @@ public class DomainStatistics extends Co
     }
   }
 
-  public static class DomainStatisticsCombiner extends Reducer <Text, LongWritable, Text, LongWritable> {
-    public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
+  public static class DomainStatisticsCombiner extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
       long total = 0;
 
       for (LongWritable val : values) {

svn commit: r1655526 [12/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Reply via email to