Modified: nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/FSUtils.java Thu Jan 29 05:38:59 2015 @@ -33,16 +33,20 @@ public class FSUtils { * path. If removeOld is set to false then the old path will be set to the * name current.old. * - * @param fs The FileSystem. - * @param current The end path, the one being replaced. - * @param replacement The path to replace with. - * @param removeOld True if we are removing the current path. + * @param fs + * The FileSystem. + * @param current + * The end path, the one being replaced. + * @param replacement + * The path to replace with. + * @param removeOld + * True if we are removing the current path. * - * @throws IOException If an error occurs during replacement. + * @throws IOException + * If an error occurs during replacement. */ public static void replace(FileSystem fs, Path current, Path replacement, - boolean removeOld) - throws IOException { + boolean removeOld) throws IOException { // rename any current path to old Path old = new Path(current + ".old"); @@ -60,12 +64,14 @@ public class FSUtils { /** * Closes a group of SequenceFile readers. * - * @param readers The SequenceFile readers to close. - * @throws IOException If an error occurs while closing a reader. + * @param readers + * The SequenceFile readers to close. + * @throws IOException + * If an error occurs while closing a reader. */ public static void closeReaders(SequenceFile.Reader[] readers) - throws IOException { - + throws IOException { + // loop through the readers, closing one by one if (readers != null) { for (int i = 0; i < readers.length; i++) { @@ -80,12 +86,13 @@ public class FSUtils { /** * Closes a group of MapFile readers. * - * @param readers The MapFile readers to close. - * @throws IOException If an error occurs while closing a reader. + * @param readers + * The MapFile readers to close. + * @throws IOException + * If an error occurs while closing a reader. */ - public static void closeReaders(MapFile.Reader[] readers) - throws IOException { - + public static void closeReaders(MapFile.Reader[] readers) throws IOException { + // loop through the readers closing one by one if (readers != null) { for (int i = 0; i < readers.length; i++) {
Modified: nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/GZIPUtils.java Thu Jan 29 05:38:59 2015 @@ -28,19 +28,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * A collection of utility methods for working on GZIPed data. + * A collection of utility methods for working on GZIPed data. */ public class GZIPUtils { - + private static final Logger LOG = LoggerFactory.getLogger(GZIPUtils.class); - private static final int EXPECTED_COMPRESSION_RATIO= 5; - private static final int BUF_SIZE= 4096; + private static final int EXPECTED_COMPRESSION_RATIO = 5; + private static final int BUF_SIZE = 4096; /** - * Returns an gunzipped copy of the input array. If the gzipped - * input has been truncated or corrupted, a best-effort attempt is - * made to unzip as much as possible. If no data can be extracted - * <code>null</code> is returned. + * Returns an gunzipped copy of the input array. If the gzipped input has been + * truncated or corrupted, a best-effort attempt is made to unzip as much as + * possible. If no data can be extracted <code>null</code> is returned. */ public static final byte[] unzipBestEffort(byte[] in) { return unzipBestEffort(in, Integer.MAX_VALUE); @@ -48,33 +47,32 @@ public class GZIPUtils { /** * Returns an gunzipped copy of the input array, truncated to - * <code>sizeLimit</code> bytes, if necessary. If the gzipped input - * has been truncated or corrupted, a best-effort attempt is made to - * unzip as much as possible. If no data can be extracted - * <code>null</code> is returned. + * <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been + * truncated or corrupted, a best-effort attempt is made to unzip as much as + * possible. If no data can be extracted <code>null</code> is returned. */ public static final byte[] unzipBestEffort(byte[] in, int sizeLimit) { try { - // decompress using GZIPInputStream - ByteArrayOutputStream outStream = - new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length); + // decompress using GZIPInputStream + ByteArrayOutputStream outStream = new ByteArrayOutputStream( + EXPECTED_COMPRESSION_RATIO * in.length); - GZIPInputStream inStream = - new GZIPInputStream ( new ByteArrayInputStream(in) ); + GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream( + in)); byte[] buf = new byte[BUF_SIZE]; int written = 0; while (true) { try { int size = inStream.read(buf); - if (size <= 0) + if (size <= 0) break; if ((written + size) > sizeLimit) { outStream.write(buf, 0, sizeLimit - written); break; } outStream.write(buf, 0, size); - written+= size; + written += size; } catch (Exception e) { break; } @@ -91,23 +89,23 @@ public class GZIPUtils { } } - /** - * Returns an gunzipped copy of the input array. - * @throws IOException if the input cannot be properly decompressed + * Returns an gunzipped copy of the input array. + * + * @throws IOException + * if the input cannot be properly decompressed */ public static final byte[] unzip(byte[] in) throws IOException { - // decompress using GZIPInputStream - ByteArrayOutputStream outStream = - new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length); + // decompress using GZIPInputStream + ByteArrayOutputStream outStream = new ByteArrayOutputStream( + EXPECTED_COMPRESSION_RATIO * in.length); - GZIPInputStream inStream = - new GZIPInputStream ( new ByteArrayInputStream(in) ); + GZIPInputStream inStream = new GZIPInputStream(new ByteArrayInputStream(in)); byte[] buf = new byte[BUF_SIZE]; while (true) { int size = inStream.read(buf); - if (size <= 0) + if (size <= 0) break; outStream.write(buf, 0, size); } @@ -121,11 +119,11 @@ public class GZIPUtils { */ public static final byte[] zip(byte[] in) { try { - // compress using GZIPOutputStream - ByteArrayOutputStream byteOut= - new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO); + // compress using GZIPOutputStream + ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length + / EXPECTED_COMPRESSION_RATIO); - GZIPOutputStream outStream= new GZIPOutputStream(byteOut); + GZIPOutputStream outStream = new GZIPOutputStream(byteOut); try { outStream.write(in); @@ -146,5 +144,5 @@ public class GZIPUtils { return null; } } - + } Modified: nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Thu Jan 29 05:38:59 2015 @@ -24,12 +24,15 @@ import org.apache.hadoop.conf.Configurat import org.apache.hadoop.io.GenericWritable; import org.apache.hadoop.io.Writable; -/** A generic Writable wrapper that can inject Configuration to {@link Configurable}s */ -public abstract class GenericWritableConfigurable extends GenericWritable - implements Configurable { +/** + * A generic Writable wrapper that can inject Configuration to + * {@link Configurable}s + */ +public abstract class GenericWritableConfigurable extends GenericWritable + implements Configurable { private Configuration conf; - + public Configuration getConf() { return conf; } @@ -37,7 +40,7 @@ public abstract class GenericWritableCon public void setConf(Configuration conf) { this.conf = conf; } - + @Override public void readFields(DataInput in) throws IOException { byte type = in.readByte(); @@ -50,8 +53,8 @@ public abstract class GenericWritableCon } Writable w = get(); if (w instanceof Configurable) - ((Configurable)w).setConf(conf); + ((Configurable) w).setConf(conf); w.readFields(in); } - + } Modified: nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/HadoopFSUtil.java Thu Jan 29 05:38:59 2015 @@ -25,48 +25,48 @@ import org.apache.hadoop.fs.PathFilter; public class HadoopFSUtil { - /** - * Returns PathFilter that passes all paths through. - */ - public static PathFilter getPassAllFilter() { - return new PathFilter() { - public boolean accept(Path arg0) { - return true; - } - }; - } + /** + * Returns PathFilter that passes all paths through. + */ + public static PathFilter getPassAllFilter() { + return new PathFilter() { + public boolean accept(Path arg0) { + return true; + } + }; + } + + /** + * Returns PathFilter that passes directories through. + */ + public static PathFilter getPassDirectoriesFilter(final FileSystem fs) { + return new PathFilter() { + public boolean accept(final Path path) { + try { + return fs.getFileStatus(path).isDir(); + } catch (IOException ioe) { + return false; + } + } - /** - * Returns PathFilter that passes directories through. - */ - public static PathFilter getPassDirectoriesFilter(final FileSystem fs) { - return new PathFilter() { - public boolean accept(final Path path) { - try { - return fs.getFileStatus(path).isDir(); - } catch (IOException ioe) { - return false; - } - } + }; + } - }; + /** + * Turns an array of FileStatus into an array of Paths. + */ + public static Path[] getPaths(FileStatus[] stats) { + if (stats == null) { + return null; } - - /** - * Turns an array of FileStatus into an array of Paths. - */ - public static Path[] getPaths(FileStatus[] stats) { - if (stats == null) { - return null; - } - if (stats.length == 0) { - return new Path[0]; - } - Path[] res = new Path[stats.length]; - for (int i = 0; i < stats.length; i++) { - res[i] = stats[i].getPath(); - } - return res; + if (stats.length == 0) { + return new Path[0]; + } + Path[] res = new Path[stats.length]; + for (int i = 0; i < stats.length; i++) { + res[i] = stats[i].getPath(); } + return res; + } } Modified: nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/LockUtil.java Thu Jan 29 05:38:59 2015 @@ -28,22 +28,29 @@ import org.apache.hadoop.fs.Path; * @author Andrzej Bialecki */ public class LockUtil { - + /** * Create a lock file. - * @param fs filesystem - * @param lockFile name of the lock file - * @param accept if true, and the target file exists, consider it valid. If false - * and the target file exists, throw an IOException. - * @throws IOException if accept is false, and the target file already exists, - * or if it's a directory. + * + * @param fs + * filesystem + * @param lockFile + * name of the lock file + * @param accept + * if true, and the target file exists, consider it valid. If false + * and the target file exists, throw an IOException. + * @throws IOException + * if accept is false, and the target file already exists, or if + * it's a directory. */ - public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) throws IOException { + public static void createLockFile(FileSystem fs, Path lockFile, boolean accept) + throws IOException { if (fs.exists(lockFile)) { - if(!accept) + if (!accept) throw new IOException("lock file " + lockFile + " already exists."); if (fs.getFileStatus(lockFile).isDir()) - throw new IOException("lock file " + lockFile + " already exists and is a directory."); + throw new IOException("lock file " + lockFile + + " already exists and is a directory."); // do nothing - the file already exists. } else { // make sure parents exist @@ -55,16 +62,23 @@ public class LockUtil { /** * Remove lock file. NOTE: applications enforce the semantics of this file - * this method simply removes any file with a given name. - * @param fs filesystem - * @param lockFile lock file name + * + * @param fs + * filesystem + * @param lockFile + * lock file name * @return false, if the lock file doesn't exist. True, if it existed and was - * successfully removed. - * @throws IOException if lock file exists but it is a directory. + * successfully removed. + * @throws IOException + * if lock file exists but it is a directory. */ - public static boolean removeLockFile(FileSystem fs, Path lockFile) throws IOException { - if (!fs.exists(lockFile)) return false; + public static boolean removeLockFile(FileSystem fs, Path lockFile) + throws IOException { + if (!fs.exists(lockFile)) + return false; if (fs.getFileStatus(lockFile).isDir()) - throw new IOException("lock file " + lockFile + " exists but is a directory!"); + throw new IOException("lock file " + lockFile + + " exists but is a directory!"); return fs.delete(lockFile, false); } } Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Thu Jan 29 05:38:59 2015 @@ -45,12 +45,12 @@ import org.apache.nutch.protocol.Protoco * @author mattmann * @since NUTCH-608 * - * <p> - * This is a facade class to insulate Nutch from its underlying Mime Type - * substrate library, <a href="http://incubator.apache.org/tika/">Apache Tika</a>. - * Any mime handling code should be placed in this utility class, and hidden - * from the Nutch classes that rely on it. - * </p> + * <p> + * This is a facade class to insulate Nutch from its underlying Mime Type + * substrate library, <a href="http://incubator.apache.org/tika/">Apache + * Tika</a>. Any mime handling code should be placed in this utility + * class, and hidden from the Nutch classes that rely on it. + * </p> */ public final class MimeUtil { @@ -66,7 +66,8 @@ public final class MimeUtil { private boolean mimeMagic; /* our log stream */ - private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class.getName()); + private static final Logger LOG = LoggerFactory.getLogger(MimeUtil.class + .getName()); public MimeUtil(Configuration conf) { tika = new Tika(); @@ -75,25 +76,26 @@ public final class MimeUtil { .getName()); if (mimeTypez == null) { try { - String customMimeTypeFile = conf.get("mime.types.file"); - if (customMimeTypeFile!=null && customMimeTypeFile.equals("")==false){ - try { - mimeTypez = MimeTypesFactory.create(conf - .getConfResourceAsInputStream(customMimeTypeFile)); - } - catch (Exception e){ - LOG.error("Can't load mime.types.file : "+customMimeTypeFile+" using Tika's default"); - } + String customMimeTypeFile = conf.get("mime.types.file"); + if (customMimeTypeFile != null + && customMimeTypeFile.equals("") == false) { + try { + mimeTypez = MimeTypesFactory.create(conf + .getConfResourceAsInputStream(customMimeTypeFile)); + } catch (Exception e) { + LOG.error("Can't load mime.types.file : " + customMimeTypeFile + + " using Tika's default"); } - if (mimeTypez==null) - mimeTypez = MimeTypes.getDefaultMimeTypes(); + } + if (mimeTypez == null) + mimeTypez = MimeTypes.getDefaultMimeTypes(); } catch (Exception e) { - LOG.error("Exception in MimeUtil "+e.getMessage()); + LOG.error("Exception in MimeUtil " + e.getMessage()); throw new RuntimeException(e); } objectCache.setObject(MimeTypes.class.getName(), mimeTypez); } - + this.mimeTypes = mimeTypez; this.mimeMagic = conf.getBoolean("mime.type.magic", true); } @@ -129,14 +131,13 @@ public final class MimeUtil { /** * A facade interface to trying all the possible mime type resolution * strategies available within Tika. First, the mime type provided in - * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. - * Then the cleaned mime type is looked up in the underlying Tika - * {@link MimeTypes} registry, by its cleaned name. If the {@link MimeType} - * is found, then that mime type is used, otherwise URL resolution is - * used to try and determine the mime type. However, if - * <code>mime.type.magic</code> is enabled in {@link NutchConfiguration}, - * then mime type magic resolution is used to try and obtain a - * better-than-the-default approximation of the {@link MimeType}. + * <code>typeName</code> is cleaned, with {@link #cleanMimeType(String)}. Then + * the cleaned mime type is looked up in the underlying Tika {@link MimeTypes} + * registry, by its cleaned name. If the {@link MimeType} is found, then that + * mime type is used, otherwise URL resolution is used to try and determine + * the mime type. However, if <code>mime.type.magic</code> is enabled in + * {@link NutchConfiguration}, then mime type magic resolution is used to try + * and obtain a better-than-the-default approximation of the {@link MimeType}. * * @param typeName * The original mime type, returned from a {@link ProtocolOutput}. @@ -176,7 +177,7 @@ public final class MimeUtil { throw new RuntimeException(e); } } else { - retType = type.getName(); + retType = type.getName(); } // if magic is enabled use mime magic to guess if the mime type returned @@ -194,14 +195,15 @@ public final class MimeUtil { InputStream stream = TikaInputStream.get(data); try { magicType = tika.detect(stream, tikaMeta); - } finally { - stream.close(); + } finally { + stream.close(); } - } catch (IOException ignore) {} + } catch (IOException ignore) { + } if (magicType != null && !magicType.equals(MimeTypes.OCTET_STREAM) - && !magicType.equals(MimeTypes.PLAIN_TEXT) - && retType != null && !retType.equals(magicType)) { + && !magicType.equals(MimeTypes.PLAIN_TEXT) && retType != null + && !retType.equals(magicType)) { // If magic enabled and the current mime type differs from that of the // one returned from the magic, take the magic mimeType @@ -224,12 +226,12 @@ public final class MimeUtil { /** * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(String)} * method. - * + * * @param url * A string representation of the document {@link URL} to sense the * {@link MimeType} for. - * @return An appropriate {@link MimeType}, identified from the given - * Document url in string form. + * @return An appropriate {@link MimeType}, identified from the given Document + * url in string form. */ public String getMimeType(String url) { return tika.detect(url); @@ -238,11 +240,11 @@ public final class MimeUtil { /** * A facade interface to Tika's underlying {@link MimeTypes#forName(String)} * method. - * + * * @param name * The name of a valid {@link MimeType} in the Tika mime registry. - * @return The object representation of the {@link MimeType}, if it exists, - * or null otherwise. + * @return The object representation of the {@link MimeType}, if it exists, or + * null otherwise. */ public String forName(String name) { try { @@ -257,7 +259,7 @@ public final class MimeUtil { /** * Facade interface to Tika's underlying {@link MimeTypes#getMimeType(File)} * method. - * + * * @param f * The {@link File} to sense the {@link MimeType} for. * @return The {@link MimeType} of the given {@link File}, or null if it @@ -273,5 +275,4 @@ public final class MimeUtil { } } - } Modified: nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NodeWalker.java Thu Jan 29 05:38:59 2015 @@ -22,13 +22,17 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** - * <p>A utility class that allows the walking of any DOM tree using a stack - * instead of recursion. As the node tree is walked the next node is popped - * off of the stack and all of its children are automatically added to the - * stack to be called in tree order.</p> + * <p> + * A utility class that allows the walking of any DOM tree using a stack instead + * of recursion. As the node tree is walked the next node is popped off of the + * stack and all of its children are automatically added to the stack to be + * called in tree order. + * </p> * - * <p>Currently this class is not thread safe. It is assumed that only one - * thread will be accessing the <code>NodeWalker</code> at any given time.</p> + * <p> + * Currently this class is not thread safe. It is assumed that only one thread + * will be accessing the <code>NodeWalker</code> at any given time. + * </p> */ public class NodeWalker { @@ -36,7 +40,7 @@ public class NodeWalker { private Node currentNode; private NodeList currentChildren; private Stack<Node> nodes; - + /** * Starts the <code>Node</code> tree from the root node. * @@ -47,68 +51,74 @@ public class NodeWalker { nodes = new Stack<Node>(); nodes.add(rootNode); } - + /** - * <p>Returns the next <code>Node</code> on the stack and pushes all of its - * children onto the stack, allowing us to walk the node tree without the - * use of recursion. If there are no more nodes on the stack then null is - * returned.</p> + * <p> + * Returns the next <code>Node</code> on the stack and pushes all of its + * children onto the stack, allowing us to walk the node tree without the use + * of recursion. If there are no more nodes on the stack then null is + * returned. + * </p> * - * @return Node The next <code>Node</code> on the stack or null if there - * isn't a next node. + * @return Node The next <code>Node</code> on the stack or null if there isn't + * a next node. */ public Node nextNode() { - + // if no next node return null if (!hasNext()) { return null; } - + // pop the next node off of the stack and push all of its children onto // the stack currentNode = nodes.pop(); currentChildren = currentNode.getChildNodes(); int childLen = (currentChildren != null) ? currentChildren.getLength() : 0; - + // put the children node on the stack in first to last order for (int i = childLen - 1; i >= 0; i--) { nodes.add(currentChildren.item(i)); } - + return currentNode; } - + /** - * <p>Skips over and removes from the node stack the children of the last - * node. When getting a next node from the walker, that node's children - * are automatically added to the stack. You can call this method to remove - * those children from the stack.</p> - * - * <p>This is useful when you don't want to process deeper into the - * current path of the node tree but you want to continue processing sibling - * nodes.</p> - * + * <p> + * Skips over and removes from the node stack the children of the last node. + * When getting a next node from the walker, that node's children are + * automatically added to the stack. You can call this method to remove those + * children from the stack. + * </p> + * + * <p> + * This is useful when you don't want to process deeper into the current path + * of the node tree but you want to continue processing sibling nodes. + * </p> + * */ public void skipChildren() { - + int childLen = (currentChildren != null) ? currentChildren.getLength() : 0; - - for (int i = 0 ; i < childLen ; i++) { + + for (int i = 0; i < childLen; i++) { Node child = nodes.peek(); if (child.equals(currentChildren.item(i))) { nodes.pop(); } } } - + /** * Return the current node. + * * @return Node */ public Node getCurrentNode() { return currentNode; } - + /** * @return returns true if there are more nodes on the current stack. * Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NutchConfiguration.java Thu Jan 29 05:38:59 2015 @@ -23,37 +23,42 @@ import java.util.UUID; import org.apache.hadoop.conf.Configuration; - -/** Utility to create Hadoop {@link Configuration}s that include Nutch-specific - * resources. */ +/** + * Utility to create Hadoop {@link Configuration}s that include Nutch-specific + * resources. + */ public class NutchConfiguration { public static final String UUID_KEY = "nutch.conf.uuid"; - - private NutchConfiguration() {} // singleton - + + private NutchConfiguration() { + } // singleton + /* - * Configuration.hashCode() doesn't return values that - * correspond to a unique set of parameters. This is a workaround - * so that we can track instances of Configuration created by Nutch. + * Configuration.hashCode() doesn't return values that correspond to a unique + * set of parameters. This is a workaround so that we can track instances of + * Configuration created by Nutch. */ private static void setUUID(Configuration conf) { UUID uuid = UUID.randomUUID(); conf.set(UUID_KEY, uuid.toString()); } - + /** - * Retrieve a Nutch UUID of this configuration object, or null - * if the configuration was created elsewhere. - * @param conf configuration instance + * Retrieve a Nutch UUID of this configuration object, or null if the + * configuration was created elsewhere. + * + * @param conf + * configuration instance * @return uuid or null */ public static String getUUID(Configuration conf) { return conf.get(UUID_KEY); } - /** Create a {@link Configuration} for Nutch. This will load the standard - * Nutch resources, <code>nutch-default.xml</code> and - * <code>nutch-site.xml</code> overrides. + /** + * Create a {@link Configuration} for Nutch. This will load the standard Nutch + * resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code> + * overrides. */ public static Configuration create() { Configuration conf = new Configuration(); @@ -61,14 +66,19 @@ public class NutchConfiguration { addNutchResources(conf); return conf; } - - /** Create a {@link Configuration} from supplied properties. - * @param addNutchResources if true, then first <code>nutch-default.xml</code>, - * and then <code>nutch-site.xml</code> will be loaded prior to applying the - * properties. Otherwise these resources won't be used. - * @param nutchProperties a set of properties to define (or override) + + /** + * Create a {@link Configuration} from supplied properties. + * + * @param addNutchResources + * if true, then first <code>nutch-default.xml</code>, and then + * <code>nutch-site.xml</code> will be loaded prior to applying the + * properties. Otherwise these resources won't be used. + * @param nutchProperties + * a set of properties to define (or override) */ - public static Configuration create(boolean addNutchResources, Properties nutchProperties) { + public static Configuration create(boolean addNutchResources, + Properties nutchProperties) { Configuration conf = new Configuration(); setUUID(conf); if (addNutchResources) { @@ -83,8 +93,8 @@ public class NutchConfiguration { /** * Add the standard Nutch resources to {@link Configuration}. * - * @param conf Configuration object to which - * configuration is to be added. + * @param conf + * Configuration object to which configuration is to be added. */ private static Configuration addNutchResources(Configuration conf) { conf.addResource("nutch-default.xml"); @@ -92,4 +102,3 @@ public class NutchConfiguration { return conf; } } - Modified: nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/NutchJob.java Thu Jan 29 05:38:59 2015 @@ -20,7 +20,7 @@ package org.apache.nutch.util; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapred.JobConf; -/** A {@link JobConf} for Nutch jobs. */ +/** A {@link JobConf} for Nutch jobs. */ public class NutchJob extends JobConf { public NutchJob(Configuration conf) { @@ -28,4 +28,3 @@ public class NutchJob extends JobConf { } } - Modified: nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/ObjectCache.java Thu Jan 29 05:38:59 2015 @@ -24,35 +24,33 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; public class ObjectCache { - + private static final Logger LOG = LoggerFactory.getLogger(ObjectCache.class); - - private static final WeakHashMap<Configuration, ObjectCache> CACHE = - new WeakHashMap<Configuration, ObjectCache>(); + + private static final WeakHashMap<Configuration, ObjectCache> CACHE = new WeakHashMap<Configuration, ObjectCache>(); private final HashMap<String, Object> objectMap; - + private ObjectCache() { objectMap = new HashMap<String, Object>(); } - + public synchronized static ObjectCache get(Configuration conf) { ObjectCache objectCache = CACHE.get(conf); if (objectCache == null) { - LOG.debug("No object cache found for conf=" + conf - + ", instantiating a new object cache"); + LOG.debug("No object cache found for conf=" + conf + + ", instantiating a new object cache"); objectCache = new ObjectCache(); CACHE.put(conf, objectCache); } return objectCache; } - + public synchronized Object getObject(String key) { return objectMap.get(key); } - + public synchronized void setObject(String key, Object value) { objectMap.put(key, value); } } - Modified: nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java Thu Jan 29 05:38:59 2015 @@ -21,46 +21,47 @@ import java.util.Collection; import java.util.Iterator; /** - * A class for efficiently matching <code>String</code>s against a set - * of prefixes. + * A class for efficiently matching <code>String</code>s against a set of + * prefixes. */ public class PrefixStringMatcher extends TrieStringMatcher { /** * Creates a new <code>PrefixStringMatcher</code> which will match - * <code>String</code>s with any prefix in the supplied array. - * Zero-length <code>Strings</code> are ignored. + * <code>String</code>s with any prefix in the supplied array. Zero-length + * <code>Strings</code> are ignored. */ public PrefixStringMatcher(String[] prefixes) { super(); - for (int i= 0; i < prefixes.length; i++) + for (int i = 0; i < prefixes.length; i++) addPatternForward(prefixes[i]); } /** * Creates a new <code>PrefixStringMatcher</code> which will match - * <code>String</code>s with any prefix in the supplied + * <code>String</code>s with any prefix in the supplied * <code>Collection</code>. - * - * @throws ClassCastException if any <code>Object</code>s in the - * collection are not <code>String</code>s + * + * @throws ClassCastException + * if any <code>Object</code>s in the collection are not + * <code>String</code>s */ public PrefixStringMatcher(Collection<String> prefixes) { super(); - Iterator<String> iter= prefixes.iterator(); + Iterator<String> iter = prefixes.iterator(); while (iter.hasNext()) addPatternForward(iter.next()); } /** - * Returns true if the given <code>String</code> is matched by a - * prefix in the trie + * Returns true if the given <code>String</code> is matched by a prefix in the + * trie */ public boolean matches(String input) { - TrieNode node= root; - for (int i= 0; i < input.length(); i++) { - node= node.getChild(input.charAt(i)); - if (node == null) + TrieNode node = root; + for (int i = 0; i < input.length(); i++) { + node = node.getChild(input.charAt(i)); + if (node == null) return false; if (node.isTerminal()) return true; @@ -73,13 +74,13 @@ public class PrefixStringMatcher extends * or <code>null<code> if no match exists. */ public String shortestMatch(String input) { - TrieNode node= root; - for (int i= 0; i < input.length(); i++) { - node= node.getChild(input.charAt(i)); - if (node == null) + TrieNode node = root; + for (int i = 0; i < input.length(); i++) { + node = node.getChild(input.charAt(i)); + if (node == null) return null; if (node.isTerminal()) - return input.substring(0, i+1); + return input.substring(0, i + 1); } return null; } @@ -89,29 +90,26 @@ public class PrefixStringMatcher extends * or <code>null<code> if no match exists. */ public String longestMatch(String input) { - TrieNode node= root; - String result= null; - for (int i= 0; i < input.length(); i++) { - node= node.getChild(input.charAt(i)); - if (node == null) + TrieNode node = root; + String result = null; + for (int i = 0; i < input.length(); i++) { + node = node.getChild(input.charAt(i)); + if (node == null) break; if (node.isTerminal()) - result= input.substring(0, i+1); + result = input.substring(0, i + 1); } return result; } public static final void main(String[] argv) { - PrefixStringMatcher matcher= - new PrefixStringMatcher( - new String[] - {"abcd", "abc", "aac", "baz", "foo", "foobar"} ); - - String[] tests= {"a", "ab", "abc", "abcdefg", "apple", "aa", "aac", - "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", - "kite", }; + PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] { + "abcd", "abc", "aac", "baz", "foo", "foobar" }); - for (int i= 0; i < tests.length; i++) { + String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac", + "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", }; + + for (int i = 0; i < tests.length; i++) { System.out.println("testing: " + tests[i]); System.out.println(" matches: " + matcher.matches(tests[i])); System.out.println(" shortest: " + matcher.shortestMatch(tests[i])); Modified: nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Thu Jan 29 05:38:59 2015 @@ -18,42 +18,42 @@ package org.apache.nutch.util; /** - * A collection of String processing utility methods. + * A collection of String processing utility methods. */ public class StringUtil { /** - * Returns a copy of <code>s</code> padded with trailing spaces so - * that it's length is <code>length</code>. Strings already - * <code>length</code> characters long or longer are not altered. + * Returns a copy of <code>s</code> padded with trailing spaces so that it's + * length is <code>length</code>. Strings already <code>length</code> + * characters long or longer are not altered. */ public static String rightPad(String s, int length) { - StringBuffer sb= new StringBuffer(s); - for (int i= length - s.length(); i > 0; i--) + StringBuffer sb = new StringBuffer(s); + for (int i = length - s.length(); i > 0; i--) sb.append(" "); return sb.toString(); } /** - * Returns a copy of <code>s</code> padded with leading spaces so - * that it's length is <code>length</code>. Strings already - * <code>length</code> characters long or longer are not altered. + * Returns a copy of <code>s</code> padded with leading spaces so that it's + * length is <code>length</code>. Strings already <code>length</code> + * characters long or longer are not altered. */ public static String leftPad(String s, int length) { - StringBuffer sb= new StringBuffer(); - for (int i= length - s.length(); i > 0; i--) + StringBuffer sb = new StringBuffer(); + for (int i = length - s.length(); i > 0; i--) sb.append(" "); sb.append(s); return sb.toString(); } - - private static final char[] HEX_DIGITS = - {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'}; + private static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6', + '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' }; /** * Convenience call for {@link #toHexString(byte[], String, int)}, where * <code>sep = null; lineLen = Integer.MAX_VALUE</code>. + * * @param buf */ public static String toHexString(byte[] buf) { @@ -63,37 +63,48 @@ public class StringUtil { /** * Get a text representation of a byte[] as hexadecimal String, where each * pair of hexadecimal digits corresponds to consecutive bytes in the array. - * @param buf input data - * @param sep separate every pair of hexadecimal digits with this separator, or - * null if no separation is needed. - * @param lineLen break the output String into lines containing output for lineLen - * bytes. + * + * @param buf + * input data + * @param sep + * separate every pair of hexadecimal digits with this separator, or + * null if no separation is needed. + * @param lineLen + * break the output String into lines containing output for lineLen + * bytes. */ public static String toHexString(byte[] buf, String sep, int lineLen) { - if (buf == null) return null; - if (lineLen <= 0) lineLen = Integer.MAX_VALUE; + if (buf == null) + return null; + if (lineLen <= 0) + lineLen = Integer.MAX_VALUE; StringBuffer res = new StringBuffer(buf.length * 2); for (int i = 0; i < buf.length; i++) { int b = buf[i]; res.append(HEX_DIGITS[(b >> 4) & 0xf]); res.append(HEX_DIGITS[b & 0xf]); - if (i > 0 && (i % lineLen) == 0) res.append('\n'); - else if (sep != null && i < lineLen - 1) res.append(sep); + if (i > 0 && (i % lineLen) == 0) + res.append('\n'); + else if (sep != null && i < lineLen - 1) + res.append(sep); } return res.toString(); } - + /** * Convert a String containing consecutive (no inside whitespace) hexadecimal - * digits into a corresponding byte array. If the number of digits is not even, - * a '0' will be appended in the front of the String prior to conversion. - * Leading and trailing whitespace is ignored. - * @param text input text + * digits into a corresponding byte array. If the number of digits is not + * even, a '0' will be appended in the front of the String prior to + * conversion. Leading and trailing whitespace is ignored. + * + * @param text + * input text * @return converted byte array, or null if unable to convert */ public static byte[] fromHexString(String text) { text = text.trim(); - if (text.length() % 2 != 0) text = "0" + text; + if (text.length() % 2 != 0) + text = "0" + text; int resLen = text.length() / 2; int loNibble, hiNibble; byte[] res = new byte[resLen]; @@ -101,12 +112,13 @@ public class StringUtil { int j = i << 1; hiNibble = charToNibble(text.charAt(j)); loNibble = charToNibble(text.charAt(j + 1)); - if (loNibble == -1 || hiNibble == -1) return null; - res[i] = (byte)(hiNibble << 4 | loNibble); + if (loNibble == -1 || hiNibble == -1) + return null; + res[i] = (byte) (hiNibble << 4 | loNibble); } return res; } - + private static final int charToNibble(char c) { if (c >= '0' && c <= '9') { return c - '0'; @@ -125,7 +137,7 @@ public class StringUtil { public static boolean isEmpty(String str) { return (str == null) || (str.equals("")); } - + /** * Simple character substitution which cleans all � chars from a given String. */ @@ -136,8 +148,8 @@ public class StringUtil { public static void main(String[] args) { if (args.length != 1) System.out.println("Usage: StringUtil <encoding name>"); - else - System.out.println(args[0] + " is resolved to " + - EncodingDetector.resolveEncodingAlias(args[0])); + else + System.out.println(args[0] + " is resolved to " + + EncodingDetector.resolveEncodingAlias(args[0])); } } Modified: nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java Thu Jan 29 05:38:59 2015 @@ -21,8 +21,8 @@ import java.util.Collection; import java.util.Iterator; /** - * A class for efficiently matching <code>String</code>s against a set - * of suffixes. Zero-length <code>Strings</code> are ignored. + * A class for efficiently matching <code>String</code>s against a set of + * suffixes. Zero-length <code>Strings</code> are ignored. */ public class SuffixStringMatcher extends TrieStringMatcher { @@ -32,7 +32,7 @@ public class SuffixStringMatcher extends */ public SuffixStringMatcher(String[] suffixes) { super(); - for (int i= 0; i < suffixes.length; i++) + for (int i = 0; i < suffixes.length; i++) addPatternBackward(suffixes[i]); } @@ -43,20 +43,20 @@ public class SuffixStringMatcher extends */ public SuffixStringMatcher(Collection<String> suffixes) { super(); - Iterator<String> iter= suffixes.iterator(); + Iterator<String> iter = suffixes.iterator(); while (iter.hasNext()) addPatternBackward(iter.next()); } /** - * Returns true if the given <code>String</code> is matched by a - * suffix in the trie + * Returns true if the given <code>String</code> is matched by a suffix in the + * trie */ public boolean matches(String input) { - TrieNode node= root; - for (int i= input.length() - 1; i >= 0; i--) { - node= node.getChild(input.charAt(i)); - if (node == null) + TrieNode node = root; + for (int i = input.length() - 1; i >= 0; i--) { + node = node.getChild(input.charAt(i)); + if (node == null) return false; if (node.isTerminal()) return true; @@ -64,16 +64,15 @@ public class SuffixStringMatcher extends return false; } - /** * Returns the shortest suffix of <code>input<code> that is matched, * or <code>null<code> if no match exists. */ public String shortestMatch(String input) { - TrieNode node= root; - for (int i= input.length() - 1; i >= 0; i--) { - node= node.getChild(input.charAt(i)); - if (node == null) + TrieNode node = root; + for (int i = input.length() - 1; i >= 0; i--) { + node = node.getChild(input.charAt(i)); + if (node == null) return null; if (node.isTerminal()) return input.substring(i); @@ -86,29 +85,26 @@ public class SuffixStringMatcher extends * or <code>null<code> if no match exists. */ public String longestMatch(String input) { - TrieNode node= root; - String result= null; - for (int i= input.length() - 1; i >= 0; i--) { - node= node.getChild(input.charAt(i)); - if (node == null) + TrieNode node = root; + String result = null; + for (int i = input.length() - 1; i >= 0; i--) { + node = node.getChild(input.charAt(i)); + if (node == null) break; if (node.isTerminal()) - result= input.substring(i); + result = input.substring(i); } return result; } public static final void main(String[] argv) { - SuffixStringMatcher matcher= - new SuffixStringMatcher( - new String[] - {"a", "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar"} ); - - String[] tests= {"a", "ac", "abcd", "abcdefg", "apple", "aa", "aac", - "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", - "kite", }; + SuffixStringMatcher matcher = new SuffixStringMatcher(new String[] { "a", + "abcd", "bcd", "bcdefg", "defg", "aac", "baz", "foo", "foobar" }); + + String[] tests = { "a", "ac", "abcd", "abcdefg", "apple", "aa", "aac", + "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", }; - for (int i= 0; i < tests.length; i++) { + for (int i = 0; i < tests.length; i++) { System.out.println("testing: " + tests[i]); System.out.println(" matches: " + matcher.matches(tests[i])); System.out.println(" shortest: " + matcher.shortestMatch(tests[i])); Modified: nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/TimingUtil.java Thu Jan 29 05:38:59 2015 @@ -21,35 +21,39 @@ import java.text.NumberFormat; public class TimingUtil { - private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 }; + private static long[] TIME_FACTOR = { 60 * 60 * 1000, 60 * 1000, 1000 }; - /** - * Calculate the elapsed time between two times specified in milliseconds. - * @param start The start of the time period - * @param end The end of the time period - * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y minutes and Z seconds or null if start > end. - */ - public static String elapsedTime(long start, long end){ - if (start > end) { - return null; - } - - long[] elapsedTime = new long[TIME_FACTOR.length]; - - for (int i = 0; i < TIME_FACTOR.length; i++) { - elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i]; - start += TIME_FACTOR[i] * elapsedTime[i]; - } - - NumberFormat nf = NumberFormat.getInstance(); - nf.setMinimumIntegerDigits(2); - StringBuffer buf = new StringBuffer(); - for (int i = 0; i < elapsedTime.length; i++) { - if (i > 0) { - buf.append(":"); - } - buf.append(nf.format(elapsedTime[i])); - } - return buf.toString(); + /** + * Calculate the elapsed time between two times specified in milliseconds. + * + * @param start + * The start of the time period + * @param end + * The end of the time period + * @return a string of the form "XhYmZs" when the elapsed time is X hours, Y + * minutes and Z seconds or null if start > end. + */ + public static String elapsedTime(long start, long end) { + if (start > end) { + return null; } + + long[] elapsedTime = new long[TIME_FACTOR.length]; + + for (int i = 0; i < TIME_FACTOR.length; i++) { + elapsedTime[i] = start > end ? -1 : (end - start) / TIME_FACTOR[i]; + start += TIME_FACTOR[i] * elapsedTime[i]; + } + + NumberFormat nf = NumberFormat.getInstance(); + nf.setMinimumIntegerDigits(2); + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < elapsedTime.length; i++) { + if (i > 0) { + buf.append(":"); + } + buf.append(nf.format(elapsedTime[i])); + } + return buf.toString(); + } } Modified: nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/TrieStringMatcher.java Thu Jan 29 05:38:59 2015 @@ -17,21 +17,19 @@ package org.apache.nutch.util; - import java.util.Arrays; import java.util.LinkedList; import java.util.ListIterator; /** - * TrieStringMatcher is a base class for simple tree-based string - * matching. - * + * TrieStringMatcher is a base class for simple tree-based string matching. + * */ public abstract class TrieStringMatcher { protected TrieNode root; protected TrieStringMatcher() { - this.root= new TrieNode('\000', false); + this.root = new TrieNode('\000', false); } /** @@ -44,20 +42,19 @@ public abstract class TrieStringMatcher protected boolean terminal; /** - * Creates a new TrieNode, which contains the given - * <code>nodeChar</code>. If <code>isTerminal</code> is - * <code>true</code>, the new node is a <em>terminal</em> node in - * the trie. - */ + * Creates a new TrieNode, which contains the given <code>nodeChar</code>. + * If <code>isTerminal</code> is <code>true</code>, the new node is a + * <em>terminal</em> node in the trie. + */ TrieNode(char nodeChar, boolean isTerminal) { - this.nodeChar= nodeChar; - this.terminal= isTerminal; - this.childrenList= new LinkedList<TrieNode>(); + this.nodeChar = nodeChar; + this.terminal = isTerminal; + this.childrenList = new LinkedList<TrieNode>(); } /** - * Returns <code>true</code> if this node is a <em>terminal</em> - * node in the trie. + * Returns <code>true</code> if this node is a <em>terminal</em> node in the + * trie. */ boolean isTerminal() { return terminal; @@ -65,67 +62,68 @@ public abstract class TrieStringMatcher /** * Returns the child node of this node whose node-character is - * <code>nextChar</code>. If no such node exists, one will be is - * added. If <em>isTerminal</em> is <code>true</code>, the node - * will be a terminal node in the trie. + * <code>nextChar</code>. If no such node exists, one will be is added. If + * <em>isTerminal</em> is <code>true</code>, the node will be a terminal + * node in the trie. */ TrieNode getChildAddIfNotPresent(char nextChar, boolean isTerminal) { if (childrenList == null) { - childrenList= new LinkedList<TrieNode>(); + childrenList = new LinkedList<TrieNode>(); childrenList.addAll(Arrays.asList(children)); - children= null; + children = null; } if (childrenList.size() == 0) { - TrieNode newNode= new TrieNode(nextChar, isTerminal); + TrieNode newNode = new TrieNode(nextChar, isTerminal); childrenList.add(newNode); return newNode; } - ListIterator<TrieNode> iter= childrenList.listIterator(); - TrieNode node= iter.next(); - while ( (node.nodeChar < nextChar) && iter.hasNext() ) - node= iter.next(); - + ListIterator<TrieNode> iter = childrenList.listIterator(); + TrieNode node = iter.next(); + while ((node.nodeChar < nextChar) && iter.hasNext()) + node = iter.next(); + if (node.nodeChar == nextChar) { - node.terminal= node.terminal | isTerminal; + node.terminal = node.terminal | isTerminal; return node; } - if (node.nodeChar > nextChar) + if (node.nodeChar > nextChar) iter.previous(); - TrieNode newNode= new TrieNode(nextChar, isTerminal); + TrieNode newNode = new TrieNode(nextChar, isTerminal); iter.add(newNode); - return newNode; + return newNode; } /** * Returns the child node of this node whose node-character is - * <code>nextChar</code>. If no such node exists, - * <code>null</code> is returned. + * <code>nextChar</code>. If no such node exists, <code>null</code> is + * returned. */ TrieNode getChild(char nextChar) { if (children == null) { - children= childrenList.toArray(new TrieNode[childrenList.size()]); - childrenList= null; + children = childrenList.toArray(new TrieNode[childrenList.size()]); + childrenList = null; Arrays.sort(children); } - int min= 0; - int max= children.length - 1; - int mid= 0; + int min = 0; + int max = children.length - 1; + int mid = 0; while (min < max) { - mid= (min + max) / 2; - if (children[mid].nodeChar == nextChar) + mid = (min + max) / 2; + if (children[mid].nodeChar == nextChar) return children[mid]; if (children[mid].nodeChar < nextChar) - min= mid + 1; - else // if (children[mid].nodeChar > nextChar) - max= mid - 1; + min = mid + 1; + else + // if (children[mid].nodeChar > nextChar) + max = mid - 1; } - if (min == max) + if (min == max) if (children[min].nodeChar == nextChar) return children[min]; @@ -133,59 +131,57 @@ public abstract class TrieStringMatcher } public int compareTo(TrieNode other) { - if (this.nodeChar < other.nodeChar) + if (this.nodeChar < other.nodeChar) return -1; - if (this.nodeChar == other.nodeChar) + if (this.nodeChar == other.nodeChar) return 0; -// if (this.nodeChar > other.nodeChar) + // if (this.nodeChar > other.nodeChar) return 1; } } /** * Returns the next {@link TrieNode} visited, given that you are at - * <code>node</code>, and the the next character in the input is - * the <code>idx</code>'th character of <code>s</code>. + * <code>node</code>, and the the next character in the input is the + * <code>idx</code>'th character of <code>s</code>. */ protected final TrieNode matchChar(TrieNode node, String s, int idx) { return node.getChild(s.charAt(idx)); } /** - * Adds any necessary nodes to the trie so that the given - * <code>String</code> can be decoded and the last character is - * represented by a terminal node. Zero-length <code>Strings</code> - * are ignored. + * Adds any necessary nodes to the trie so that the given <code>String</code> + * can be decoded and the last character is represented by a terminal node. + * Zero-length <code>Strings</code> are ignored. */ protected final void addPatternForward(String s) { - TrieNode node= root; - int stop= s.length() - 1; + TrieNode node = root; + int stop = s.length() - 1; int i; if (s.length() > 0) { - for (i= 0; i < stop; i++) - node= node.getChildAddIfNotPresent(s.charAt(i), false); - node= node.getChildAddIfNotPresent(s.charAt(i), true); + for (i = 0; i < stop; i++) + node = node.getChildAddIfNotPresent(s.charAt(i), false); + node = node.getChildAddIfNotPresent(s.charAt(i), true); } } /** - * Adds any necessary nodes to the trie so that the given - * <code>String</code> can be decoded <em>in reverse</em> and the - * first character is represented by a terminal node. Zero-length - * <code>Strings</code> are ignored. + * Adds any necessary nodes to the trie so that the given <code>String</code> + * can be decoded <em>in reverse</em> and the first character is represented + * by a terminal node. Zero-length <code>Strings</code> are ignored. */ protected final void addPatternBackward(String s) { - TrieNode node= root; + TrieNode node = root; if (s.length() > 0) { - for (int i= s.length()-1; i > 0; i--) - node= node.getChildAddIfNotPresent(s.charAt(i), false); - node= node.getChildAddIfNotPresent(s.charAt(0), true); + for (int i = s.length() - 1; i > 0; i--) + node = node.getChildAddIfNotPresent(s.charAt(i), false); + node = node.getChildAddIfNotPresent(s.charAt(0), true); } } /** - * Returns true if the given <code>String</code> is matched by a - * pattern in the trie + * Returns true if the given <code>String</code> is matched by a pattern in + * the trie */ public abstract boolean matches(String input); Modified: nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/URLUtil.java Thu Jan 29 05:38:59 2015 @@ -26,17 +26,20 @@ import org.apache.nutch.util.domain.Doma /** Utility class for URL analysis */ public class URLUtil { - + /** - * Resolve relative URL-s and fix a java.net.URL error - * in handling of URLs with pure query targets. - * @param base base url - * @param target target url (may be relative) + * Resolve relative URL-s and fix a java.net.URL error in handling of URLs + * with pure query targets. + * + * @param base + * base url + * @param target + * target url (may be relative) * @return resolved absolute url. * @throws MalformedURLException */ public static URL resolveURL(URL base, String target) - throws MalformedURLException { + throws MalformedURLException { target = target.trim(); // handle the case that there is a target that is a pure query, @@ -58,9 +61,10 @@ public class URLUtil { } /** Handle the case in RFC3986 section 5.4.1 example 7, and similar. */ - static URL fixPureQueryTargets(URL base, String target) - throws MalformedURLException { - if (!target.startsWith("?")) return new URL(base, target); + static URL fixPureQueryTargets(URL base, String target) + throws MalformedURLException { + if (!target.startsWith("?")) + return new URL(base, target); String basePath = base.getPath(); String baseRightMost = ""; @@ -69,63 +73,75 @@ public class URLUtil { baseRightMost = basePath.substring(baseRightMostIdx + 1); } - if (target.startsWith("?")) target = baseRightMost + target; + if (target.startsWith("?")) + target = baseRightMost + target; return new URL(base, target); } - private static Pattern IP_PATTERN = Pattern.compile("(\\d{1,3}\\.){3}(\\d{1,3})"); + private static Pattern IP_PATTERN = Pattern + .compile("(\\d{1,3}\\.){3}(\\d{1,3})"); - /** Returns the domain name of the url. The domain name of a url is - * the substring of the url's hostname, w/o subdomain names. As an - * example <br><code> + /** + * Returns the domain name of the url. The domain name of a url is the + * substring of the url's hostname, w/o subdomain names. As an example <br> + * <code> * getDomainName(conf, new URL(http://lucene.apache.org/)) * </code><br> - * will return <br><code> apache.org</code> - * */ + * will return <br> + * <code> apache.org</code> + * */ public static String getDomainName(URL url) { DomainSuffixes tlds = DomainSuffixes.getInstance(); String host = url.getHost(); - //it seems that java returns hostnames ending with . - if(host.endsWith(".")) + // it seems that java returns hostnames ending with . + if (host.endsWith(".")) host = host.substring(0, host.length() - 1); - if(IP_PATTERN.matcher(host).matches()) + if (IP_PATTERN.matcher(host).matches()) return host; - + int index = 0; String candidate = host; - for(;index >= 0;) { + for (; index >= 0;) { index = candidate.indexOf('.'); - String subCandidate = candidate.substring(index+1); - if(tlds.isDomainSuffix(subCandidate)) { - return candidate; + String subCandidate = candidate.substring(index + 1); + if (tlds.isDomainSuffix(subCandidate)) { + return candidate; } candidate = subCandidate; } return candidate; } - /** Returns the domain name of the url. The domain name of a url is - * the substring of the url's hostname, w/o subdomain names. As an - * example <br><code> + /** + * Returns the domain name of the url. The domain name of a url is the + * substring of the url's hostname, w/o subdomain names. As an example <br> + * <code> * getDomainName(conf, new http://lucene.apache.org/) * </code><br> - * will return <br><code> apache.org</code> + * will return <br> + * <code> apache.org</code> + * * @throws MalformedURLException */ public static String getDomainName(String url) throws MalformedURLException { return getDomainName(new URL(url)); } - /** Returns the top level domain name of the url. The top level domain name - * of a url is the substring of the url's hostname, w/o subdomain names. - * As an example <br><code> + /** + * Returns the top level domain name of the url. The top level domain name of + * a url is the substring of the url's hostname, w/o subdomain names. As an + * example <br> + * <code> * getTopLevelDomainName(conf, new http://lucene.apache.org/) * </code><br> - * will return <br><code> org</code> + * will return <br> + * <code> org</code> + * * @throws MalformedURLException */ - public static String getTopLevelDomainName(URL url) throws MalformedURLException { + public static String getTopLevelDomainName(URL url) + throws MalformedURLException { String suffix = getDomainSuffix(url).toString(); int idx = suffix.lastIndexOf("."); if (idx != -1) { @@ -135,94 +151,110 @@ public class URLUtil { } } - /** Returns the top level domain name of the url. The top level domain name - * of a url is the substring of the url's hostname, w/o subdomain names. - * As an example <br><code> + /** + * Returns the top level domain name of the url. The top level domain name of + * a url is the substring of the url's hostname, w/o subdomain names. As an + * example <br> + * <code> * getTopLevelDomainName(conf, new http://lucene.apache.org/) * </code><br> - * will return <br><code> org</code> + * will return <br> + * <code> org</code> + * * @throws MalformedURLException */ - public static String getTopLevelDomainName(String url) throws MalformedURLException { + public static String getTopLevelDomainName(String url) + throws MalformedURLException { return getTopLevelDomainName(new URL(url)); } - /** Returns whether the given urls have the same domain name. - * As an example, <br> + /** + * Returns whether the given urls have the same domain name. As an example, <br> * <code> isSameDomain(new URL("http://lucene.apache.org") * , new URL("http://people.apache.org/")) * <br> will return true. </code> - * + * * @return true if the domain names are equal */ public static boolean isSameDomainName(URL url1, URL url2) { return getDomainName(url1).equalsIgnoreCase(getDomainName(url2)); } - /**Returns whether the given urls have the same domain name. - * As an example, <br> - * <code> isSameDomain("http://lucene.apache.org" - * ,"http://people.apache.org/") - * <br> will return true. </code> - * @return true if the domain names are equal - * @throws MalformedURLException - */ + /** + * Returns whether the given urls have the same domain name. As an example, <br> + * <code> isSameDomain("http://lucene.apache.org" + * ,"http://people.apache.org/") + * <br> will return true. </code> + * + * @return true if the domain names are equal + * @throws MalformedURLException + */ public static boolean isSameDomainName(String url1, String url2) - throws MalformedURLException { + throws MalformedURLException { return isSameDomainName(new URL(url1), new URL(url2)); } - /** Returns the {@link DomainSuffix} corresponding to the - * last public part of the hostname + /** + * Returns the {@link DomainSuffix} corresponding to the last public part of + * the hostname */ public static DomainSuffix getDomainSuffix(URL url) { DomainSuffixes tlds = DomainSuffixes.getInstance(); String host = url.getHost(); - if(IP_PATTERN.matcher(host).matches()) + if (IP_PATTERN.matcher(host).matches()) return null; - + int index = 0; String candidate = host; - for(;index >= 0;) { + for (; index >= 0;) { index = candidate.indexOf('.'); - String subCandidate = candidate.substring(index+1); + String subCandidate = candidate.substring(index + 1); DomainSuffix d = tlds.get(subCandidate); - if(d != null) { - return d; + if (d != null) { + return d; } candidate = subCandidate; } return null; } - /** Returns the {@link DomainSuffix} corresponding to the - * last public part of the hostname + /** + * Returns the {@link DomainSuffix} corresponding to the last public part of + * the hostname */ - public static DomainSuffix getDomainSuffix(String url) throws MalformedURLException { + public static DomainSuffix getDomainSuffix(String url) + throws MalformedURLException { return getDomainSuffix(new URL(url)); } - /** Partitions of the hostname of the url by "." */ + /** Partitions of the hostname of the url by "." */ public static String[] getHostSegments(URL url) { String host = url.getHost(); - //return whole hostname, if it is an ipv4 - //TODO : handle ipv6 - if(IP_PATTERN.matcher(host).matches()) - return new String[] {host}; + // return whole hostname, if it is an ipv4 + // TODO : handle ipv6 + if (IP_PATTERN.matcher(host).matches()) + return new String[] { host }; return host.split("\\."); } - /** Partitions of the hostname of the url by "." - * @throws MalformedURLException */ - public static String[] getHostSegments(String url) throws MalformedURLException { - return getHostSegments(new URL(url)); + /** + * Partitions of the hostname of the url by "." + * + * @throws MalformedURLException + */ + public static String[] getHostSegments(String url) + throws MalformedURLException { + return getHostSegments(new URL(url)); } /** - * <p>Given two urls, a src and a destination of a redirect, it returns the - * representative url.<p> + * <p> + * Given two urls, a src and a destination of a redirect, it returns the + * representative url. + * <p> * - * <p>This method implements an extended version of the algorithm used by the + * <p> + * This method implements an extended version of the algorithm used by the * Yahoo! Slurp crawler described here:<br> * <a href= * "http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How @@ -230,46 +262,63 @@ public class URLUtil { * <br> * <ol> * <li>Choose target url if either url is malformed.</li> - * <li>If different domains the keep the destination whether or not the + * <li>If different domains the keep the destination whether or not the * redirect is temp or perm</li> - * <ul><li>a.com -> b.com*</li></ul> + * <ul> + * <li>a.com -> b.com*</li> + * </ul> * <li>If the redirect is permanent and the source is root, keep the source.</li> - * <ul><li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li></ul> - * <li>If the redirect is permanent and the source is not root and the + * <ul> + * <li>*a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html</li> + * </ul> + * <li>If the redirect is permanent and the source is not root and the * destination is root, keep the destination</li> - * <ul><li>a.com/xyz/index.html -> a.com*</li></ul> + * <ul> + * <li>a.com/xyz/index.html -> a.com*</li> + * </ul> * <li>If the redirect is permanent and neither the source nor the destination * is root, then keep the destination</li> - * <ul><li>a.com/xyz/index.html -> a.com/abc/page.html*</li></ul> + * <ul> + * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li> + * </ul> * <li>If the redirect is temporary and source is root and destination is not * root, then keep the source</li> - * <ul><li>*a.com -> a.com/xyz/index.html</li></ul> + * <ul> + * <li>*a.com -> a.com/xyz/index.html</li> + * </ul> * <li>If the redirect is temporary and source is not root and destination is * root, then keep the destination</li> - * <ul><li>a.com/xyz/index.html -> a.com*</li></ul> + * <ul> + * <li>a.com/xyz/index.html -> a.com*</li> + * </ul> * <li>If the redirect is temporary and neither the source or the destination - * is root, then keep the shortest url. First check for the shortest host, - * and if both are equal then check by path. Path is first by length then by - * the number of / path separators.</li> + * is root, then keep the shortest url. First check for the shortest host, and + * if both are equal then check by path. Path is first by length then by the + * number of / path separators.</li> * <ul> * <li>a.com/xyz/index.html -> a.com/abc/page.html*</li> * <li>*www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html</li> * </ul> * <li>If the redirect is temporary and both the source and the destination * are root, then keep the shortest sub-domain</li> - * <ul><li>*www.a.com -> www.news.a.com</li></ul> + * <ul> + * <li>*www.a.com -> www.news.a.com</li> + * </ul> * <br> - * While not in this logic there is a further piece of representative url - * logic that occurs during indexing and after scoring. During creation of - * the basic fields before indexing, if a url has a representative url stored - * we check both the url and its representative url (which should never be - * the same) against their linkrank scores and the highest scoring one is - * kept as the url and the lower scoring one is held as the orig url inside - * of the index. - * - * @param src The source url. - * @param dst The destination url. - * @param temp Is the redirect a temporary redirect. + * While not in this logic there is a further piece of representative url + * logic that occurs during indexing and after scoring. During creation of the + * basic fields before indexing, if a url has a representative url stored we + * check both the url and its representative url (which should never be the + * same) against their linkrank scores and the highest scoring one is kept as + * the url and the lower scoring one is held as the orig url inside of the + * index. + * + * @param src + * The source url. + * @param dst + * The destination url. + * @param temp + * Is the redirect a temporary redirect. * * @return String The representative url. */ @@ -281,8 +330,7 @@ public class URLUtil { try { srcUrl = new URL(src); dstUrl = new URL(dst); - } - catch (MalformedURLException e) { + } catch (MalformedURLException e) { return dst; } @@ -300,27 +348,27 @@ public class URLUtil { // 1) different domain them keep dest, temp or perm // a.com -> b.com* - // + // // 2) permanent and root, keep src // *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html - // + // // 3) permanent and not root and dest root, keep dest // a.com/xyz/index.html -> a.com* - // + // // 4) permanent and neither root keep dest // a.com/xyz/index.html -> a.com/abc/page.html* - // + // // 5) temp and root and dest not root keep src // *a.com -> a.com/xyz/index.html - // + // // 7) temp and not root and dest root keep dest // a.com/xyz/index.html -> a.com* - // + // // 8) temp and neither root, keep shortest, if hosts equal by path else by // hosts. paths are first by length then by number of / separators // a.com/xyz/index.html -> a.com/abc/page.html* // *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html - // + // // 9) temp and both root keep shortest sub domain // *www.a.com -> www.news.a.com @@ -332,39 +380,33 @@ public class URLUtil { // if it is a permanent redirect if (!temp) { - + // if source is root return source, otherwise destination if (srcRoot) { return src; - } - else { + } else { return dst; } - } - else { // temporary redirect + } else { // temporary redirect // source root and destination not root if (srcRoot && !destRoot) { return src; - } - else if (!srcRoot && destRoot) { // destination root and source not + } else if (!srcRoot && destRoot) { // destination root and source not return dst; - } - else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) { + } else if (!srcRoot && !destRoot && (srcHost.equals(dstHost))) { // source and destination hosts are the same, check paths, host length int numSrcPaths = srcFile.split("/").length; int numDstPaths = dstFile.split("/").length; if (numSrcPaths != numDstPaths) { return (numDstPaths < numSrcPaths ? dst : src); - } - else { + } else { int srcPathLength = srcFile.length(); int dstPathLength = dstFile.length(); return (dstPathLength < srcPathLength ? dst : src); } - } - else { + } else { // different host names and both root take the shortest int numSrcSubs = srcHost.split("\\.").length; @@ -378,24 +420,25 @@ public class URLUtil { * Returns the lowercased hostname for the url or null if the url is not well * formed. * - * @param url The url to check. + * @param url + * The url to check. * @return String The hostname for the url. */ public static String getHost(String url) { try { return new URL(url).getHost().toLowerCase(); - } - catch (MalformedURLException e) { + } catch (MalformedURLException e) { return null; } } /** - * Returns the page for the url. The page consists of the protocol, host, - * and path, but does not include the query string. The host is lowercased - * but the path is not. + * Returns the page for the url. The page consists of the protocol, host, and + * path, but does not include the query string. The host is lowercased but the + * path is not. * - * @param url The url to check. + * @param url + * The url to check. * @return String The page for the url. */ public static String getPage(String url) { @@ -404,12 +447,11 @@ public class URLUtil { url = url.toLowerCase(); String queryStr = new URL(url).getQuery(); return (queryStr != null) ? url.replace("?" + queryStr, "") : url; - } - catch (MalformedURLException e) { + } catch (MalformedURLException e) { return null; } } - + public static String getProtocol(String url) { try { return getProtocol(new URL(url)); @@ -417,7 +459,7 @@ public class URLUtil { return null; } } - + public static String getProtocol(URL url) { return url.getProtocol(); } @@ -431,17 +473,11 @@ public class URLUtil { // also do not add additional slashes for file: URLs (NUTCH-1880) return url; } - URI p = new URI(u.getProtocol(), - u.getUserInfo(), - IDN.toASCII(host), - u.getPort(), - u.getPath(), - u.getQuery(), - u.getRef()); + URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host), + u.getPort(), u.getPath(), u.getQuery(), u.getRef()); return p.toString(); - } - catch (Exception e) { + } catch (Exception e) { return null; } } @@ -474,26 +510,23 @@ public class URLUtil { } return sb.toString(); - } - catch (Exception e) { + } catch (Exception e) { return null; } } - /** For testing */ - public static void main(String[] args){ - - if(args.length!=1) { + public static void main(String[] args) { + + if (args.length != 1) { System.err.println("Usage : URLUtil <url>"); - return ; + return; } - + String url = args[0]; try { System.out.println(URLUtil.getDomainName(new URL(url))); - } - catch (MalformedURLException ex) { + } catch (MalformedURLException ex) { ex.printStackTrace(); } } Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java (original) +++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainStatistics.java Thu Jan 29 05:38:59 2015 @@ -48,12 +48,15 @@ import org.apache.nutch.util.URLUtil; */ public class DomainStatistics extends Configured implements Tool { - private static final Logger LOG = LoggerFactory.getLogger(DomainStatistics.class); + private static final Logger LOG = LoggerFactory + .getLogger(DomainStatistics.class); private static final Text FETCHED_TEXT = new Text("FETCHED"); private static final Text NOT_FETCHED_TEXT = new Text("NOT_FETCHED"); - public static enum MyCounter {FETCHED, NOT_FETCHED, EMPTY_RESULT}; + public static enum MyCounter { + FETCHED, NOT_FETCHED, EMPTY_RESULT + }; private static final int MODE_HOST = 1; private static final int MODE_DOMAIN = 2; @@ -64,7 +67,8 @@ public class DomainStatistics extends Co public int run(String[] args) throws Exception { if (args.length < 3) { - System.out.println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]"); + System.out + .println("usage: DomainStatistics inputDirs outDir host|domain|suffix|tld [numOfReducer]"); return 1; } String inputDir = args[0]; @@ -81,16 +85,16 @@ public class DomainStatistics extends Co int mode = 0; String jobName = "DomainStatistics"; - if(args[2].equals("host")) { + if (args[2].equals("host")) { jobName = "Host statistics"; mode = MODE_HOST; - } else if(args[2].equals("domain")) { - jobName = "Domain statistics"; + } else if (args[2].equals("domain")) { + jobName = "Domain statistics"; mode = MODE_DOMAIN; - } else if(args[2].equals("suffix")) { + } else if (args[2].equals("suffix")) { jobName = "Suffix statistics"; mode = MODE_SUFFIX; - } else if(args[2].equals("tld")) { + } else if (args[2].equals("tld")) { jobName = "TLD statistics"; mode = MODE_TLD; } @@ -128,59 +132,65 @@ public class DomainStatistics extends Co } long end = System.currentTimeMillis(); - LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("DomainStatistics: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); return 0; } - static class DomainStatisticsMapper extends Mapper<Text, CrawlDatum, Text, LongWritable> { + static class DomainStatisticsMapper extends + Mapper<Text, CrawlDatum, Text, LongWritable> { int mode = 0; public void setup(Context context) { - mode = context.getConfiguration().getInt("domain.statistics.mode", MODE_DOMAIN); + mode = context.getConfiguration().getInt("domain.statistics.mode", + MODE_DOMAIN); } - public void map(Text urlText, CrawlDatum datum, Context context) throws IOException, InterruptedException { + public void map(Text urlText, CrawlDatum datum, Context context) + throws IOException, InterruptedException { - if(datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED + if (datum.getStatus() == CrawlDatum.STATUS_DB_FETCHED || datum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) { try { URL url = new URL(urlText.toString()); String out = null; switch (mode) { - case MODE_HOST: - out = url.getHost(); - break; - case MODE_DOMAIN: - out = URLUtil.getDomainName(url); - break; - case MODE_SUFFIX: - out = URLUtil.getDomainSuffix(url).getDomain(); - break; - case MODE_TLD: - out = URLUtil.getTopLevelDomainName(url); - break; + case MODE_HOST: + out = url.getHost(); + break; + case MODE_DOMAIN: + out = URLUtil.getDomainName(url); + break; + case MODE_SUFFIX: + out = URLUtil.getDomainSuffix(url).getDomain(); + break; + case MODE_TLD: + out = URLUtil.getTopLevelDomainName(url); + break; } - if(out.trim().equals("")) { + if (out.trim().equals("")) { LOG.info("url : " + url); context.getCounter(MyCounter.EMPTY_RESULT).increment(1); } context.write(new Text(out), new LongWritable(1)); - } catch (Exception ex) { } + } catch (Exception ex) { + } context.getCounter(MyCounter.FETCHED).increment(1); context.write(FETCHED_TEXT, new LongWritable(1)); - } - else { + } else { context.getCounter(MyCounter.NOT_FETCHED).increment(1); context.write(NOT_FETCHED_TEXT, new LongWritable(1)); } } } - static class DomainStatisticsReducer extends Reducer <Text, LongWritable, LongWritable, Text> { - public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { + static class DomainStatisticsReducer extends + Reducer<Text, LongWritable, LongWritable, Text> { + public void reduce(Text key, Iterable<LongWritable> values, Context context) + throws IOException, InterruptedException { long total = 0; for (LongWritable val : values) { @@ -191,8 +201,10 @@ public class DomainStatistics extends Co } } - public static class DomainStatisticsCombiner extends Reducer <Text, LongWritable, Text, LongWritable> { - public void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { + public static class DomainStatisticsCombiner extends + Reducer<Text, LongWritable, Text, LongWritable> { + public void reduce(Text key, Iterable<LongWritable> values, Context context) + throws IOException, InterruptedException { long total = 0; for (LongWritable val : values) {