sr...

lewismc Thu, 08 Jan 2015 22:35:11 -0800

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java 
Fri Jan  9 06:34:33 2015
@@ -1,6 +1,5 @@
 package org.apache.nutch.indexer.solr;
 
-
 import org.apache.http.impl.client.DefaultHttpClient;
 import org.apache.http.auth.AuthScope;
 import org.apache.http.auth.UsernamePasswordCredentials;
@@ -18,7 +17,8 @@ public class SolrUtils {
 
   public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class);
 
-  public static HttpSolrServer getHttpSolrServer(Configuration job) throws 
MalformedURLException {
+  public static HttpSolrServer getHttpSolrServer(Configuration job)
+      throws MalformedURLException {
     DefaultHttpClient client = new DefaultHttpClient();
 
     // Check for username/password
@@ -27,10 +27,13 @@ public class SolrUtils {
 
       LOG.info("Authenticating as: " + username);
 
-      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, 
AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
+      AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT,
+          AuthScope.ANY_REALM, AuthScope.ANY_SCHEME);
 
-      client.getCredentialsProvider().setCredentials(scope, 
-          new UsernamePasswordCredentials(username, 
job.get(SolrConstants.PASSWORD)));
+      client.getCredentialsProvider().setCredentials(
+          scope,
+          new UsernamePasswordCredentials(username, job
+              .get(SolrConstants.PASSWORD)));
 
       HttpParams params = client.getParams();
       HttpClientParams.setAuthenticating(params, true);
@@ -48,12 +51,14 @@ public class SolrUtils {
     for (int i = 0; i < input.length(); i++) {
       ch = input.charAt(i);
 
-      // Strip all non-characters 
http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
-      // and non-printable control characters except tabulator, new line and 
carriage return
+      // Strip all non-characters
+      // 
http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
+      // and non-printable control characters except tabulator, new line and
+      // carriage return
       if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000
-              ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
-              (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
-              (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
+          ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range
+          (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef
+          (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) {
 
         retval.append(ch);
       }


Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java 
Fri Jan  9 06:34:33 2015
@@ -16,21 +16,20 @@
  */
 package org.apache.nutch.metadata;
 
-
 /**
  * A collection of Creative Commons properties names.
- *
+ * 
  * @see <a href="http://www.creativecommons.org/";>creativecommons.org</a>
- *
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
  */
 public interface CreativeCommons {
-  
+
   public final static String LICENSE_URL = "License-Url";
-  
+
   public final static String LICENSE_LOCATION = "License-Location";
-  
+
   public final static String WORK_TYPE = "Work-Type";
-  
+
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java Fri 
Jan  9 06:34:33 2015
@@ -16,149 +16,146 @@
  */
 package org.apache.nutch.metadata;
 
-
 /**
  * A collection of Dublin Core metadata names.
- *
- * @see <a href="http://dublincore.org";>dublincore.org</a> 
- *
+ * 
+ * @see <a href="http://dublincore.org";>dublincore.org</a>
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
  */
 public interface DublinCore {
-  
-    
+
   /**
-   * Typically, Format may include the media-type or dimensions of the
-   * resource. Format may be used to determine the software, hardware or other
-   * equipment needed to display or operate the resource. Examples of
-   * dimensions include size and duration. Recommended best practice is to
-   * select a value from a controlled vocabulary (for example, the list of
-   * Internet Media Types [MIME] defining computer media formats).
+   * Typically, Format may include the media-type or dimensions of the 
resource.
+   * Format may be used to determine the software, hardware or other equipment
+   * needed to display or operate the resource. Examples of dimensions include
+   * size and duration. Recommended best practice is to select a value from a
+   * controlled vocabulary (for example, the list of Internet Media Types 
[MIME]
+   * defining computer media formats).
    */
   public static final String FORMAT = "format";
-  
+
   /**
-   * Recommended best practice is to identify the resource by means of a
-   * string or number conforming to a formal identification system. Example
-   * formal identification systems include the Uniform Resource Identifier
-   * (URI) (including the Uniform Resource Locator (URL)), the Digital Object
+   * Recommended best practice is to identify the resource by means of a string
+   * or number conforming to a formal identification system. Example formal
+   * identification systems include the Uniform Resource Identifier (URI)
+   * (including the Uniform Resource Locator (URL)), the Digital Object
    * Identifier (DOI) and the International Standard Book Number (ISBN).
    */
   public static final String IDENTIFIER = "identifier";
-  
+
   /**
    * Date on which the resource was changed.
    */
   public static final String MODIFIED = "modified";
-  
+
   /**
    * An entity responsible for making contributions to the content of the
-   * resource. Examples of a Contributor include a person, an organisation, or
-   * a service. Typically, the name of a Contributor should be used to
-   * indicate the entity.
+   * resource. Examples of a Contributor include a person, an organisation, or 
a
+   * service. Typically, the name of a Contributor should be used to indicate
+   * the entity.
    */
   public static final String CONTRIBUTOR = "contributor";
-  
+
   /**
-   * The extent or scope of the content of the resource. Coverage will
-   * typically include spatial location (a place name or geographic
-   * coordinates), temporal period (a period label, date, or date range) or
-   * jurisdiction (such as a named administrative entity). Recommended best
-   * practice is to select a value from a controlled vocabulary (for example,
-   * the Thesaurus of Geographic Names [TGN]) and that, where appropriate,
-   * named places or time periods be used in preference to numeric identifiers
-   * such as sets of coordinates or date ranges.
+   * The extent or scope of the content of the resource. Coverage will 
typically
+   * include spatial location (a place name or geographic coordinates), 
temporal
+   * period (a period label, date, or date range) or jurisdiction (such as a
+   * named administrative entity). Recommended best practice is to select a
+   * value from a controlled vocabulary (for example, the Thesaurus of
+   * Geographic Names [TGN]) and that, where appropriate, named places or time
+   * periods be used in preference to numeric identifiers such as sets of
+   * coordinates or date ranges.
    */
   public static final String COVERAGE = "coverage";
-  
+
   /**
    * An entity primarily responsible for making the content of the resource.
    * Examples of a Creator include a person, an organisation, or a service.
    * Typically, the name of a Creator should be used to indicate the entity.
    */
   public static final String CREATOR = "creator";
-  
+
   /**
    * A date associated with an event in the life cycle of the resource.
-   * Typically, Date will be associated with the creation or availability of
-   * the resource. Recommended best practice for encoding the date value is
-   * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
-   * format.
+   * Typically, Date will be associated with the creation or availability of 
the
+   * resource. Recommended best practice for encoding the date value is defined
+   * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format.
    */
   public static final String DATE = "date";
-  
+
   /**
    * An account of the content of the resource. Description may include but is
    * not limited to: an abstract, table of contents, reference to a graphical
    * representation of content or a free-text account of the content.
    */
   public static final String DESCRIPTION = "description";
-  
+
   /**
    * A language of the intellectual content of the resource. Recommended best
    * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639
-   * [ISO639], defines two- and three-letter primary language tags with
-   * optional subtags. Examples include "en" or "eng" for English, "akk" for
-   * Akkadian, and "en-GB" for English used in the United Kingdom.
+   * [ISO639], defines two- and three-letter primary language tags with 
optional
+   * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian,
+   * and "en-GB" for English used in the United Kingdom.
    */
   public static final String LANGUAGE = "language";
-  
+
   /**
    * An entity responsible for making the resource available. Examples of a
    * Publisher include a person, an organisation, or a service. Typically, the
    * name of a Publisher should be used to indicate the entity.
    */
   public static final String PUBLISHER = "publisher";
-  
+
   /**
    * A reference to a related resource. Recommended best practice is to
    * reference the resource by means of a string or number conforming to a
    * formal identification system.
    */
   public static final String RELATION = "relation";
-  
+
   /**
-   * Information about rights held in and over the resource. Typically, a
-   * Rights element will contain a rights management statement for the
-   * resource, or reference a service providing such information. Rights
-   * information often encompasses Intellectual Property Rights (IPR),
-   * Copyright, and various Property Rights. If the Rights element is absent,
-   * no assumptions can be made about the status of these and other rights
-   * with respect to the resource.
+   * Information about rights held in and over the resource. Typically, a 
Rights
+   * element will contain a rights management statement for the resource, or
+   * reference a service providing such information. Rights information often
+   * encompasses Intellectual Property Rights (IPR), Copyright, and various
+   * Property Rights. If the Rights element is absent, no assumptions can be
+   * made about the status of these and other rights with respect to the
+   * resource.
    */
   public static final String RIGHTS = "rights";
-  
+
   /**
    * A reference to a resource from which the present resource is derived. The
    * present resource may be derived from the Source resource in whole or in
-   * part. Recommended best practice is to reference the resource by means of
-   * a string or number conforming to a formal identification system.
+   * part. Recommended best practice is to reference the resource by means of a
+   * string or number conforming to a formal identification system.
    */
   public static final String SOURCE = "source";
-  
+
   /**
    * The topic of the content of the resource. Typically, a Subject will be
-   * expressed as keywords, key phrases or classification codes that describe
-   * a topic of the resource. Recommended best practice is to select a value
-   * from a controlled vocabulary or formal classification scheme.
+   * expressed as keywords, key phrases or classification codes that describe a
+   * topic of the resource. Recommended best practice is to select a value from
+   * a controlled vocabulary or formal classification scheme.
    */
   public static final String SUBJECT = "subject";
-  
+
   /**
    * A name given to the resource. Typically, a Title will be a name by which
    * the resource is formally known.
    */
   public static final String TITLE = "title";
-  
+
   /**
    * The nature or genre of the content of the resource. Type includes terms
-   * describing general categories, functions, genres, or aggregation levels
-   * for content. Recommended best practice is to select a value from a
-   * controlled vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]).
-   * To describe the physical or digital manifestation of the resource, use
-   * the Format element.
+   * describing general categories, functions, genres, or aggregation levels 
for
+   * content. Recommended best practice is to select a value from a controlled
+   * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe
+   * the physical or digital manifestation of the resource, use the Format
+   * element.
    */
   public static final String TYPE = "type";
-  
+
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java Fri 
Jan  9 06:34:33 2015
@@ -16,14 +16,12 @@
  */
 package org.apache.nutch.metadata;
 
-
-
 /**
  * A collection of HTTP header names.
- *
- * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/";>Hypertext Transfer
- *      Protocol -- HTTP/1.1 (RFC 2616)</a>
- *
+ * 
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/";>Hypertext Transfer 
Protocol
+ *      -- HTTP/1.1 (RFC 2616)</a>
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
  */

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java Fri 
Jan  9 06:34:33 2015
@@ -28,28 +28,29 @@ import org.apache.nutch.crawl.NutchWrita
 /**
  * This is a simple decorator that adds metadata to any Writable-s that can be
  * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
- * temporarily enriched during processing, but this
- * temporary metadata doesn't need to be permanently stored after the job is 
done.
+ * temporarily enriched during processing, but this temporary metadata doesn't
+ * need to be permanently stored after the job is done.
  * 
  * @author Andrzej Bialecki
  */
 public class MetaWrapper extends NutchWritable {
   private Metadata metadata;
-  
+
   public MetaWrapper() {
     super();
     metadata = new Metadata();
   }
-  
+
   public MetaWrapper(Writable instance, Configuration conf) {
     super(instance);
     metadata = new Metadata();
     setConf(conf);
   }
-  
+
   public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) 
{
     super(instance);
-    if (metadata == null) metadata = new Metadata();
+    if (metadata == null)
+      metadata = new Metadata();
     this.metadata = metadata;
     setConf(conf);
   }
@@ -60,43 +61,52 @@ public class MetaWrapper extends NutchWr
   public Metadata getMetadata() {
     return metadata;
   }
-  
+
   /**
-   * Add metadata. See {@link Metadata#add(String, String)} for more 
information.
-   * @param name metadata name
-   * @param value metadata value
+   * Add metadata. See {@link Metadata#add(String, String)} for more
+   * information.
+   * 
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
    */
   public void addMeta(String name, String value) {
     metadata.add(name, value);
   }
-  
+
   /**
-   * Set metadata. See {@link Metadata#set(String, String)} for more 
information.
+   * Set metadata. See {@link Metadata#set(String, String)} for more
+   * information.
+   * 
    * @param name
    * @param value
    */
   public void setMeta(String name, String value) {
     metadata.set(name, value);
   }
-  
+
   /**
    * Get metadata. See {@link Metadata#get(String)} for more information.
+   * 
    * @param name
    * @return metadata value
    */
   public String getMeta(String name) {
     return metadata.get(name);
   }
-  
+
   /**
-   * Get multiple metadata. See {@link Metadata#getValues(String)} for more 
information.
+   * Get multiple metadata. See {@link Metadata#getValues(String)} for more
+   * information.
+   * 
    * @param name
    * @return multiple values
    */
   public String[] getMetaValues(String name) {
     return metadata.getValues(name);
   }
-  
+
   public void readFields(DataInput in) throws IOException {
     super.readFields(in);
     metadata = new Metadata();

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java Fri Jan 
 9 06:34:33 2015
@@ -27,23 +27,21 @@ import java.util.Properties;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 
-
 /**
  * A multi-valued metadata container.
- *
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
- *
+ * 
  */
-public class Metadata implements Writable, CreativeCommons,
-DublinCore, HttpHeaders, Nutch, Feed {
+public class Metadata implements Writable, CreativeCommons, DublinCore,
+    HttpHeaders, Nutch, Feed {
 
   /**
    * A map of all metadata attributes.
    */
   private Map<String, String[]> metadata = null;
 
-
   /**
    * Constructs a new, empty metadata.
    */
@@ -53,9 +51,10 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Returns true if named value is multivalued.
-   * @param name name of metadata
-   * @return true is named value is multivalued, false if single
-   * value or null
+   * 
+   * @param name
+   *          name of metadata
+   * @return true is named value is multivalued, false if single value or null
    */
   public boolean isMultiValued(final String name) {
     return metadata.get(name) != null && metadata.get(name).length > 1;
@@ -63,6 +62,7 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Returns an array of the names contained in the metadata.
+   * 
    * @return Metadata names
    */
   public String[] names() {
@@ -70,11 +70,11 @@ DublinCore, HttpHeaders, Nutch, Feed {
   }
 
   /**
-   * Get the value associated to a metadata name.
-   * If many values are assiociated to the specified name, then the first
-   * one is returned.
-   *
-   * @param name of the metadata.
+   * Get the value associated to a metadata name. If many values are 
assiociated
+   * to the specified name, then the first one is returned.
+   * 
+   * @param name
+   *          of the metadata.
    * @return the value associated to the specified metadata name.
    */
   public String get(final String name) {
@@ -88,13 +88,15 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Get the values associated to a metadata name.
-   * @param name of the metadata.
+   * 
+   * @param name
+   *          of the metadata.
    * @return the values associated to a metadata name.
    */
   public String[] getValues(final String name) {
     return _getValues(name);
   }
-  
+
   private String[] _getValues(final String name) {
     String[] values = metadata.get(name);
     if (values == null) {
@@ -104,12 +106,13 @@ DublinCore, HttpHeaders, Nutch, Feed {
   }
 
   /**
-   * Add a metadata name/value mapping.
-   * Add the specified value to the list of values associated to the
-   * specified metadata name.
-   *
-   * @param name the metadata name.
-   * @param value the metadata value.
+   * Add a metadata name/value mapping. Add the specified value to the list of
+   * values associated to the specified metadata name.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
    */
   public void add(final String name, final String value) {
     String[] values = metadata.get(name);
@@ -125,31 +128,37 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Copy All key-value pairs from properties.
-   * @param properties properties to copy from
+   * 
+   * @param properties
+   *          properties to copy from
    */
   public void setAll(Properties properties) {
     Enumeration<?> names = properties.propertyNames();
     while (names.hasMoreElements()) {
       String name = (String) names.nextElement();
-      metadata.put(name, new String[]{properties.getProperty(name)});
+      metadata.put(name, new String[] { properties.getProperty(name) });
     }
   }
 
   /**
-   * Set metadata name/value.
-   * Associate the specified value to the specified metadata name. If some
-   * previous values were associated to this name, they are removed.
-   *
-   * @param name the metadata name.
-   * @param value the metadata value.
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
    */
   public void set(String name, String value) {
-    metadata.put(name, new String[]{value});
+    metadata.put(name, new String[] { value });
   }
 
   /**
    * Remove a metadata and all its associated values.
-   * @param name metadata name to remove
+   * 
+   * @param name
+   *          metadata name to remove
    */
   public void remove(String name) {
     metadata.remove(name);
@@ -157,12 +166,13 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Returns the number of metadata names in this metadata.
+   * 
    * @return number of metadata names
    */
   public int size() {
     return metadata.size();
   }
-  
+
   /** Remove all mappings from metadata. */
   public void clear() {
     metadata.clear();
@@ -170,7 +180,9 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   public boolean equals(Object o) {
 
-    if (o == null) { return false; }
+    if (o == null) {
+      return false;
+    }
 
     Metadata other = null;
     try {
@@ -179,7 +191,9 @@ DublinCore, HttpHeaders, Nutch, Feed {
       return false;
     }
 
-    if (other.size() != size()) { return false; }
+    if (other.size() != size()) {
+      return false;
+    }
 
     String[] names = names();
     for (int i = 0; i < names.length; i++) {
@@ -203,10 +217,7 @@ DublinCore, HttpHeaders, Nutch, Feed {
     for (int i = 0; i < names.length; i++) {
       String[] values = _getValues(names[i]);
       for (int j = 0; j < values.length; j++) {
-        buf.append(names[i])
-           .append("=")
-           .append(values[j])
-           .append(" ");
+        buf.append(names[i]).append("=").append(values[j]).append(" ");
       }
     }
     return buf.toString();

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java Fri Jan  9 
06:34:33 2015
@@ -19,20 +19,17 @@ package org.apache.nutch.metadata;
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.io.Text;
 
-
 /**
  * A collection of Nutch internal metadata constants.
- *
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
  */
 public interface Nutch {
 
-  public static final String ORIGINAL_CHAR_ENCODING =
-          "OriginalCharEncoding";
+  public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
 
-  public static final String CHAR_ENCODING_FOR_CONVERSION =
-          "CharEncodingForConversion";
+  public static final String CHAR_ENCODING_FOR_CONVERSION = 
"CharEncodingForConversion";
 
   public static final String SIGNATURE_KEY = "nutch.content.digest";
 
@@ -42,20 +39,26 @@ public interface Nutch {
 
   public static final String GENERATE_TIME_KEY = "_ngt_";
 
-  public static final Text WRITABLE_GENERATE_TIME_KEY = new 
Text(GENERATE_TIME_KEY);
+  public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+      GENERATE_TIME_KEY);
 
   public static final String PROTO_STATUS_KEY = "_pst_";
 
-  public static final Text WRITABLE_PROTO_STATUS_KEY = new 
Text(PROTO_STATUS_KEY);
+  public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+      PROTO_STATUS_KEY);
 
   public static final String FETCH_TIME_KEY = "_ftk_";
 
   public static final String FETCH_STATUS_KEY = "_fst_";
 
-  /** Sites may request that search engines don't provide access to cached 
documents. */
+  /**
+   * Sites may request that search engines don't provide access to cached
+   * documents.
+   */
   public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
 
-  public static final Utf8 CACHING_FORBIDDEN_KEY_UTF8 = new 
Utf8(CACHING_FORBIDDEN_KEY);
+  public static final Utf8 CACHING_FORBIDDEN_KEY_UTF8 = new Utf8(
+      CACHING_FORBIDDEN_KEY);
 
   /** Show both original forbidden content and summaries (default). */
   public static final String CACHING_FORBIDDEN_NONE = "none";
@@ -75,8 +78,7 @@ public interface Nutch {
   public static final Utf8 ALL_CRAWL_ID = new Utf8(ALL_BATCH_ID_STR);
 
   public static final String CRAWL_ID_KEY = "storage.crawl.id";
-  
-  
+
   // short constants for cmd-line args
   /** Batch id to select. */
   public static final String ARG_BATCH = "batch";
@@ -110,7 +112,7 @@ public interface Nutch {
   public static final String ARG_CLASS = "class";
   /** Depth (number of cycles) of a crawl. */
   public static final String ARG_DEPTH = "depth";
-  
+
   // short constants for status / results fields
   /** Status / result message. */
   public static final String STAT_MESSAGE = "msg";

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java 
Fri Jan  9 06:34:33 2015
@@ -33,7 +33,7 @@ public class SpellCheckedMetadata extend
 
   /**
    * Treshold divider.
-   *
+   * 
    * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
    */
   private static final int TRESHOLD_DIVIDER = 3;
@@ -52,7 +52,7 @@ public class SpellCheckedMetadata extend
 
     // Uses following array to fill the metanames index and the
     // metanames list.
-    Class<?>[] spellthese = {HttpHeaders.class};
+    Class<?>[] spellthese = { HttpHeaders.class };
 
     for (Class<?> spellCheckedNames : spellthese) {
       for (Field field : spellCheckedNames.getFields()) {
@@ -73,7 +73,7 @@ public class SpellCheckedMetadata extend
 
   /**
    * Normalizes String.
-   *
+   * 
    * @param str
    *          the string to normalize
    * @return normalized String
@@ -102,7 +102,7 @@ public class SpellCheckedMetadata extend
    * </ul>
    * If no matching with a well-known metadata name is found, then the original
    * name is returned.
-   *
+   * 
    * @param name
    *          Name to normalize
    * @return normalized name

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java Fri Jan  9 
06:34:33 2015
@@ -23,17 +23,18 @@ import org.apache.hadoop.conf.Configurab
 // Nutch imports
 import org.apache.nutch.plugin.Pluggable;
 
-
 /**
- * Interface used to limit which URLs enter Nutch.
- * Used by the injector and the db updater.
+ * Interface used to limit which URLs enter Nutch. Used by the injector and the
+ * db updater.
  */
 
 public interface URLFilter extends Pluggable, Configurable {
   /** The name of the extension point. */
   public final static String X_POINT_ID = URLFilter.class.getName();
 
-  /* Interface for a filter that transforms a URL: it can pass the
-     original URL through or "delete" the URL by returning null */
+  /*
+   * Interface for a filter that transforms a URL: it can pass the original URL
+   * through or "delete" the URL by returning null
+   */
   public String filter(String urlString);
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java Fri 
Jan  9 06:34:33 2015
@@ -38,23 +38,23 @@ public class URLFilterChecker {
   private Configuration conf;
 
   public URLFilterChecker(Configuration conf) {
-      this.conf = conf;
+    this.conf = conf;
   }
 
   private void checkOne(String filterName) throws Exception {
     URLFilter filter = null;
 
-    ExtensionPoint point =
-      PluginRepository.get(conf).getExtensionPoint(URLFilter.X_POINT_ID);
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLFilter.X_POINT_ID);
 
     if (point == null)
-      throw new RuntimeException(URLFilter.X_POINT_ID+" not found.");
+      throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
 
     Extension[] extensions = point.getExtensions();
 
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
-      filter = (URLFilter)extension.getExtensionInstance();
+      filter = (URLFilter) extension.getExtensionInstance();
       if (filter.getClass().getName().equals(filterName)) {
         break;
       } else {
@@ -63,19 +63,19 @@ public class URLFilterChecker {
     }
 
     if (filter == null)
-      throw new RuntimeException("Filter "+filterName+" not found.");
+      throw new RuntimeException("Filter " + filterName + " not found.");
 
     // jerome : should we keep this behavior?
-    //if (LogFormatter.hasLoggedSevere())
-    //  throw new RuntimeException("Severe error encountered.");
+    // if (LogFormatter.hasLoggedSevere())
+    // throw new RuntimeException("Severe error encountered.");
 
-    System.out.println("Checking URLFilter "+filterName);
+    System.out.println("Checking URLFilter " + filterName);
 
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
-      String out=filter.filter(line);
-      if(out!=null) {
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
         System.out.print("+");
         System.out.println(out);
       } else {
@@ -90,10 +90,10 @@ public class URLFilterChecker {
 
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
+    while ((line = in.readLine()) != null) {
       URLFilters filters = new URLFilters(this.conf);
       String out = filters.filter(line);
-      if(out!=null) {
+      if (out != null) {
         System.out.print("+");
         System.out.println(out);
       } else {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java Fri Jan  9 
06:34:33 2015
@@ -28,7 +28,8 @@ import org.apache.nutch.plugin.PluginRep
 import org.apache.nutch.util.ObjectCache;
 
 import org.apache.hadoop.conf.Configuration;
-/** Creates and caches {@link URLFilter} implementing plugins.*/
+
+/** Creates and caches {@link URLFilter} implementing plugins. */
 public class URLFilters {
 
   public static final String URLFILTER_ORDER = "urlfilter.order";
@@ -37,7 +38,8 @@ public class URLFilters {
   public URLFilters(Configuration conf) {
     String order = conf.get(URLFILTER_ORDER);
     ObjectCache objectCache = ObjectCache.get(conf);
-    this.filters = (URLFilter[]) 
objectCache.getObject(URLFilter.class.getName());
+    this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class
+        .getName());
 
     if (this.filters == null) {
       String[] orderedFilters = null;
@@ -60,8 +62,8 @@ public class URLFilters {
           }
         }
         if (orderedFilters == null) {
-          objectCache.setObject(URLFilter.class.getName(), 
filterMap.values().toArray(
-              new URLFilter[0]));
+          objectCache.setObject(URLFilter.class.getName(), filterMap.values()
+              .toArray(new URLFilter[0]));
         } else {
           ArrayList<URLFilter> filters = new ArrayList<URLFilter>();
           for (int i = 0; i < orderedFilters.length; i++) {
@@ -70,13 +72,14 @@ public class URLFilters {
               filters.add(filter);
             }
           }
-          objectCache.setObject(URLFilter.class.getName(), filters
-              .toArray(new URLFilter[filters.size()]));
+          objectCache.setObject(URLFilter.class.getName(),
+              filters.toArray(new URLFilter[filters.size()]));
         }
       } catch (PluginRuntimeException e) {
         throw new RuntimeException(e);
       }
-      this.filters = (URLFilter[]) 
objectCache.getObject(URLFilter.class.getName());
+      this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class
+          .getName());
     }
   }
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java Fri Jan 
 9 06:34:33 2015
@@ -21,13 +21,17 @@ import java.net.MalformedURLException;
 
 import org.apache.hadoop.conf.Configurable;
 
-/** Interface used to convert URLs to normal form and optionally perform 
substitutions */
+/**
+ * Interface used to convert URLs to normal form and optionally perform
+ * substitutions
+ */
 public interface URLNormalizer extends Configurable {
-  
+
   /* Extension ID */
   public static final String X_POINT_ID = URLNormalizer.class.getName();
-  
+
   /* Interface for URL normalization */
-  public String normalize(String urlString, String scope) throws 
MalformedURLException;
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException;
 
 }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java 
Fri Jan  9 06:34:33 2015
@@ -36,23 +36,23 @@ public class URLNormalizerChecker {
   private Configuration conf;
 
   public URLNormalizerChecker(Configuration conf) {
-      this.conf = conf;
+    this.conf = conf;
   }
 
   private void checkOne(String normalizerName, String scope) throws Exception {
     URLNormalizer normalizer = null;
 
-    ExtensionPoint point =
-      PluginRepository.get(conf).getExtensionPoint(URLNormalizer.X_POINT_ID);
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.X_POINT_ID);
 
     if (point == null)
-      throw new RuntimeException(URLNormalizer.X_POINT_ID+" not found.");
+      throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found.");
 
     Extension[] extensions = point.getExtensions();
 
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
-      normalizer = (URLNormalizer)extension.getExtensionInstance();
+      normalizer = (URLNormalizer) extension.getExtensionInstance();
       if (normalizer.getClass().getName().equals(normalizerName)) {
         break;
       } else {
@@ -61,7 +61,8 @@ public class URLNormalizerChecker {
     }
 
     if (normalizer == null)
-      throw new RuntimeException("URLNormalizer "+normalizerName+" not 
found.");
+      throw new RuntimeException("URLNormalizer " + normalizerName
+          + " not found.");
 
     System.out.println("Checking URLNormalizer " + normalizerName);
 
@@ -79,7 +80,7 @@ public class URLNormalizerChecker {
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
     URLNormalizers normalizers = new URLNormalizers(conf, scope);
-    while((line = in.readLine()) != null) {
+    while ((line = in.readLine()) != null) {
       String out = normalizers.normalize(line, scope);
       System.out.println(out);
     }
@@ -88,7 +89,7 @@ public class URLNormalizerChecker {
   public static void main(String[] args) throws Exception {
 
     String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] 
[-scope <scope>]"
-      + "\n\tscope can be one of: 
default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
+        + "\n\tscope can be one of: 
default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
 
     String normalizerName = null;
     String scope = URLNormalizers.SCOPE_DEFAULT;
@@ -103,7 +104,8 @@ public class URLNormalizerChecker {
       }
     }
 
-    URLNormalizerChecker checker = new 
URLNormalizerChecker(NutchConfiguration.create());
+    URLNormalizerChecker checker = new URLNormalizerChecker(
+        NutchConfiguration.create());
     if (normalizerName != null) {
       checker.checkOne(normalizerName, scope);
     } else {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java Fri 
Jan  9 06:34:33 2015
@@ -43,47 +43,63 @@ import org.apache.nutch.util.ObjectCache
  * contexts where they are used (note however that they need to be activated
  * first through <tt>plugin.include</tt> property).
  * 
- * <p>There is one global scope defined by default, which consists of all
- * active normalizers. The order in which these normalizers
- * are executed may be defined in "urlnormalizer.order" property, which lists
- * space-separated implementation classes (if this property is missing 
normalizers
- * will be run in random order). If there are more
- * normalizers activated than explicitly named on this list, the remaining ones
- * will be run in random order after the ones specified on the list are 
executed.</p>
- * <p>You can define a set of contexts (or scopes) in which normalizers may be
+ * <p>
+ * There is one global scope defined by default, which consists of all active
+ * normalizers. The order in which these normalizers are executed may be 
defined
+ * in "urlnormalizer.order" property, which lists space-separated 
implementation
+ * classes (if this property is missing normalizers will be run in random
+ * order). If there are more normalizers activated than explicitly named on 
this
+ * list, the remaining ones will be run in random order after the ones 
specified
+ * on the list are executed.
+ * </p>
+ * <p>
+ * You can define a set of contexts (or scopes) in which normalizers may be
  * called. Each scope can have its own list of normalizers (defined in
  * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in
  * "urlnormalizer.order.<scope_name>" property). If any of these properties are
- * missing, default settings are used for the global scope.</p>
- * <p>In case no normalizers are required for any given scope, a
- * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> 
should be used.</p>
- * <p>Each normalizer may further select among many configurations, depending 
on
- * the scope in which it is called, because the scope name is passed as a 
parameter
- * to each normalizer. You can also use the same normalizer for many 
scopes.</p>
- * <p>Several scopes have been defined, and various Nutch tools will attempt 
using
- * scope-specific normalizers first (and fall back to default config if 
scope-specific
- * configuration is missing).</p>
- * <p>Normalizers may be run several times, to ensure that modifications 
introduced
+ * missing, default settings are used for the global scope.
+ * </p>
+ * <p>
+ * In case no normalizers are required for any given scope, a
+ * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> 
should
+ * be used.
+ * </p>
+ * <p>
+ * Each normalizer may further select among many configurations, depending on
+ * the scope in which it is called, because the scope name is passed as a
+ * parameter to each normalizer. You can also use the same normalizer for many
+ * scopes.
+ * </p>
+ * <p>
+ * Several scopes have been defined, and various Nutch tools will attempt using
+ * scope-specific normalizers first (and fall back to default config if
+ * scope-specific configuration is missing).
+ * </p>
+ * <p>
+ * Normalizers may be run several times, to ensure that modifications 
introduced
  * by normalizers at the end of the list can be further reduced by normalizers
- * executed at the beginning. By default this loop is executed just once - if 
you want
- * to ensure that all possible combinations have been applied you may want to 
run
- * this loop up to the number of activated normalizers. This loop count can be 
configured
- * through <tt>urlnormalizer.loop.count</tt> property. As soon as the url is
- * unchanged the loop will stop and return the result.</p>
+ * executed at the beginning. By default this loop is executed just once - if
+ * you want to ensure that all possible combinations have been applied you may
+ * want to run this loop up to the number of activated normalizers. This loop
+ * count can be configured through <tt>urlnormalizer.loop.count</tt> property.
+ * As soon as the url is unchanged the loop will stop and return the result.
+ * </p>
  * 
  * @author Andrzej Bialecki
  */
 public final class URLNormalizers {
-  
-  /** Default scope. If no scope properties are defined then the configuration 
for
-   * this scope will be used.
+
+  /**
+   * Default scope. If no scope properties are defined then the configuration
+   * for this scope will be used.
    */
   public static final String SCOPE_DEFAULT = "default";
   /** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */
   public static final String SCOPE_PARTITION = "partition";
   /** Scope used by {@link org.apache.nutch.crawl.GeneratorJob}. */
   public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
-  /** Scope used by {@link org.apache.nutch.fetcher.FetcherJob} when processing
+  /**
+   * Scope used by {@link org.apache.nutch.fetcher.FetcherJob} when processing
    * redirect URLs.
    */
   public static final String SCOPE_FETCHER = "fetcher";
@@ -93,15 +109,18 @@ public final class URLNormalizers {
   public static final String SCOPE_LINKDB = "linkdb";
   /** Scope used by {@link org.apache.nutch.crawl.InjectorJob}. */
   public static final String SCOPE_INJECT = "inject";
-  /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} 
instances. */
+  /**
+   * Scope used when constructing new {@link org.apache.nutch.parse.Outlink}
+   * instances.
+   */
   public static final String SCOPE_OUTLINK = "outlink";
-  
 
-  public static final Logger LOG = 
LoggerFactory.getLogger(URLNormalizers.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(URLNormalizers.class);
 
   /* Empty extension list for caching purposes. */
   private final List<Extension> EMPTY_EXTENSION_LIST = Collections.emptyList();
-  
+
   private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
 
   private Configuration conf;
@@ -109,37 +128,39 @@ public final class URLNormalizers {
   private ExtensionPoint extensionPoint;
 
   private URLNormalizer[] normalizers;
-  
+
   private int loopCount;
 
   public URLNormalizers(Configuration conf, String scope) {
     this.conf = conf;
     this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
-            URLNormalizer.X_POINT_ID);
+        URLNormalizer.X_POINT_ID);
     ObjectCache objectCache = ObjectCache.get(conf);
-    
+
     if (this.extensionPoint == null) {
       throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
-              + " not found.");
+          + " not found.");
     }
 
-    normalizers = 
(URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + scope);
+    normalizers = (URLNormalizer[]) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_" + scope);
     if (normalizers == null) {
       normalizers = getURLNormalizers(scope);
     }
     if (normalizers == EMPTY_NORMALIZERS) {
-      normalizers = 
(URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + 
SCOPE_DEFAULT);
+      normalizers = (URLNormalizer[]) objectCache
+          .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
       if (normalizers == null) {
         normalizers = getURLNormalizers(SCOPE_DEFAULT);
       }
     }
-    
+
     loopCount = conf.getInt("urlnormalizer.loop.count", 1);
   }
 
   /**
-   * Function returns an array of {@link URLNormalizer}s for a given scope,
-   * with a specified order.
+   * Function returns an array of {@link URLNormalizer}s for a given scope, 
with
+   * a specified order.
    * 
    * @param scope
    *          The scope to return the <code>Array</code> of
@@ -151,12 +172,13 @@ public final class URLNormalizers {
   URLNormalizer[] getURLNormalizers(String scope) {
     List<Extension> extensions = getExtensions(scope);
     ObjectCache objectCache = ObjectCache.get(conf);
-    
+
     if (extensions == EMPTY_EXTENSION_LIST) {
       return EMPTY_NORMALIZERS;
     }
-    
-    List<URLNormalizer> normalizers = new 
Vector<URLNormalizer>(extensions.size());
+
+    List<URLNormalizer> normalizers = new Vector<URLNormalizer>(
+        extensions.size());
 
     Iterator<Extension> it = extensions.iterator();
     while (it.hasNext()) {
@@ -174,14 +196,13 @@ public final class URLNormalizers {
       } catch (PluginRuntimeException e) {
         e.printStackTrace();
         LOG.warn("URLNormalizers:PluginRuntimeException when "
-                + "initializing url normalizer plugin "
-                + ext.getDescriptor().getPluginId()
-                + " instance in getURLNormalizers "
-                + "function: attempting to continue instantiating plugins");
+            + "initializing url normalizer plugin "
+            + ext.getDescriptor().getPluginId()
+            + " instance in getURLNormalizers "
+            + "function: attempting to continue instantiating plugins");
       }
     }
-    return normalizers.toArray(new URLNormalizer[normalizers
-            .size()]);
+    return normalizers.toArray(new URLNormalizer[normalizers.size()]);
   }
 
   /**
@@ -196,9 +217,8 @@ public final class URLNormalizers {
   @SuppressWarnings("unchecked")
   private List<Extension> getExtensions(String scope) {
     ObjectCache objectCache = ObjectCache.get(conf);
-    List<Extension> extensions = 
-      (List<Extension>) objectCache.getObject(URLNormalizer.X_POINT_ID + "_x_"
-                                                + scope);
+    List<Extension> extensions = (List<Extension>) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope);
 
     // Just compare the reference:
     // if this is the empty list, we know we will find no extension.
@@ -209,11 +229,13 @@ public final class URLNormalizers {
     if (extensions == null) {
       extensions = findExtensions(scope);
       if (extensions != null) {
-        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, 
extensions);
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            extensions);
       } else {
         // Put the empty extension list into cache
         // to remember we don't know any related extension.
-        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, 
EMPTY_EXTENSION_LIST);
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            EMPTY_EXTENSION_LIST);
         extensions = EMPTY_EXTENSION_LIST;
       }
     }
@@ -233,7 +255,8 @@ public final class URLNormalizers {
 
     String[] orders = null;
     String orderlist = conf.get("urlnormalizer.order." + scope);
-    if (orderlist == null) orderlist = conf.get("urlnormalizer.order");
+    if (orderlist == null)
+      orderlist = conf.get("urlnormalizer.order");
     if (orderlist != null && !orderlist.trim().equals("")) {
       orders = orderlist.split("\\s+");
     }
@@ -271,13 +294,17 @@ public final class URLNormalizers {
 
   /**
    * Normalize
-   * @param urlString The URL string to normalize.
-   * @param scope The given scope.
+   * 
+   * @param urlString
+   *          The URL string to normalize.
+   * @param scope
+   *          The given scope.
    * @return A normalized String, using the given <code>scope</code>
-   * @throws MalformedURLException If the given URL string is malformed.
+   * @throws MalformedURLException
+   *           If the given URL string is malformed.
    */
   public String normalize(String urlString, String scope)
-          throws MalformedURLException {
+      throws MalformedURLException {
     // optionally loop several times, and break if no further changes
     String initialString = urlString;
     for (int k = 0; k < loopCount; k++) {
@@ -286,7 +313,8 @@ public final class URLNormalizers {
           return null;
         urlString = this.normalizers[i].normalize(urlString, scope);
       }
-      if (initialString.equals(urlString)) break;
+      if (initialString.equals(urlString))
+        break;
       initialString = urlString;
     }
     return urlString;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java Fri Jan  
9 06:34:33 2015
@@ -20,3 +20,4 @@
  * and {@link org.apache.nutch.net.URLNormalizer normalizers}.
  */
 package org.apache.nutch.net;
+

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java 
Fri Jan  9 06:34:33 2015
@@ -26,15 +26,15 @@ import java.text.ParseException;
 
 /**
  * class to handle HTTP dates.
- *
+ * 
  * Modified from FastHttpDateFormat.java in jakarta-tomcat.
- *
+ * 
  * @author John Xing
  */
 public class HttpDateFormat {
 
-  protected static SimpleDateFormat format = 
-    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
+  protected static SimpleDateFormat format = new SimpleDateFormat(
+      "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
 
   /**
    * HTTP date uses TimeZone GMT
@@ -43,29 +43,29 @@ public class HttpDateFormat {
     format.setTimeZone(TimeZone.getTimeZone("GMT"));
   }
 
-  //HttpDate (long t) {
-  //}
+  // HttpDate (long t) {
+  // }
 
-  //HttpDate (String s) {
-  //}
+  // HttpDate (String s) {
+  // }
 
-//  /**
-//   * Get the current date in HTTP format.
-//   */
-//  public static String getCurrentDate() {
-//
-//    long now = System.currentTimeMillis();
-//    if ((now - currentDateGenerated) > 1000) {
-//        synchronized (format) {
-//            if ((now - currentDateGenerated) > 1000) {
-//                currentDateGenerated = now;
-//                currentDate = format.format(new Date(now));
-//            }
-//        }
-//    }
-//    return currentDate;
-//
-//  }
+  // /**
+  // * Get the current date in HTTP format.
+  // */
+  // public static String getCurrentDate() {
+  //
+  // long now = System.currentTimeMillis();
+  // if ((now - currentDateGenerated) > 1000) {
+  // synchronized (format) {
+  // if ((now - currentDateGenerated) > 1000) {
+  // currentDateGenerated = now;
+  // currentDate = format.format(new Date(now));
+  // }
+  // }
+  // }
+  // return currentDate;
+  //
+  // }
 
   /**
    * Get the HTTP format of the specified date.

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java
 (original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java
 Fri Jan  9 06:34:33 2015
@@ -21,13 +21,13 @@ import java.io.Serializable;
 
 /**
  * Base exception for all protocol handlers
+ * 
  * @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead.
  */
 @Deprecated
 @SuppressWarnings("serial")
 public class ProtocolException extends Exception implements Serializable {
 
-
   public ProtocolException() {
     super();
   }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java 
Fri Jan  9 06:34:33 2015
@@ -23,12 +23,11 @@ import java.net.URL;
 import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Metadata;
 
-
 /**
- * A response interface.  Makes all protocols model HTTP.
+ * A response interface. Makes all protocols model HTTP.
  */
 public interface Response extends HttpHeaders {
-  
+
   /** Returns the URL used to retrieve this response. */
   public URL getUrl();
 
@@ -40,7 +39,7 @@ public interface Response extends HttpHe
 
   /** Returns all the headers. */
   public Metadata getHeaders();
-  
+
   /** Returns the full content of the response. */
   public byte[] getContent();
 

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java 
(original)
+++ 
nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java 
Fri Jan  9 06:34:33 2015
@@ -20,3 +20,4 @@
  * interface, sea also {@link org.apache.nutch.protocol}.
  */
 package org.apache.nutch.net.protocols;
+

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java Fri 
Jan  9 06:34:33 2015
@@ -24,8 +24,8 @@ import java.util.Properties;
 import org.apache.nutch.metadata.Metadata;
 
 /**
- * This class holds the information about HTML "meta" tags extracted from 
- * a page. Some special tags have convenience methods for easy checking.
+ * This class holds the information about HTML "meta" tags extracted from a
+ * page. Some special tags have convenience methods for easy checking.
  */
 public class HTMLMetaTags {
   private boolean noIndex = false;
@@ -156,8 +156,8 @@ public class HTMLMetaTags {
   }
 
   /**
-   * A convenience method. Returns the current value of 
<code>refreshTime</code>.
-   * The value may be invalid if {@link #getRefresh()}returns
+   * A convenience method. Returns the current value of 
<code>refreshTime</code>
+   * . The value may be invalid if {@link #getRefresh()}returns
    * <code>false</code>.
    */
   public int getRefreshTime() {
@@ -179,16 +179,12 @@ public class HTMLMetaTags {
   public Properties getHttpEquivTags() {
     return httpEquivTags;
   }
-  
+
   public String toString() {
     StringBuffer sb = new StringBuffer();
-    sb.append("base=" + baseHref
-            + ", noCache=" + noCache
-            + ", noFollow=" + noFollow
-            + ", noIndex=" + noIndex
-            + ", refresh=" + refresh
-            + ", refreshHref=" + refreshHref + "\n"
-            );
+    sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow="
+        + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh
+        + ", refreshHref=" + refreshHref + "\n");
     sb.append(" * general tags:\n");
     String[] names = generalTags.names();
     for (String name : names) {
@@ -198,7 +194,7 @@ public class HTMLMetaTags {
     sb.append(" * http-equiv tags:\n");
     Iterator<Object> it = httpEquivTags.keySet().iterator();
     while (it.hasNext()) {
-      String key = (String)it.next();
+      String key = (String) it.next();
       sb.append("   - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
     }
     return sb.toString();

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java Fri Jan  9 
06:34:33 2015
@@ -28,11 +28,13 @@ public class Outlink implements Writable
   private String toUrl;
   private String anchor;
 
-  public Outlink() {}
+  public Outlink() {
+  }
 
   public Outlink(String toUrl, String anchor) throws MalformedURLException {
     this.toUrl = toUrl;
-    if (anchor == null) anchor = "";
+    if (anchor == null)
+      anchor = "";
     this.anchor = anchor;
   }
 
@@ -43,8 +45,8 @@ public class Outlink implements Writable
 
   /** Skips over one Outlink in the input. */
   public static void skip(DataInput in) throws IOException {
-    Text.skip(in);                                // skip toUrl
-    Text.skip(in);                                // skip anchor
+    Text.skip(in); // skip toUrl
+    Text.skip(in); // skip anchor
   }
 
   public void write(DataOutput out) throws IOException {
@@ -58,21 +60,24 @@ public class Outlink implements Writable
     return outlink;
   }
 
-  public String getToUrl() { return toUrl; }
-  public String getAnchor() { return anchor; }
+  public String getToUrl() {
+    return toUrl;
+  }
 
+  public String getAnchor() {
+    return anchor;
+  }
 
   public boolean equals(Object o) {
     if (!(o instanceof Outlink))
       return false;
-    Outlink other = (Outlink)o;
-    return
-      this.toUrl.equals(other.toUrl) &&
-      this.anchor.equals(other.anchor);
+    Outlink other = (Outlink) o;
+    return this.toUrl.equals(other.toUrl) && this.anchor.equals(other.anchor);
   }
 
   public String toString() {
-    return "toUrl: " + toUrl + " anchor: " + anchor;  // removed "\n". 
toString, not printLine... WD.
+    return "toUrl: " + toUrl + " anchor: " + anchor; // removed "\n". toString,
+                                                     // not printLine... WD.
   }
 
 }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java 
Fri Jan  9 06:34:33 2015
@@ -34,8 +34,8 @@ import org.apache.oro.text.regex.Perl5Co
 import org.apache.oro.text.regex.Perl5Matcher;
 
 /**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
- * / URLs from plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
+ * plain text using Regular Expressions.
  * 
  * @see <a
  *      
href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions";>Comparison
@@ -48,23 +48,26 @@ import org.apache.oro.text.regex.Perl5Ma
  * @since 0.7
  */
 public class OutlinkExtractor {
-  private static final Logger LOG = 
LoggerFactory.getLogger(OutlinkExtractor.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(OutlinkExtractor.class);
 
   /**
    * Regex pattern to get URLs within a plain text.
    * 
    * @see <a
    *      
href="http://www.truerwords.net/articles/ut/urlactivation.html";>http://www.truerwords.net/articles/ut/urlactivation.html
+
    *      </a>
    */
-  private static final String URL_PATTERN = 
-    
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+  private static final String URL_PATTERN = 
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
 
   /**
-   * Extracts <code>Outlink</code> from given plain text.
-   * Applying this method to non-plain-text can result in extremely lengthy
-   * runtimes for parasitic cases (postscript is a known example).
-   * @param plainText  the plain text from wich URLs should be extracted.
+   * Extracts <code>Outlink</code> from given plain text. Applying this method
+   * to non-plain-text can result in extremely lengthy runtimes for parasitic
+   * cases (postscript is a known example).
+   * 
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
    * 
    * @return Array of <code>Outlink</code>s within found in plainText
    */
@@ -73,15 +76,18 @@ public class OutlinkExtractor {
   }
 
   /**
-   * Extracts <code>Outlink</code> from given plain text and adds anchor
-   * to the extracted <code>Outlink</code>s
+   * Extracts <code>Outlink</code> from given plain text and adds anchor to the
+   * extracted <code>Outlink</code>s
    * 
-   * @param plainText the plain text from wich URLs should be extracted.
-   * @param anchor    the anchor of the url
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
+   * @param anchor
+   *          the anchor of the url
    * 
    * @return Array of <code>Outlink</code>s within found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, String anchor, 
Configuration conf) {
+  public static Outlink[] getOutlinks(final String plainText, String anchor,
+      Configuration conf) {
     long start = System.currentTimeMillis();
     final List<Outlink> outlinks = new ArrayList<Outlink>();
 
@@ -97,11 +103,11 @@ public class OutlinkExtractor {
       MatchResult result;
       String url;
 
-      //loop the matches
+      // loop the matches
       while (matcher.contains(input, pattern)) {
         // if this is taking too long, stop matching
-        //   (SHOULD really check cpu time used so that heavily loaded systems
-        //   do not unnecessarily hit this limit.)
+        // (SHOULD really check cpu time used so that heavily loaded systems
+        // do not unnecessarily hit this limit.)
         if (System.currentTimeMillis() - start >= 60000L) {
           if (LOG.isWarnEnabled()) {
             LOG.warn("Time limit exceeded for getOutLinks");
@@ -117,13 +123,16 @@ public class OutlinkExtractor {
         }
       }
     } catch (Exception ex) {
-      // if the matcher fails (perhaps a malformed URL) we just log it and 
move on
-      if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+      // if the matcher fails (perhaps a malformed URL) we just log it and move
+      // on
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getOutlinks", ex);
+      }
     }
 
     final Outlink[] retval;
 
-    //create array of the Outlinks
+    // create array of the Outlinks
     if (outlinks != null && outlinks.size() > 0) {
       retval = outlinks.toArray(new Outlink[0]);
     } else {
@@ -132,7 +141,6 @@ public class OutlinkExtractor {
 
     return retval;
   }
-  
 
   /**
    * Extracts outlinks from a plain text. <br />
@@ -162,7 +170,7 @@ public class OutlinkExtractor {
     // url = re.getParen(0);
     //
     // if (LOG.isTraceEnabled()) {
-    //   LOG.trace("Extracted url: " + url);
+    // LOG.trace("Extracted url: " + url);
     // }
     //
     // try {
@@ -192,9 +200,8 @@ public class OutlinkExtractor {
   }
 
   /**
-   * Extracts outlinks from a plain text.
-   * </p>
-   * This Method takes the JDK5 Regexp API.
+   * Extracts outlinks from a plain text. </p> This Method takes the JDK5 
Regexp
+   * API.
    * 
    * @param plainText
    * 
@@ -243,5 +250,5 @@ public class OutlinkExtractor {
     //
     // return retval;
   }
- 
+
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java Fri Jan  9 
06:34:33 2015
@@ -16,7 +16,6 @@
  
******************************************************************************/
 package org.apache.nutch.parse;
 
-
 public class Parse {
 
   private String text;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java Fri 
Jan  9 06:34:33 2015
@@ -24,7 +24,7 @@ class ParseCallable implements Callable<
   private Parser p;
   private WebPage content;
   private String url;
-  
+
   public ParseCallable(Parser p, WebPage content, String url) {
     this.p = p;
     this.content = content;
@@ -34,5 +34,5 @@ class ParseCallable implements Callable<
   @Override
   public Parse call() throws Exception {
     return p.getParse(url, content);
-  }    
+  }
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java Fri Jan 
 9 06:34:33 2015
@@ -22,18 +22,19 @@ import org.apache.nutch.plugin.FieldPlug
 import org.apache.nutch.storage.WebPage;
 import org.w3c.dom.DocumentFragment;
 
-
-/** Extension point for DOM-based parsers.  Permits one to add additional
- * metadata to parses provided by the html or tika plugins.  All plugins found 
which implement this extension
- * point are run sequentially on the parse.
+/**
+ * Extension point for DOM-based parsers. Permits one to add additional 
metadata
+ * to parses provided by the html or tika plugins. All plugins found which
+ * implement this extension point are run sequentially on the parse.
  */
 public interface ParseFilter extends FieldPluggable, Configurable {
   /** The name of the extension point. */
   final static String X_POINT_ID = ParseFilter.class.getName();
 
-  /** Adds metadata or otherwise modifies a parse, given
-   * the DOM tree of a page. */
-  Parse filter(String url, WebPage page, Parse parse,
-                    HTMLMetaTags metaTags, DocumentFragment doc);
+  /**
+   * Adds metadata or otherwise modifies a parse, given the DOM tree of a page.
+   */
+  Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags,
+      DocumentFragment doc);
 
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java Fri 
Jan  9 06:34:33 2015
@@ -31,7 +31,7 @@ import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.ObjectCache;
 import org.w3c.dom.DocumentFragment;
 
-/** Creates and caches {@link ParseFilter} implementing plugins.*/
+/** Creates and caches {@link ParseFilter} implementing plugins. */
 public class ParseFilters {
 
   private ParseFilter[] parseFilters;
@@ -41,7 +41,8 @@ public class ParseFilters {
   public ParseFilters(Configuration conf) {
     String order = conf.get(HTMLPARSEFILTER_ORDER);
     ObjectCache objectCache = ObjectCache.get(conf);
-    this.parseFilters = (ParseFilter[]) 
objectCache.getObject(ParseFilter.class.getName());
+    this.parseFilters = (ParseFilter[]) objectCache.getObject(ParseFilter.class
+        .getName());
     if (parseFilters == null) {
       /*
        * If ordered filters are required, prepare array of filters based on
@@ -51,21 +52,23 @@ public class ParseFilters {
       if (order != null && !order.trim().equals("")) {
         orderedFilters = order.split("\\s+");
       }
-      HashMap<String, ParseFilter> filterMap =
-        new HashMap<String, ParseFilter>();
+      HashMap<String, ParseFilter> filterMap = new HashMap<String, 
ParseFilter>();
       try {
-        ExtensionPoint point = 
PluginRepository.get(conf).getExtensionPoint(ParseFilter.X_POINT_ID);
+        ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+            ParseFilter.X_POINT_ID);
         if (point == null)
           throw new RuntimeException(ParseFilter.X_POINT_ID + " not found.");
         Extension[] extensions = point.getExtensions();
         for (int i = 0; i < extensions.length; i++) {
           Extension extension = extensions[i];
-          ParseFilter parseFilter = (ParseFilter) 
extension.getExtensionInstance();
+          ParseFilter parseFilter = (ParseFilter) extension
+              .getExtensionInstance();
           if (!filterMap.containsKey(parseFilter.getClass().getName())) {
             filterMap.put(parseFilter.getClass().getName(), parseFilter);
           }
         }
-        ParseFilter[] htmlParseFilters = filterMap.values().toArray(new 
ParseFilter[filterMap.size()]);
+        ParseFilter[] htmlParseFilters = filterMap.values().toArray(
+            new ParseFilter[filterMap.size()]);
         /*
          * If no ordered filters required, just get the filters in an
          * indeterminate order
@@ -77,19 +80,19 @@ public class ParseFilters {
         else {
           ArrayList<ParseFilter> filters = new ArrayList<ParseFilter>();
           for (int i = 0; i < orderedFilters.length; i++) {
-            ParseFilter filter = filterMap
-            .get(orderedFilters[i]);
+            ParseFilter filter = filterMap.get(orderedFilters[i]);
             if (filter != null) {
               filters.add(filter);
             }
           }
-          objectCache.setObject(ParseFilter.class.getName(), filters
-              .toArray(new ParseFilter[filters.size()]));
+          objectCache.setObject(ParseFilter.class.getName(),
+              filters.toArray(new ParseFilter[filters.size()]));
         }
       } catch (PluginRuntimeException e) {
         throw new RuntimeException(e);
       }
-      this.parseFilters = (ParseFilter[]) 
objectCache.getObject(ParseFilter.class.getName());
+      this.parseFilters = (ParseFilter[]) objectCache
+          .getObject(ParseFilter.class.getName());
     }
   }
 

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java Fri 
Jan  9 06:34:33 2015
@@ -22,25 +22,23 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
-
 /**
  * This class represents a natural ordering for which parsing plugin should get
  * called for a particular mimeType. It provides methods to store the
  * parse-plugins.xml data, and methods to retreive the name of the appropriate
  * parsing plugin for a contentType.
- *
+ * 
  * @author mattmann
  * @version 1.0
  */
 public class ParsePluginList {
-  
+
   /* a map to link mimeType to an ordered list of parsing plugins */
   private Map<String, List<String>> fMimeTypeToPluginMap = null;
-  
+
   /* A list of aliases */
   private Map<String, String> aliases = null;
-  
-  
+
   /**
    * Constructs a new ParsePluginList
    */
@@ -48,7 +46,7 @@ public class ParsePluginList {
     fMimeTypeToPluginMap = new HashMap<String, List<String>>();
     aliases = new HashMap<String, String>();
   }
-  
+
   public List<String> getPluginList(String mimeType) {
     return fMimeTypeToPluginMap.get(mimeType);
   }
@@ -56,18 +54,18 @@ public class ParsePluginList {
   void setAliases(Map<String, String> aliases) {
     this.aliases = aliases;
   }
-  
+
   public Map<String, String> getAliases() {
     return aliases;
   }
-  
+
   void setPluginList(String mimeType, List<String> l) {
     fMimeTypeToPluginMap.put(mimeType, l);
   }
-  
+
   List<String> getSupportedMimeTypes() {
-    return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray(
-            new String[] {}));
+    return Arrays
+        .asList(fMimeTypeToPluginMap.keySet().toArray(new String[] {}));
   }
-  
+
 }

Modified: 
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=1650447&r1=1650446&r2=1650447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java 
Fri Jan  9 06:34:33 2015
@@ -42,50 +42,50 @@ import org.apache.hadoop.conf.Configurat
 // Nutch imports
 import org.apache.nutch.util.NutchConfiguration;
 
-
 /**
  * A reader to load the information stored in the
  * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file.
- *
+ * 
  * @author mattmann
  * @version 1.0
  */
 public class ParsePluginsReader {
-  
+
   /* our log stream */
-  public static final Logger LOG = 
LoggerFactory.getLogger(ParsePluginsReader.class);
-  
+  public static final Logger LOG = LoggerFactory
+      .getLogger(ParsePluginsReader.class);
+
   /** The property name of the parse-plugins location */
   private static final String PP_FILE_PROP = "parse.plugin.file";
 
   /** the parse-plugins file */
   private String fParsePluginsFile = null;
 
-  
   /**
    * Constructs a new ParsePluginsReader
    */
-  public ParsePluginsReader() { }
-  
+  public ParsePluginsReader() {
+  }
+
   /**
    * Reads the <code>parse-plugins.xml</code> file and returns the
    * {@link #ParsePluginList} defined by it.
-   *
+   * 
    * @return A {@link #ParsePluginList} specified by the
    *         <code>parse-plugins.xml</code> file.
    * @throws Exception
-   *             If any parsing error occurs.
+   *           If any parsing error occurs.
    */
   public ParsePluginList parse(Configuration conf) {
-    
+
     ParsePluginList pList = new ParsePluginList();
-    
+
     // open up the XML file
     DocumentBuilderFactory factory = null;
     DocumentBuilder parser = null;
     Document document = null;
     InputSource inputSource = null;
-    
+
     InputStream ppInputStream = null;
     if (fParsePluginsFile != null) {
       URL parsePluginUrl = null;
@@ -94,56 +94,55 @@ public class ParsePluginsReader {
         ppInputStream = parsePluginUrl.openStream();
       } catch (Exception e) {
         if (LOG.isWarnEnabled()) {
-          LOG.warn("Unable to load parse plugins file from URL " +
-                   "[" + fParsePluginsFile + "]. Reason is [" + e + "]");
+          LOG.warn("Unable to load parse plugins file from URL " + "["
+              + fParsePluginsFile + "]. Reason is [" + e + "]");
         }
         return pList;
       }
     } else {
-      ppInputStream = conf.getConfResourceAsInputStream(
-                          conf.get(PP_FILE_PROP));
+      ppInputStream = 
conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP));
     }
-    
+
     inputSource = new InputSource(ppInputStream);
-    
+
     try {
       factory = DocumentBuilderFactory.newInstance();
       parser = factory.newDocumentBuilder();
       document = parser.parse(inputSource);
     } catch (Exception e) {
       if (LOG.isWarnEnabled()) {
-        LOG.warn("Unable to parse [" + fParsePluginsFile + "]." +
-                 "Reason is [" + e + "]");
+        LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + "Reason is ["
+            + e + "]");
       }
       return null;
     }
-    
+
     Element parsePlugins = document.getDocumentElement();
-    
+
     // build up the alias hash map
     Map<String, String> aliases = getAliases(parsePlugins);
     // And store it on the parse plugin list
     pList.setAliases(aliases);
-     
+
     // get all the mime type nodes
     NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType");
-    
+
     // iterate through the mime types
     for (int i = 0; i < mimeTypes.getLength(); i++) {
       Element mimeType = (Element) mimeTypes.item(i);
       String mimeTypeStr = mimeType.getAttribute("name");
-      
+
       // for each mimeType, get the plugin list
       NodeList pluginList = mimeType.getElementsByTagName("plugin");
-      
+
       // iterate through the plugins, add them in order read
       // OR if they have a special order="" attribute, then hold those in
       // a separate list, and then insert them into the final list at the
       // order specified
       if (pluginList != null && pluginList.getLength() > 0) {
         List<String> plugList = new ArrayList<String>(pluginList.getLength());
-        
-        for (int j = 0; j<pluginList.getLength(); j++) {
+
+        for (int j = 0; j < pluginList.getLength(); j++) {
           Element plugin = (Element) pluginList.item(j);
           String pluginId = plugin.getAttribute("id");
           String extId = aliases.get(pluginId);
@@ -163,110 +162,110 @@ public class ParsePluginsReader {
             plugList.add(extId);
           }
         }
-        
+
         // now add the plugin list and map it to this mimeType
         pList.setPluginList(mimeTypeStr, plugList);
-        
+
       } else if (LOG.isWarnEnabled()) {
         LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: "
-                 + mimeTypeStr + ", continuing parse");
+            + mimeTypeStr + ", continuing parse");
       }
     }
     return pList;
   }
-  
+
   /**
    * Tests parsing of the parse-plugins.xml file. An alternative name for the
-   * file can be specified via the <code>--file</code> option, although the
-   * file must be located in the <code>$NUTCH_HOME/conf</code> directory.
-   *
+   * file can be specified via the <code>--file</code> option, although the 
file
+   * must be located in the <code>$NUTCH_HOME/conf</code> directory.
+   * 
    * @param args
-   *            Currently only the --file argument to specify an alternative
-   *            name for the parse-plugins.xml file is supported.
+   *          Currently only the --file argument to specify an alternative name
+   *          for the parse-plugins.xml file is supported.
    */
   public static void main(String[] args) throws Exception {
     String parsePluginFile = null;
     String usage = "ParsePluginsReader [--file <parse plugin file location>]";
-    
-    if (( args.length != 0 && args.length != 2 )
+
+    if ((args.length != 0 && args.length != 2)
         || (args.length == 2 && !"--file".equals(args[0]))) {
       System.err.println(usage);
       System.exit(1);
     }
-    
+
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("--file")) {
         parsePluginFile = args[++i];
       }
     }
-    
+
     ParsePluginsReader reader = new ParsePluginsReader();
-    
+
     if (parsePluginFile != null) {
       reader.setFParsePluginsFile(parsePluginFile);
     }
-    
+
     ParsePluginList prefs = reader.parse(NutchConfiguration.create());
-    
+
     for (String mimeType : prefs.getSupportedMimeTypes()) {
-      
+
       System.out.println("MIMETYPE: " + mimeType);
       List<String> plugList = prefs.getPluginList(mimeType);
-      
+
       System.out.println("EXTENSION IDs:");
-      
+
       for (String j : plugList) {
         System.out.println(j);
       }
     }
-    
+
   }
-  
+
   /**
    * @return Returns the fParsePluginsFile.
    */
   public String getFParsePluginsFile() {
     return fParsePluginsFile;
   }
-  
+
   /**
    * @param parsePluginsFile
-   *            The fParsePluginsFile to set.
+   *          The fParsePluginsFile to set.
    */
   public void setFParsePluginsFile(String parsePluginsFile) {
     fParsePluginsFile = parsePluginsFile;
   }
-  
+
   private Map<String, String> getAliases(Element parsePluginsRoot) {
 
     Map<String, String> aliases = new HashMap<String, String>();
     NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
-         
+
     if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 
0)) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("No aliases defined in parse-plugins.xml!");
       }
       return aliases;
     }
-         
+
     if (aliasRoot.getLength() > 1) {
       // log a warning, but try and continue processing
       if (LOG.isWarnEnabled()) {
         LOG.warn("There should only be one \"aliases\" tag in 
parse-plugins.xml");
       }
     }
-         
-    Element aliasRootElem = (Element)aliasRoot.item(0);
+
+    Element aliasRootElem = (Element) aliasRoot.item(0);
     NodeList aliasElements = aliasRootElem.getElementsByTagName("alias");
-         
+
     if (aliasElements != null && aliasElements.getLength() > 0) {
-      for (int i=0; i<aliasElements.getLength(); i++) {
-        Element aliasElem = (Element)aliasElements.item(i);
-       String parsePluginId = aliasElem.getAttribute("name");
-       String extensionId = aliasElem.getAttribute("extension-id");
+      for (int i = 0; i < aliasElements.getLength(); i++) {
+        Element aliasElem = (Element) aliasElements.item(i);
+        String parsePluginId = aliasElem.getAttribute("name");
+        String extensionId = aliasElem.getAttribute("extension-id");
         if (LOG.isTraceEnabled()) {
-          LOG.trace("Found alias: plugin-id: " + parsePluginId +
-                    ", extension-id: " + extensionId);
+          LOG.trace("Found alias: plugin-id: " + parsePluginId
+              + ", extension-id: " + extensionId);
         }
         if (parsePluginId != null && extensionId != null) {
           aliases.put(parsePluginId, extensionId);
@@ -275,5 +274,5 @@ public class ParsePluginsReader {
     }
     return aliases;
   }
-  
+
 }

svn commit: r1650447 [4/25] - in /nutch/branches/2.x: ./ src/java/org/apache/nutch/api/ src/java/org/apache/nutch/api/impl/ src/java/org/apache/nutch/api/impl/db/ src/java/org/apache/nutch/api/model/response/ src/java/org/apache/nutch/api/resources/ sr...

Reply via email to