Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/solr/SolrUtils.java Fri Jan 9 06:34:33 2015 @@ -1,6 +1,5 @@ package org.apache.nutch.indexer.solr; - import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.auth.AuthScope; import org.apache.http.auth.UsernamePasswordCredentials; @@ -18,7 +17,8 @@ public class SolrUtils { public static Logger LOG = LoggerFactory.getLogger(SolrUtils.class); - public static HttpSolrServer getHttpSolrServer(Configuration job) throws MalformedURLException { + public static HttpSolrServer getHttpSolrServer(Configuration job) + throws MalformedURLException { DefaultHttpClient client = new DefaultHttpClient(); // Check for username/password @@ -27,10 +27,13 @@ public class SolrUtils { LOG.info("Authenticating as: " + username); - AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); + AuthScope scope = new AuthScope(AuthScope.ANY_HOST, AuthScope.ANY_PORT, + AuthScope.ANY_REALM, AuthScope.ANY_SCHEME); - client.getCredentialsProvider().setCredentials(scope, - new UsernamePasswordCredentials(username, job.get(SolrConstants.PASSWORD))); + client.getCredentialsProvider().setCredentials( + scope, + new UsernamePasswordCredentials(username, job + .get(SolrConstants.PASSWORD))); HttpParams params = client.getParams(); HttpClientParams.setAuthenticating(params, true); @@ -48,12 +51,14 @@ public class SolrUtils { for (int i = 0; i < input.length(); i++) { ch = input.charAt(i); - // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] - // and non-printable control characters except tabulator, new line and carriage return + // Strip all non-characters + // http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] + // and non-printable control characters except tabulator, new line and + // carriage return if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000 - ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range - (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef - (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) { + ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range + (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef + (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) { retval.append(ch); }
Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/CreativeCommons.java Fri Jan 9 06:34:33 2015 @@ -16,21 +16,20 @@ */ package org.apache.nutch.metadata; - /** * A collection of Creative Commons properties names. - * + * * @see <a href="http://www.creativecommons.org/">creativecommons.org</a> - * + * * @author Chris Mattmann * @author Jérôme Charron */ public interface CreativeCommons { - + public final static String LICENSE_URL = "License-Url"; - + public final static String LICENSE_LOCATION = "License-Location"; - + public final static String WORK_TYPE = "Work-Type"; - + } Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/DublinCore.java Fri Jan 9 06:34:33 2015 @@ -16,149 +16,146 @@ */ package org.apache.nutch.metadata; - /** * A collection of Dublin Core metadata names. - * - * @see <a href="http://dublincore.org">dublincore.org</a> - * + * + * @see <a href="http://dublincore.org">dublincore.org</a> + * * @author Chris Mattmann * @author Jérôme Charron */ public interface DublinCore { - - + /** - * Typically, Format may include the media-type or dimensions of the - * resource. Format may be used to determine the software, hardware or other - * equipment needed to display or operate the resource. Examples of - * dimensions include size and duration. Recommended best practice is to - * select a value from a controlled vocabulary (for example, the list of - * Internet Media Types [MIME] defining computer media formats). + * Typically, Format may include the media-type or dimensions of the resource. + * Format may be used to determine the software, hardware or other equipment + * needed to display or operate the resource. Examples of dimensions include + * size and duration. Recommended best practice is to select a value from a + * controlled vocabulary (for example, the list of Internet Media Types [MIME] + * defining computer media formats). */ public static final String FORMAT = "format"; - + /** - * Recommended best practice is to identify the resource by means of a - * string or number conforming to a formal identification system. Example - * formal identification systems include the Uniform Resource Identifier - * (URI) (including the Uniform Resource Locator (URL)), the Digital Object + * Recommended best practice is to identify the resource by means of a string + * or number conforming to a formal identification system. Example formal + * identification systems include the Uniform Resource Identifier (URI) + * (including the Uniform Resource Locator (URL)), the Digital Object * Identifier (DOI) and the International Standard Book Number (ISBN). */ public static final String IDENTIFIER = "identifier"; - + /** * Date on which the resource was changed. */ public static final String MODIFIED = "modified"; - + /** * An entity responsible for making contributions to the content of the - * resource. Examples of a Contributor include a person, an organisation, or - * a service. Typically, the name of a Contributor should be used to - * indicate the entity. + * resource. Examples of a Contributor include a person, an organisation, or a + * service. Typically, the name of a Contributor should be used to indicate + * the entity. */ public static final String CONTRIBUTOR = "contributor"; - + /** - * The extent or scope of the content of the resource. Coverage will - * typically include spatial location (a place name or geographic - * coordinates), temporal period (a period label, date, or date range) or - * jurisdiction (such as a named administrative entity). Recommended best - * practice is to select a value from a controlled vocabulary (for example, - * the Thesaurus of Geographic Names [TGN]) and that, where appropriate, - * named places or time periods be used in preference to numeric identifiers - * such as sets of coordinates or date ranges. + * The extent or scope of the content of the resource. Coverage will typically + * include spatial location (a place name or geographic coordinates), temporal + * period (a period label, date, or date range) or jurisdiction (such as a + * named administrative entity). Recommended best practice is to select a + * value from a controlled vocabulary (for example, the Thesaurus of + * Geographic Names [TGN]) and that, where appropriate, named places or time + * periods be used in preference to numeric identifiers such as sets of + * coordinates or date ranges. */ public static final String COVERAGE = "coverage"; - + /** * An entity primarily responsible for making the content of the resource. * Examples of a Creator include a person, an organisation, or a service. * Typically, the name of a Creator should be used to indicate the entity. */ public static final String CREATOR = "creator"; - + /** * A date associated with an event in the life cycle of the resource. - * Typically, Date will be associated with the creation or availability of - * the resource. Recommended best practice for encoding the date value is - * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD - * format. + * Typically, Date will be associated with the creation or availability of the + * resource. Recommended best practice for encoding the date value is defined + * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format. */ public static final String DATE = "date"; - + /** * An account of the content of the resource. Description may include but is * not limited to: an abstract, table of contents, reference to a graphical * representation of content or a free-text account of the content. */ public static final String DESCRIPTION = "description"; - + /** * A language of the intellectual content of the resource. Recommended best * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639 - * [ISO639], defines two- and three-letter primary language tags with - * optional subtags. Examples include "en" or "eng" for English, "akk" for - * Akkadian, and "en-GB" for English used in the United Kingdom. + * [ISO639], defines two- and three-letter primary language tags with optional + * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian, + * and "en-GB" for English used in the United Kingdom. */ public static final String LANGUAGE = "language"; - + /** * An entity responsible for making the resource available. Examples of a * Publisher include a person, an organisation, or a service. Typically, the * name of a Publisher should be used to indicate the entity. */ public static final String PUBLISHER = "publisher"; - + /** * A reference to a related resource. Recommended best practice is to * reference the resource by means of a string or number conforming to a * formal identification system. */ public static final String RELATION = "relation"; - + /** - * Information about rights held in and over the resource. Typically, a - * Rights element will contain a rights management statement for the - * resource, or reference a service providing such information. Rights - * information often encompasses Intellectual Property Rights (IPR), - * Copyright, and various Property Rights. If the Rights element is absent, - * no assumptions can be made about the status of these and other rights - * with respect to the resource. + * Information about rights held in and over the resource. Typically, a Rights + * element will contain a rights management statement for the resource, or + * reference a service providing such information. Rights information often + * encompasses Intellectual Property Rights (IPR), Copyright, and various + * Property Rights. If the Rights element is absent, no assumptions can be + * made about the status of these and other rights with respect to the + * resource. */ public static final String RIGHTS = "rights"; - + /** * A reference to a resource from which the present resource is derived. The * present resource may be derived from the Source resource in whole or in - * part. Recommended best practice is to reference the resource by means of - * a string or number conforming to a formal identification system. + * part. Recommended best practice is to reference the resource by means of a + * string or number conforming to a formal identification system. */ public static final String SOURCE = "source"; - + /** * The topic of the content of the resource. Typically, a Subject will be - * expressed as keywords, key phrases or classification codes that describe - * a topic of the resource. Recommended best practice is to select a value - * from a controlled vocabulary or formal classification scheme. + * expressed as keywords, key phrases or classification codes that describe a + * topic of the resource. Recommended best practice is to select a value from + * a controlled vocabulary or formal classification scheme. */ public static final String SUBJECT = "subject"; - + /** * A name given to the resource. Typically, a Title will be a name by which * the resource is formally known. */ public static final String TITLE = "title"; - + /** * The nature or genre of the content of the resource. Type includes terms - * describing general categories, functions, genres, or aggregation levels - * for content. Recommended best practice is to select a value from a - * controlled vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). - * To describe the physical or digital manifestation of the resource, use - * the Format element. + * describing general categories, functions, genres, or aggregation levels for + * content. Recommended best practice is to select a value from a controlled + * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe + * the physical or digital manifestation of the resource, use the Format + * element. */ public static final String TYPE = "type"; - + } Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/HttpHeaders.java Fri Jan 9 06:34:33 2015 @@ -16,14 +16,12 @@ */ package org.apache.nutch.metadata; - - /** * A collection of HTTP header names. - * - * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer - * Protocol -- HTTP/1.1 (RFC 2616)</a> - * + * + * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol + * -- HTTP/1.1 (RFC 2616)</a> + * * @author Chris Mattmann * @author Jérôme Charron */ Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/MetaWrapper.java Fri Jan 9 06:34:33 2015 @@ -28,28 +28,29 @@ import org.apache.nutch.crawl.NutchWrita /** * This is a simple decorator that adds metadata to any Writable-s that can be * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be - * temporarily enriched during processing, but this - * temporary metadata doesn't need to be permanently stored after the job is done. + * temporarily enriched during processing, but this temporary metadata doesn't + * need to be permanently stored after the job is done. * * @author Andrzej Bialecki */ public class MetaWrapper extends NutchWritable { private Metadata metadata; - + public MetaWrapper() { super(); metadata = new Metadata(); } - + public MetaWrapper(Writable instance, Configuration conf) { super(instance); metadata = new Metadata(); setConf(conf); } - + public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) { super(instance); - if (metadata == null) metadata = new Metadata(); + if (metadata == null) + metadata = new Metadata(); this.metadata = metadata; setConf(conf); } @@ -60,43 +61,52 @@ public class MetaWrapper extends NutchWr public Metadata getMetadata() { return metadata; } - + /** - * Add metadata. See {@link Metadata#add(String, String)} for more information. - * @param name metadata name - * @param value metadata value + * Add metadata. See {@link Metadata#add(String, String)} for more + * information. + * + * @param name + * metadata name + * @param value + * metadata value */ public void addMeta(String name, String value) { metadata.add(name, value); } - + /** - * Set metadata. See {@link Metadata#set(String, String)} for more information. + * Set metadata. See {@link Metadata#set(String, String)} for more + * information. + * * @param name * @param value */ public void setMeta(String name, String value) { metadata.set(name, value); } - + /** * Get metadata. See {@link Metadata#get(String)} for more information. + * * @param name * @return metadata value */ public String getMeta(String name) { return metadata.get(name); } - + /** - * Get multiple metadata. See {@link Metadata#getValues(String)} for more information. + * Get multiple metadata. See {@link Metadata#getValues(String)} for more + * information. + * * @param name * @return multiple values */ public String[] getMetaValues(String name) { return metadata.getValues(name); } - + public void readFields(DataInput in) throws IOException { super.readFields(in); metadata = new Metadata(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java Fri Jan 9 06:34:33 2015 @@ -27,23 +27,21 @@ import java.util.Properties; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; - /** * A multi-valued metadata container. - * + * * @author Chris Mattmann * @author Jérôme Charron - * + * */ -public class Metadata implements Writable, CreativeCommons, -DublinCore, HttpHeaders, Nutch, Feed { +public class Metadata implements Writable, CreativeCommons, DublinCore, + HttpHeaders, Nutch, Feed { /** * A map of all metadata attributes. */ private Map<String, String[]> metadata = null; - /** * Constructs a new, empty metadata. */ @@ -53,9 +51,10 @@ DublinCore, HttpHeaders, Nutch, Feed { /** * Returns true if named value is multivalued. - * @param name name of metadata - * @return true is named value is multivalued, false if single - * value or null + * + * @param name + * name of metadata + * @return true is named value is multivalued, false if single value or null */ public boolean isMultiValued(final String name) { return metadata.get(name) != null && metadata.get(name).length > 1; @@ -63,6 +62,7 @@ DublinCore, HttpHeaders, Nutch, Feed { /** * Returns an array of the names contained in the metadata. + * * @return Metadata names */ public String[] names() { @@ -70,11 +70,11 @@ DublinCore, HttpHeaders, Nutch, Feed { } /** - * Get the value associated to a metadata name. - * If many values are assiociated to the specified name, then the first - * one is returned. - * - * @param name of the metadata. + * Get the value associated to a metadata name. If many values are assiociated + * to the specified name, then the first one is returned. + * + * @param name + * of the metadata. * @return the value associated to the specified metadata name. */ public String get(final String name) { @@ -88,13 +88,15 @@ DublinCore, HttpHeaders, Nutch, Feed { /** * Get the values associated to a metadata name. - * @param name of the metadata. + * + * @param name + * of the metadata. * @return the values associated to a metadata name. */ public String[] getValues(final String name) { return _getValues(name); } - + private String[] _getValues(final String name) { String[] values = metadata.get(name); if (values == null) { @@ -104,12 +106,13 @@ DublinCore, HttpHeaders, Nutch, Feed { } /** - * Add a metadata name/value mapping. - * Add the specified value to the list of values associated to the - * specified metadata name. - * - * @param name the metadata name. - * @param value the metadata value. + * Add a metadata name/value mapping. Add the specified value to the list of + * values associated to the specified metadata name. + * + * @param name + * the metadata name. + * @param value + * the metadata value. */ public void add(final String name, final String value) { String[] values = metadata.get(name); @@ -125,31 +128,37 @@ DublinCore, HttpHeaders, Nutch, Feed { /** * Copy All key-value pairs from properties. - * @param properties properties to copy from + * + * @param properties + * properties to copy from */ public void setAll(Properties properties) { Enumeration<?> names = properties.propertyNames(); while (names.hasMoreElements()) { String name = (String) names.nextElement(); - metadata.put(name, new String[]{properties.getProperty(name)}); + metadata.put(name, new String[] { properties.getProperty(name) }); } } /** - * Set metadata name/value. - * Associate the specified value to the specified metadata name. If some - * previous values were associated to this name, they are removed. - * - * @param name the metadata name. - * @param value the metadata value. + * Set metadata name/value. Associate the specified value to the specified + * metadata name. If some previous values were associated to this name, they + * are removed. + * + * @param name + * the metadata name. + * @param value + * the metadata value. */ public void set(String name, String value) { - metadata.put(name, new String[]{value}); + metadata.put(name, new String[] { value }); } /** * Remove a metadata and all its associated values. - * @param name metadata name to remove + * + * @param name + * metadata name to remove */ public void remove(String name) { metadata.remove(name); @@ -157,12 +166,13 @@ DublinCore, HttpHeaders, Nutch, Feed { /** * Returns the number of metadata names in this metadata. + * * @return number of metadata names */ public int size() { return metadata.size(); } - + /** Remove all mappings from metadata. */ public void clear() { metadata.clear(); @@ -170,7 +180,9 @@ DublinCore, HttpHeaders, Nutch, Feed { public boolean equals(Object o) { - if (o == null) { return false; } + if (o == null) { + return false; + } Metadata other = null; try { @@ -179,7 +191,9 @@ DublinCore, HttpHeaders, Nutch, Feed { return false; } - if (other.size() != size()) { return false; } + if (other.size() != size()) { + return false; + } String[] names = names(); for (int i = 0; i < names.length; i++) { @@ -203,10 +217,7 @@ DublinCore, HttpHeaders, Nutch, Feed { for (int i = 0; i < names.length; i++) { String[] values = _getValues(names[i]); for (int j = 0; j < values.length; j++) { - buf.append(names[i]) - .append("=") - .append(values[j]) - .append(" "); + buf.append(names[i]).append("=").append(values[j]).append(" "); } } return buf.toString(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java Fri Jan 9 06:34:33 2015 @@ -19,20 +19,17 @@ package org.apache.nutch.metadata; import org.apache.avro.util.Utf8; import org.apache.hadoop.io.Text; - /** * A collection of Nutch internal metadata constants. - * + * * @author Chris Mattmann * @author Jérôme Charron */ public interface Nutch { - public static final String ORIGINAL_CHAR_ENCODING = - "OriginalCharEncoding"; + public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding"; - public static final String CHAR_ENCODING_FOR_CONVERSION = - "CharEncodingForConversion"; + public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion"; public static final String SIGNATURE_KEY = "nutch.content.digest"; @@ -42,20 +39,26 @@ public interface Nutch { public static final String GENERATE_TIME_KEY = "_ngt_"; - public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(GENERATE_TIME_KEY); + public static final Text WRITABLE_GENERATE_TIME_KEY = new Text( + GENERATE_TIME_KEY); public static final String PROTO_STATUS_KEY = "_pst_"; - public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY); + public static final Text WRITABLE_PROTO_STATUS_KEY = new Text( + PROTO_STATUS_KEY); public static final String FETCH_TIME_KEY = "_ftk_"; public static final String FETCH_STATUS_KEY = "_fst_"; - /** Sites may request that search engines don't provide access to cached documents. */ + /** + * Sites may request that search engines don't provide access to cached + * documents. + */ public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden"; - public static final Utf8 CACHING_FORBIDDEN_KEY_UTF8 = new Utf8(CACHING_FORBIDDEN_KEY); + public static final Utf8 CACHING_FORBIDDEN_KEY_UTF8 = new Utf8( + CACHING_FORBIDDEN_KEY); /** Show both original forbidden content and summaries (default). */ public static final String CACHING_FORBIDDEN_NONE = "none"; @@ -75,8 +78,7 @@ public interface Nutch { public static final Utf8 ALL_CRAWL_ID = new Utf8(ALL_BATCH_ID_STR); public static final String CRAWL_ID_KEY = "storage.crawl.id"; - - + // short constants for cmd-line args /** Batch id to select. */ public static final String ARG_BATCH = "batch"; @@ -110,7 +112,7 @@ public interface Nutch { public static final String ARG_CLASS = "class"; /** Depth (number of cycles) of a crawl. */ public static final String ARG_DEPTH = "depth"; - + // short constants for status / results fields /** Status / result message. */ public static final String STAT_MESSAGE = "msg"; Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Fri Jan 9 06:34:33 2015 @@ -33,7 +33,7 @@ public class SpellCheckedMetadata extend /** * Treshold divider. - * + * * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code> */ private static final int TRESHOLD_DIVIDER = 3; @@ -52,7 +52,7 @@ public class SpellCheckedMetadata extend // Uses following array to fill the metanames index and the // metanames list. - Class<?>[] spellthese = {HttpHeaders.class}; + Class<?>[] spellthese = { HttpHeaders.class }; for (Class<?> spellCheckedNames : spellthese) { for (Field field : spellCheckedNames.getFields()) { @@ -73,7 +73,7 @@ public class SpellCheckedMetadata extend /** * Normalizes String. - * + * * @param str * the string to normalize * @return normalized String @@ -102,7 +102,7 @@ public class SpellCheckedMetadata extend * </ul> * If no matching with a well-known metadata name is found, then the original * name is returned. - * + * * @param name * Name to normalize * @return normalized name Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilter.java Fri Jan 9 06:34:33 2015 @@ -23,17 +23,18 @@ import org.apache.hadoop.conf.Configurab // Nutch imports import org.apache.nutch.plugin.Pluggable; - /** - * Interface used to limit which URLs enter Nutch. - * Used by the injector and the db updater. + * Interface used to limit which URLs enter Nutch. Used by the injector and the + * db updater. */ public interface URLFilter extends Pluggable, Configurable { /** The name of the extension point. */ public final static String X_POINT_ID = URLFilter.class.getName(); - /* Interface for a filter that transforms a URL: it can pass the - original URL through or "delete" the URL by returning null */ + /* + * Interface for a filter that transforms a URL: it can pass the original URL + * through or "delete" the URL by returning null + */ public String filter(String urlString); } Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilterChecker.java Fri Jan 9 06:34:33 2015 @@ -38,23 +38,23 @@ public class URLFilterChecker { private Configuration conf; public URLFilterChecker(Configuration conf) { - this.conf = conf; + this.conf = conf; } private void checkOne(String filterName) throws Exception { URLFilter filter = null; - ExtensionPoint point = - PluginRepository.get(conf).getExtensionPoint(URLFilter.X_POINT_ID); + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + URLFilter.X_POINT_ID); if (point == null) - throw new RuntimeException(URLFilter.X_POINT_ID+" not found."); + throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); Extension[] extensions = point.getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; - filter = (URLFilter)extension.getExtensionInstance(); + filter = (URLFilter) extension.getExtensionInstance(); if (filter.getClass().getName().equals(filterName)) { break; } else { @@ -63,19 +63,19 @@ public class URLFilterChecker { } if (filter == null) - throw new RuntimeException("Filter "+filterName+" not found."); + throw new RuntimeException("Filter " + filterName + " not found."); // jerome : should we keep this behavior? - //if (LogFormatter.hasLoggedSevere()) - // throw new RuntimeException("Severe error encountered."); + // if (LogFormatter.hasLoggedSevere()) + // throw new RuntimeException("Severe error encountered."); - System.out.println("Checking URLFilter "+filterName); + System.out.println("Checking URLFilter " + filterName); BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while((line=in.readLine())!=null) { - String out=filter.filter(line); - if(out!=null) { + while ((line = in.readLine()) != null) { + String out = filter.filter(line); + if (out != null) { System.out.print("+"); System.out.println(out); } else { @@ -90,10 +90,10 @@ public class URLFilterChecker { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; - while((line=in.readLine())!=null) { + while ((line = in.readLine()) != null) { URLFilters filters = new URLFilters(this.conf); String out = filters.filter(line); - if(out!=null) { + if (out != null) { System.out.print("+"); System.out.println(out); } else { Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java Fri Jan 9 06:34:33 2015 @@ -28,7 +28,8 @@ import org.apache.nutch.plugin.PluginRep import org.apache.nutch.util.ObjectCache; import org.apache.hadoop.conf.Configuration; -/** Creates and caches {@link URLFilter} implementing plugins.*/ + +/** Creates and caches {@link URLFilter} implementing plugins. */ public class URLFilters { public static final String URLFILTER_ORDER = "urlfilter.order"; @@ -37,7 +38,8 @@ public class URLFilters { public URLFilters(Configuration conf) { String order = conf.get(URLFILTER_ORDER); ObjectCache objectCache = ObjectCache.get(conf); - this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class.getName()); + this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class + .getName()); if (this.filters == null) { String[] orderedFilters = null; @@ -60,8 +62,8 @@ public class URLFilters { } } if (orderedFilters == null) { - objectCache.setObject(URLFilter.class.getName(), filterMap.values().toArray( - new URLFilter[0])); + objectCache.setObject(URLFilter.class.getName(), filterMap.values() + .toArray(new URLFilter[0])); } else { ArrayList<URLFilter> filters = new ArrayList<URLFilter>(); for (int i = 0; i < orderedFilters.length; i++) { @@ -70,13 +72,14 @@ public class URLFilters { filters.add(filter); } } - objectCache.setObject(URLFilter.class.getName(), filters - .toArray(new URLFilter[filters.size()])); + objectCache.setObject(URLFilter.class.getName(), + filters.toArray(new URLFilter[filters.size()])); } } catch (PluginRuntimeException e) { throw new RuntimeException(e); } - this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class.getName()); + this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class + .getName()); } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizer.java Fri Jan 9 06:34:33 2015 @@ -21,13 +21,17 @@ import java.net.MalformedURLException; import org.apache.hadoop.conf.Configurable; -/** Interface used to convert URLs to normal form and optionally perform substitutions */ +/** + * Interface used to convert URLs to normal form and optionally perform + * substitutions + */ public interface URLNormalizer extends Configurable { - + /* Extension ID */ public static final String X_POINT_ID = URLNormalizer.class.getName(); - + /* Interface for URL normalization */ - public String normalize(String urlString, String scope) throws MalformedURLException; + public String normalize(String urlString, String scope) + throws MalformedURLException; } Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizerChecker.java Fri Jan 9 06:34:33 2015 @@ -36,23 +36,23 @@ public class URLNormalizerChecker { private Configuration conf; public URLNormalizerChecker(Configuration conf) { - this.conf = conf; + this.conf = conf; } private void checkOne(String normalizerName, String scope) throws Exception { URLNormalizer normalizer = null; - ExtensionPoint point = - PluginRepository.get(conf).getExtensionPoint(URLNormalizer.X_POINT_ID); + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.X_POINT_ID); if (point == null) - throw new RuntimeException(URLNormalizer.X_POINT_ID+" not found."); + throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found."); Extension[] extensions = point.getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; - normalizer = (URLNormalizer)extension.getExtensionInstance(); + normalizer = (URLNormalizer) extension.getExtensionInstance(); if (normalizer.getClass().getName().equals(normalizerName)) { break; } else { @@ -61,7 +61,8 @@ public class URLNormalizerChecker { } if (normalizer == null) - throw new RuntimeException("URLNormalizer "+normalizerName+" not found."); + throw new RuntimeException("URLNormalizer " + normalizerName + + " not found."); System.out.println("Checking URLNormalizer " + normalizerName); @@ -79,7 +80,7 @@ public class URLNormalizerChecker { BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; URLNormalizers normalizers = new URLNormalizers(conf, scope); - while((line = in.readLine()) != null) { + while ((line = in.readLine()) != null) { String out = normalizers.normalize(line, scope); System.out.println(out); } @@ -88,7 +89,7 @@ public class URLNormalizerChecker { public static void main(String[] args) throws Exception { String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]" - + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"; + + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"; String normalizerName = null; String scope = URLNormalizers.SCOPE_DEFAULT; @@ -103,7 +104,8 @@ public class URLNormalizerChecker { } } - URLNormalizerChecker checker = new URLNormalizerChecker(NutchConfiguration.create()); + URLNormalizerChecker checker = new URLNormalizerChecker( + NutchConfiguration.create()); if (normalizerName != null) { checker.checkOne(normalizerName, scope); } else { Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLNormalizers.java Fri Jan 9 06:34:33 2015 @@ -43,47 +43,63 @@ import org.apache.nutch.util.ObjectCache * contexts where they are used (note however that they need to be activated * first through <tt>plugin.include</tt> property). * - * <p>There is one global scope defined by default, which consists of all - * active normalizers. The order in which these normalizers - * are executed may be defined in "urlnormalizer.order" property, which lists - * space-separated implementation classes (if this property is missing normalizers - * will be run in random order). If there are more - * normalizers activated than explicitly named on this list, the remaining ones - * will be run in random order after the ones specified on the list are executed.</p> - * <p>You can define a set of contexts (or scopes) in which normalizers may be + * <p> + * There is one global scope defined by default, which consists of all active + * normalizers. The order in which these normalizers are executed may be defined + * in "urlnormalizer.order" property, which lists space-separated implementation + * classes (if this property is missing normalizers will be run in random + * order). If there are more normalizers activated than explicitly named on this + * list, the remaining ones will be run in random order after the ones specified + * on the list are executed. + * </p> + * <p> + * You can define a set of contexts (or scopes) in which normalizers may be * called. Each scope can have its own list of normalizers (defined in * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in * "urlnormalizer.order.<scope_name>" property). If any of these properties are - * missing, default settings are used for the global scope.</p> - * <p>In case no normalizers are required for any given scope, a - * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should be used.</p> - * <p>Each normalizer may further select among many configurations, depending on - * the scope in which it is called, because the scope name is passed as a parameter - * to each normalizer. You can also use the same normalizer for many scopes.</p> - * <p>Several scopes have been defined, and various Nutch tools will attempt using - * scope-specific normalizers first (and fall back to default config if scope-specific - * configuration is missing).</p> - * <p>Normalizers may be run several times, to ensure that modifications introduced + * missing, default settings are used for the global scope. + * </p> + * <p> + * In case no normalizers are required for any given scope, a + * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should + * be used. + * </p> + * <p> + * Each normalizer may further select among many configurations, depending on + * the scope in which it is called, because the scope name is passed as a + * parameter to each normalizer. You can also use the same normalizer for many + * scopes. + * </p> + * <p> + * Several scopes have been defined, and various Nutch tools will attempt using + * scope-specific normalizers first (and fall back to default config if + * scope-specific configuration is missing). + * </p> + * <p> + * Normalizers may be run several times, to ensure that modifications introduced * by normalizers at the end of the list can be further reduced by normalizers - * executed at the beginning. By default this loop is executed just once - if you want - * to ensure that all possible combinations have been applied you may want to run - * this loop up to the number of activated normalizers. This loop count can be configured - * through <tt>urlnormalizer.loop.count</tt> property. As soon as the url is - * unchanged the loop will stop and return the result.</p> + * executed at the beginning. By default this loop is executed just once - if + * you want to ensure that all possible combinations have been applied you may + * want to run this loop up to the number of activated normalizers. This loop + * count can be configured through <tt>urlnormalizer.loop.count</tt> property. + * As soon as the url is unchanged the loop will stop and return the result. + * </p> * * @author Andrzej Bialecki */ public final class URLNormalizers { - - /** Default scope. If no scope properties are defined then the configuration for - * this scope will be used. + + /** + * Default scope. If no scope properties are defined then the configuration + * for this scope will be used. */ public static final String SCOPE_DEFAULT = "default"; /** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */ public static final String SCOPE_PARTITION = "partition"; /** Scope used by {@link org.apache.nutch.crawl.GeneratorJob}. */ public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count"; - /** Scope used by {@link org.apache.nutch.fetcher.FetcherJob} when processing + /** + * Scope used by {@link org.apache.nutch.fetcher.FetcherJob} when processing * redirect URLs. */ public static final String SCOPE_FETCHER = "fetcher"; @@ -93,15 +109,18 @@ public final class URLNormalizers { public static final String SCOPE_LINKDB = "linkdb"; /** Scope used by {@link org.apache.nutch.crawl.InjectorJob}. */ public static final String SCOPE_INJECT = "inject"; - /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */ + /** + * Scope used when constructing new {@link org.apache.nutch.parse.Outlink} + * instances. + */ public static final String SCOPE_OUTLINK = "outlink"; - - public static final Logger LOG = LoggerFactory.getLogger(URLNormalizers.class); + public static final Logger LOG = LoggerFactory + .getLogger(URLNormalizers.class); /* Empty extension list for caching purposes. */ private final List<Extension> EMPTY_EXTENSION_LIST = Collections.emptyList(); - + private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0]; private Configuration conf; @@ -109,37 +128,39 @@ public final class URLNormalizers { private ExtensionPoint extensionPoint; private URLNormalizer[] normalizers; - + private int loopCount; public URLNormalizers(Configuration conf, String scope) { this.conf = conf; this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( - URLNormalizer.X_POINT_ID); + URLNormalizer.X_POINT_ID); ObjectCache objectCache = ObjectCache.get(conf); - + if (this.extensionPoint == null) { throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID - + " not found."); + + " not found."); } - normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + scope); + normalizers = (URLNormalizer[]) objectCache + .getObject(URLNormalizer.X_POINT_ID + "_" + scope); if (normalizers == null) { normalizers = getURLNormalizers(scope); } if (normalizers == EMPTY_NORMALIZERS) { - normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT); + normalizers = (URLNormalizer[]) objectCache + .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT); if (normalizers == null) { normalizers = getURLNormalizers(SCOPE_DEFAULT); } } - + loopCount = conf.getInt("urlnormalizer.loop.count", 1); } /** - * Function returns an array of {@link URLNormalizer}s for a given scope, - * with a specified order. + * Function returns an array of {@link URLNormalizer}s for a given scope, with + * a specified order. * * @param scope * The scope to return the <code>Array</code> of @@ -151,12 +172,13 @@ public final class URLNormalizers { URLNormalizer[] getURLNormalizers(String scope) { List<Extension> extensions = getExtensions(scope); ObjectCache objectCache = ObjectCache.get(conf); - + if (extensions == EMPTY_EXTENSION_LIST) { return EMPTY_NORMALIZERS; } - - List<URLNormalizer> normalizers = new Vector<URLNormalizer>(extensions.size()); + + List<URLNormalizer> normalizers = new Vector<URLNormalizer>( + extensions.size()); Iterator<Extension> it = extensions.iterator(); while (it.hasNext()) { @@ -174,14 +196,13 @@ public final class URLNormalizers { } catch (PluginRuntimeException e) { e.printStackTrace(); LOG.warn("URLNormalizers:PluginRuntimeException when " - + "initializing url normalizer plugin " - + ext.getDescriptor().getPluginId() - + " instance in getURLNormalizers " - + "function: attempting to continue instantiating plugins"); + + "initializing url normalizer plugin " + + ext.getDescriptor().getPluginId() + + " instance in getURLNormalizers " + + "function: attempting to continue instantiating plugins"); } } - return normalizers.toArray(new URLNormalizer[normalizers - .size()]); + return normalizers.toArray(new URLNormalizer[normalizers.size()]); } /** @@ -196,9 +217,8 @@ public final class URLNormalizers { @SuppressWarnings("unchecked") private List<Extension> getExtensions(String scope) { ObjectCache objectCache = ObjectCache.get(conf); - List<Extension> extensions = - (List<Extension>) objectCache.getObject(URLNormalizer.X_POINT_ID + "_x_" - + scope); + List<Extension> extensions = (List<Extension>) objectCache + .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope); // Just compare the reference: // if this is the empty list, we know we will find no extension. @@ -209,11 +229,13 @@ public final class URLNormalizers { if (extensions == null) { extensions = findExtensions(scope); if (extensions != null) { - objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, extensions); + objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, + extensions); } else { // Put the empty extension list into cache // to remember we don't know any related extension. - objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, EMPTY_EXTENSION_LIST); + objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, + EMPTY_EXTENSION_LIST); extensions = EMPTY_EXTENSION_LIST; } } @@ -233,7 +255,8 @@ public final class URLNormalizers { String[] orders = null; String orderlist = conf.get("urlnormalizer.order." + scope); - if (orderlist == null) orderlist = conf.get("urlnormalizer.order"); + if (orderlist == null) + orderlist = conf.get("urlnormalizer.order"); if (orderlist != null && !orderlist.trim().equals("")) { orders = orderlist.split("\\s+"); } @@ -271,13 +294,17 @@ public final class URLNormalizers { /** * Normalize - * @param urlString The URL string to normalize. - * @param scope The given scope. + * + * @param urlString + * The URL string to normalize. + * @param scope + * The given scope. * @return A normalized String, using the given <code>scope</code> - * @throws MalformedURLException If the given URL string is malformed. + * @throws MalformedURLException + * If the given URL string is malformed. */ public String normalize(String urlString, String scope) - throws MalformedURLException { + throws MalformedURLException { // optionally loop several times, and break if no further changes String initialString = urlString; for (int k = 0; k < loopCount; k++) { @@ -286,7 +313,8 @@ public final class URLNormalizers { return null; urlString = this.normalizers[i].normalize(urlString, scope); } - if (initialString.equals(urlString)) break; + if (initialString.equals(urlString)) + break; initialString = urlString; } return urlString; Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * and {@link org.apache.nutch.net.URLNormalizer normalizers}. */ package org.apache.nutch.net; + Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Fri Jan 9 06:34:33 2015 @@ -26,15 +26,15 @@ import java.text.ParseException; /** * class to handle HTTP dates. - * + * * Modified from FastHttpDateFormat.java in jakarta-tomcat. - * + * * @author John Xing */ public class HttpDateFormat { - protected static SimpleDateFormat format = - new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); + protected static SimpleDateFormat format = new SimpleDateFormat( + "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); /** * HTTP date uses TimeZone GMT @@ -43,29 +43,29 @@ public class HttpDateFormat { format.setTimeZone(TimeZone.getTimeZone("GMT")); } - //HttpDate (long t) { - //} + // HttpDate (long t) { + // } - //HttpDate (String s) { - //} + // HttpDate (String s) { + // } -// /** -// * Get the current date in HTTP format. -// */ -// public static String getCurrentDate() { -// -// long now = System.currentTimeMillis(); -// if ((now - currentDateGenerated) > 1000) { -// synchronized (format) { -// if ((now - currentDateGenerated) > 1000) { -// currentDateGenerated = now; -// currentDate = format.format(new Date(now)); -// } -// } -// } -// return currentDate; -// -// } + // /** + // * Get the current date in HTTP format. + // */ + // public static String getCurrentDate() { + // + // long now = System.currentTimeMillis(); + // if ((now - currentDateGenerated) > 1000) { + // synchronized (format) { + // if ((now - currentDateGenerated) > 1000) { + // currentDateGenerated = now; + // currentDate = format.format(new Date(now)); + // } + // } + // } + // return currentDate; + // + // } /** * Get the HTTP format of the specified date. Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/ProtocolException.java Fri Jan 9 06:34:33 2015 @@ -21,13 +21,13 @@ import java.io.Serializable; /** * Base exception for all protocol handlers + * * @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead. */ @Deprecated @SuppressWarnings("serial") public class ProtocolException extends Exception implements Serializable { - public ProtocolException() { super(); } Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/Response.java Fri Jan 9 06:34:33 2015 @@ -23,12 +23,11 @@ import java.net.URL; import org.apache.nutch.metadata.HttpHeaders; import org.apache.nutch.metadata.Metadata; - /** - * A response interface. Makes all protocols model HTTP. + * A response interface. Makes all protocols model HTTP. */ public interface Response extends HttpHeaders { - + /** Returns the URL used to retrieve this response. */ public URL getUrl(); @@ -40,7 +39,7 @@ public interface Response extends HttpHe /** Returns all the headers. */ public Metadata getHeaders(); - + /** Returns the full content of the response. */ public byte[] getContent(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/package-info.java Fri Jan 9 06:34:33 2015 @@ -20,3 +20,4 @@ * interface, sea also {@link org.apache.nutch.protocol}. */ package org.apache.nutch.net.protocols; + Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/HTMLMetaTags.java Fri Jan 9 06:34:33 2015 @@ -24,8 +24,8 @@ import java.util.Properties; import org.apache.nutch.metadata.Metadata; /** - * This class holds the information about HTML "meta" tags extracted from - * a page. Some special tags have convenience methods for easy checking. + * This class holds the information about HTML "meta" tags extracted from a + * page. Some special tags have convenience methods for easy checking. */ public class HTMLMetaTags { private boolean noIndex = false; @@ -156,8 +156,8 @@ public class HTMLMetaTags { } /** - * A convenience method. Returns the current value of <code>refreshTime</code>. - * The value may be invalid if {@link #getRefresh()}returns + * A convenience method. Returns the current value of <code>refreshTime</code> + * . The value may be invalid if {@link #getRefresh()}returns * <code>false</code>. */ public int getRefreshTime() { @@ -179,16 +179,12 @@ public class HTMLMetaTags { public Properties getHttpEquivTags() { return httpEquivTags; } - + public String toString() { StringBuffer sb = new StringBuffer(); - sb.append("base=" + baseHref - + ", noCache=" + noCache - + ", noFollow=" + noFollow - + ", noIndex=" + noIndex - + ", refresh=" + refresh - + ", refreshHref=" + refreshHref + "\n" - ); + sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow=" + + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh + + ", refreshHref=" + refreshHref + "\n"); sb.append(" * general tags:\n"); String[] names = generalTags.names(); for (String name : names) { @@ -198,7 +194,7 @@ public class HTMLMetaTags { sb.append(" * http-equiv tags:\n"); Iterator<Object> it = httpEquivTags.keySet().iterator(); while (it.hasNext()) { - String key = (String)it.next(); + String key = (String) it.next(); sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n"); } return sb.toString(); Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Outlink.java Fri Jan 9 06:34:33 2015 @@ -28,11 +28,13 @@ public class Outlink implements Writable private String toUrl; private String anchor; - public Outlink() {} + public Outlink() { + } public Outlink(String toUrl, String anchor) throws MalformedURLException { this.toUrl = toUrl; - if (anchor == null) anchor = ""; + if (anchor == null) + anchor = ""; this.anchor = anchor; } @@ -43,8 +45,8 @@ public class Outlink implements Writable /** Skips over one Outlink in the input. */ public static void skip(DataInput in) throws IOException { - Text.skip(in); // skip toUrl - Text.skip(in); // skip anchor + Text.skip(in); // skip toUrl + Text.skip(in); // skip anchor } public void write(DataOutput out) throws IOException { @@ -58,21 +60,24 @@ public class Outlink implements Writable return outlink; } - public String getToUrl() { return toUrl; } - public String getAnchor() { return anchor; } + public String getToUrl() { + return toUrl; + } + public String getAnchor() { + return anchor; + } public boolean equals(Object o) { if (!(o instanceof Outlink)) return false; - Outlink other = (Outlink)o; - return - this.toUrl.equals(other.toUrl) && - this.anchor.equals(other.anchor); + Outlink other = (Outlink) o; + return this.toUrl.equals(other.toUrl) && this.anchor.equals(other.anchor); } public String toString() { - return "toUrl: " + toUrl + " anchor: " + anchor; // removed "\n". toString, not printLine... WD. + return "toUrl: " + toUrl + " anchor: " + anchor; // removed "\n". toString, + // not printLine... WD. } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/OutlinkExtractor.java Fri Jan 9 06:34:33 2015 @@ -34,8 +34,8 @@ import org.apache.oro.text.regex.Perl5Co import org.apache.oro.text.regex.Perl5Matcher; /** - * Extractor to extract {@link org.apache.nutch.parse.Outlink}s - * / URLs from plain text using Regular Expressions. + * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from + * plain text using Regular Expressions. * * @see <a * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison @@ -48,23 +48,26 @@ import org.apache.oro.text.regex.Perl5Ma * @since 0.7 */ public class OutlinkExtractor { - private static final Logger LOG = LoggerFactory.getLogger(OutlinkExtractor.class); + private static final Logger LOG = LoggerFactory + .getLogger(OutlinkExtractor.class); /** * Regex pattern to get URLs within a plain text. * * @see <a * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html + * </a> */ - private static final String URL_PATTERN = - "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; + private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; /** - * Extracts <code>Outlink</code> from given plain text. - * Applying this method to non-plain-text can result in extremely lengthy - * runtimes for parasitic cases (postscript is a known example). - * @param plainText the plain text from wich URLs should be extracted. + * Extracts <code>Outlink</code> from given plain text. Applying this method + * to non-plain-text can result in extremely lengthy runtimes for parasitic + * cases (postscript is a known example). + * + * @param plainText + * the plain text from wich URLs should be extracted. * * @return Array of <code>Outlink</code>s within found in plainText */ @@ -73,15 +76,18 @@ public class OutlinkExtractor { } /** - * Extracts <code>Outlink</code> from given plain text and adds anchor - * to the extracted <code>Outlink</code>s + * Extracts <code>Outlink</code> from given plain text and adds anchor to the + * extracted <code>Outlink</code>s * - * @param plainText the plain text from wich URLs should be extracted. - * @param anchor the anchor of the url + * @param plainText + * the plain text from wich URLs should be extracted. + * @param anchor + * the anchor of the url * * @return Array of <code>Outlink</code>s within found in plainText */ - public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) { + public static Outlink[] getOutlinks(final String plainText, String anchor, + Configuration conf) { long start = System.currentTimeMillis(); final List<Outlink> outlinks = new ArrayList<Outlink>(); @@ -97,11 +103,11 @@ public class OutlinkExtractor { MatchResult result; String url; - //loop the matches + // loop the matches while (matcher.contains(input, pattern)) { // if this is taking too long, stop matching - // (SHOULD really check cpu time used so that heavily loaded systems - // do not unnecessarily hit this limit.) + // (SHOULD really check cpu time used so that heavily loaded systems + // do not unnecessarily hit this limit.) if (System.currentTimeMillis() - start >= 60000L) { if (LOG.isWarnEnabled()) { LOG.warn("Time limit exceeded for getOutLinks"); @@ -117,13 +123,16 @@ public class OutlinkExtractor { } } } catch (Exception ex) { - // if the matcher fails (perhaps a malformed URL) we just log it and move on - if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); } + // if the matcher fails (perhaps a malformed URL) we just log it and move + // on + if (LOG.isErrorEnabled()) { + LOG.error("getOutlinks", ex); + } } final Outlink[] retval; - //create array of the Outlinks + // create array of the Outlinks if (outlinks != null && outlinks.size() > 0) { retval = outlinks.toArray(new Outlink[0]); } else { @@ -132,7 +141,6 @@ public class OutlinkExtractor { return retval; } - /** * Extracts outlinks from a plain text. <br /> @@ -162,7 +170,7 @@ public class OutlinkExtractor { // url = re.getParen(0); // // if (LOG.isTraceEnabled()) { - // LOG.trace("Extracted url: " + url); + // LOG.trace("Extracted url: " + url); // } // // try { @@ -192,9 +200,8 @@ public class OutlinkExtractor { } /** - * Extracts outlinks from a plain text. - * </p> - * This Method takes the JDK5 Regexp API. + * Extracts outlinks from a plain text. </p> This Method takes the JDK5 Regexp + * API. * * @param plainText * @@ -243,5 +250,5 @@ public class OutlinkExtractor { // // return retval; } - + } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/Parse.java Fri Jan 9 06:34:33 2015 @@ -16,7 +16,6 @@ ******************************************************************************/ package org.apache.nutch.parse; - public class Parse { private String text; Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseCallable.java Fri Jan 9 06:34:33 2015 @@ -24,7 +24,7 @@ class ParseCallable implements Callable< private Parser p; private WebPage content; private String url; - + public ParseCallable(Parser p, WebPage content, String url) { this.p = p; this.content = content; @@ -34,5 +34,5 @@ class ParseCallable implements Callable< @Override public Parse call() throws Exception { return p.getParse(url, content); - } + } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilter.java Fri Jan 9 06:34:33 2015 @@ -22,18 +22,19 @@ import org.apache.nutch.plugin.FieldPlug import org.apache.nutch.storage.WebPage; import org.w3c.dom.DocumentFragment; - -/** Extension point for DOM-based parsers. Permits one to add additional - * metadata to parses provided by the html or tika plugins. All plugins found which implement this extension - * point are run sequentially on the parse. +/** + * Extension point for DOM-based parsers. Permits one to add additional metadata + * to parses provided by the html or tika plugins. All plugins found which + * implement this extension point are run sequentially on the parse. */ public interface ParseFilter extends FieldPluggable, Configurable { /** The name of the extension point. */ final static String X_POINT_ID = ParseFilter.class.getName(); - /** Adds metadata or otherwise modifies a parse, given - * the DOM tree of a page. */ - Parse filter(String url, WebPage page, Parse parse, - HTMLMetaTags metaTags, DocumentFragment doc); + /** + * Adds metadata or otherwise modifies a parse, given the DOM tree of a page. + */ + Parse filter(String url, WebPage page, Parse parse, HTMLMetaTags metaTags, + DocumentFragment doc); } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseFilters.java Fri Jan 9 06:34:33 2015 @@ -31,7 +31,7 @@ import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.ObjectCache; import org.w3c.dom.DocumentFragment; -/** Creates and caches {@link ParseFilter} implementing plugins.*/ +/** Creates and caches {@link ParseFilter} implementing plugins. */ public class ParseFilters { private ParseFilter[] parseFilters; @@ -41,7 +41,8 @@ public class ParseFilters { public ParseFilters(Configuration conf) { String order = conf.get(HTMLPARSEFILTER_ORDER); ObjectCache objectCache = ObjectCache.get(conf); - this.parseFilters = (ParseFilter[]) objectCache.getObject(ParseFilter.class.getName()); + this.parseFilters = (ParseFilter[]) objectCache.getObject(ParseFilter.class + .getName()); if (parseFilters == null) { /* * If ordered filters are required, prepare array of filters based on @@ -51,21 +52,23 @@ public class ParseFilters { if (order != null && !order.trim().equals("")) { orderedFilters = order.split("\\s+"); } - HashMap<String, ParseFilter> filterMap = - new HashMap<String, ParseFilter>(); + HashMap<String, ParseFilter> filterMap = new HashMap<String, ParseFilter>(); try { - ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(ParseFilter.X_POINT_ID); + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + ParseFilter.X_POINT_ID); if (point == null) throw new RuntimeException(ParseFilter.X_POINT_ID + " not found."); Extension[] extensions = point.getExtensions(); for (int i = 0; i < extensions.length; i++) { Extension extension = extensions[i]; - ParseFilter parseFilter = (ParseFilter) extension.getExtensionInstance(); + ParseFilter parseFilter = (ParseFilter) extension + .getExtensionInstance(); if (!filterMap.containsKey(parseFilter.getClass().getName())) { filterMap.put(parseFilter.getClass().getName(), parseFilter); } } - ParseFilter[] htmlParseFilters = filterMap.values().toArray(new ParseFilter[filterMap.size()]); + ParseFilter[] htmlParseFilters = filterMap.values().toArray( + new ParseFilter[filterMap.size()]); /* * If no ordered filters required, just get the filters in an * indeterminate order @@ -77,19 +80,19 @@ public class ParseFilters { else { ArrayList<ParseFilter> filters = new ArrayList<ParseFilter>(); for (int i = 0; i < orderedFilters.length; i++) { - ParseFilter filter = filterMap - .get(orderedFilters[i]); + ParseFilter filter = filterMap.get(orderedFilters[i]); if (filter != null) { filters.add(filter); } } - objectCache.setObject(ParseFilter.class.getName(), filters - .toArray(new ParseFilter[filters.size()])); + objectCache.setObject(ParseFilter.class.getName(), + filters.toArray(new ParseFilter[filters.size()])); } } catch (PluginRuntimeException e) { throw new RuntimeException(e); } - this.parseFilters = (ParseFilter[]) objectCache.getObject(ParseFilter.class.getName()); + this.parseFilters = (ParseFilter[]) objectCache + .getObject(ParseFilter.class.getName()); } } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginList.java Fri Jan 9 06:34:33 2015 @@ -22,25 +22,23 @@ import java.util.HashMap; import java.util.List; import java.util.Map; - /** * This class represents a natural ordering for which parsing plugin should get * called for a particular mimeType. It provides methods to store the * parse-plugins.xml data, and methods to retreive the name of the appropriate * parsing plugin for a contentType. - * + * * @author mattmann * @version 1.0 */ public class ParsePluginList { - + /* a map to link mimeType to an ordered list of parsing plugins */ private Map<String, List<String>> fMimeTypeToPluginMap = null; - + /* A list of aliases */ private Map<String, String> aliases = null; - - + /** * Constructs a new ParsePluginList */ @@ -48,7 +46,7 @@ public class ParsePluginList { fMimeTypeToPluginMap = new HashMap<String, List<String>>(); aliases = new HashMap<String, String>(); } - + public List<String> getPluginList(String mimeType) { return fMimeTypeToPluginMap.get(mimeType); } @@ -56,18 +54,18 @@ public class ParsePluginList { void setAliases(Map<String, String> aliases) { this.aliases = aliases; } - + public Map<String, String> getAliases() { return aliases; } - + void setPluginList(String mimeType, List<String> l) { fMimeTypeToPluginMap.put(mimeType, l); } - + List<String> getSupportedMimeTypes() { - return Arrays.asList(fMimeTypeToPluginMap.keySet().toArray( - new String[] {})); + return Arrays + .asList(fMimeTypeToPluginMap.keySet().toArray(new String[] {})); } - + } Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java?rev=1650447&r1=1650446&r2=1650447&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParsePluginsReader.java Fri Jan 9 06:34:33 2015 @@ -42,50 +42,50 @@ import org.apache.hadoop.conf.Configurat // Nutch imports import org.apache.nutch.util.NutchConfiguration; - /** * A reader to load the information stored in the * <code>$NUTCH_HOME/conf/parse-plugins.xml</code> file. - * + * * @author mattmann * @version 1.0 */ public class ParsePluginsReader { - + /* our log stream */ - public static final Logger LOG = LoggerFactory.getLogger(ParsePluginsReader.class); - + public static final Logger LOG = LoggerFactory + .getLogger(ParsePluginsReader.class); + /** The property name of the parse-plugins location */ private static final String PP_FILE_PROP = "parse.plugin.file"; /** the parse-plugins file */ private String fParsePluginsFile = null; - /** * Constructs a new ParsePluginsReader */ - public ParsePluginsReader() { } - + public ParsePluginsReader() { + } + /** * Reads the <code>parse-plugins.xml</code> file and returns the * {@link #ParsePluginList} defined by it. - * + * * @return A {@link #ParsePluginList} specified by the * <code>parse-plugins.xml</code> file. * @throws Exception - * If any parsing error occurs. + * If any parsing error occurs. */ public ParsePluginList parse(Configuration conf) { - + ParsePluginList pList = new ParsePluginList(); - + // open up the XML file DocumentBuilderFactory factory = null; DocumentBuilder parser = null; Document document = null; InputSource inputSource = null; - + InputStream ppInputStream = null; if (fParsePluginsFile != null) { URL parsePluginUrl = null; @@ -94,56 +94,55 @@ public class ParsePluginsReader { ppInputStream = parsePluginUrl.openStream(); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("Unable to load parse plugins file from URL " + - "[" + fParsePluginsFile + "]. Reason is [" + e + "]"); + LOG.warn("Unable to load parse plugins file from URL " + "[" + + fParsePluginsFile + "]. Reason is [" + e + "]"); } return pList; } } else { - ppInputStream = conf.getConfResourceAsInputStream( - conf.get(PP_FILE_PROP)); + ppInputStream = conf.getConfResourceAsInputStream(conf.get(PP_FILE_PROP)); } - + inputSource = new InputSource(ppInputStream); - + try { factory = DocumentBuilderFactory.newInstance(); parser = factory.newDocumentBuilder(); document = parser.parse(inputSource); } catch (Exception e) { if (LOG.isWarnEnabled()) { - LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + - "Reason is [" + e + "]"); + LOG.warn("Unable to parse [" + fParsePluginsFile + "]." + "Reason is [" + + e + "]"); } return null; } - + Element parsePlugins = document.getDocumentElement(); - + // build up the alias hash map Map<String, String> aliases = getAliases(parsePlugins); // And store it on the parse plugin list pList.setAliases(aliases); - + // get all the mime type nodes NodeList mimeTypes = parsePlugins.getElementsByTagName("mimeType"); - + // iterate through the mime types for (int i = 0; i < mimeTypes.getLength(); i++) { Element mimeType = (Element) mimeTypes.item(i); String mimeTypeStr = mimeType.getAttribute("name"); - + // for each mimeType, get the plugin list NodeList pluginList = mimeType.getElementsByTagName("plugin"); - + // iterate through the plugins, add them in order read // OR if they have a special order="" attribute, then hold those in // a separate list, and then insert them into the final list at the // order specified if (pluginList != null && pluginList.getLength() > 0) { List<String> plugList = new ArrayList<String>(pluginList.getLength()); - - for (int j = 0; j<pluginList.getLength(); j++) { + + for (int j = 0; j < pluginList.getLength(); j++) { Element plugin = (Element) pluginList.item(j); String pluginId = plugin.getAttribute("id"); String extId = aliases.get(pluginId); @@ -163,110 +162,110 @@ public class ParsePluginsReader { plugList.add(extId); } } - + // now add the plugin list and map it to this mimeType pList.setPluginList(mimeTypeStr, plugList); - + } else if (LOG.isWarnEnabled()) { LOG.warn("ParsePluginsReader:ERROR:no plugins defined for mime type: " - + mimeTypeStr + ", continuing parse"); + + mimeTypeStr + ", continuing parse"); } } return pList; } - + /** * Tests parsing of the parse-plugins.xml file. An alternative name for the - * file can be specified via the <code>--file</code> option, although the - * file must be located in the <code>$NUTCH_HOME/conf</code> directory. - * + * file can be specified via the <code>--file</code> option, although the file + * must be located in the <code>$NUTCH_HOME/conf</code> directory. + * * @param args - * Currently only the --file argument to specify an alternative - * name for the parse-plugins.xml file is supported. + * Currently only the --file argument to specify an alternative name + * for the parse-plugins.xml file is supported. */ public static void main(String[] args) throws Exception { String parsePluginFile = null; String usage = "ParsePluginsReader [--file <parse plugin file location>]"; - - if (( args.length != 0 && args.length != 2 ) + + if ((args.length != 0 && args.length != 2) || (args.length == 2 && !"--file".equals(args[0]))) { System.err.println(usage); System.exit(1); } - + for (int i = 0; i < args.length; i++) { if (args[i].equals("--file")) { parsePluginFile = args[++i]; } } - + ParsePluginsReader reader = new ParsePluginsReader(); - + if (parsePluginFile != null) { reader.setFParsePluginsFile(parsePluginFile); } - + ParsePluginList prefs = reader.parse(NutchConfiguration.create()); - + for (String mimeType : prefs.getSupportedMimeTypes()) { - + System.out.println("MIMETYPE: " + mimeType); List<String> plugList = prefs.getPluginList(mimeType); - + System.out.println("EXTENSION IDs:"); - + for (String j : plugList) { System.out.println(j); } } - + } - + /** * @return Returns the fParsePluginsFile. */ public String getFParsePluginsFile() { return fParsePluginsFile; } - + /** * @param parsePluginsFile - * The fParsePluginsFile to set. + * The fParsePluginsFile to set. */ public void setFParsePluginsFile(String parsePluginsFile) { fParsePluginsFile = parsePluginsFile; } - + private Map<String, String> getAliases(Element parsePluginsRoot) { Map<String, String> aliases = new HashMap<String, String>(); NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases"); - + if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) { if (LOG.isWarnEnabled()) { LOG.warn("No aliases defined in parse-plugins.xml!"); } return aliases; } - + if (aliasRoot.getLength() > 1) { // log a warning, but try and continue processing if (LOG.isWarnEnabled()) { LOG.warn("There should only be one \"aliases\" tag in parse-plugins.xml"); } } - - Element aliasRootElem = (Element)aliasRoot.item(0); + + Element aliasRootElem = (Element) aliasRoot.item(0); NodeList aliasElements = aliasRootElem.getElementsByTagName("alias"); - + if (aliasElements != null && aliasElements.getLength() > 0) { - for (int i=0; i<aliasElements.getLength(); i++) { - Element aliasElem = (Element)aliasElements.item(i); - String parsePluginId = aliasElem.getAttribute("name"); - String extensionId = aliasElem.getAttribute("extension-id"); + for (int i = 0; i < aliasElements.getLength(); i++) { + Element aliasElem = (Element) aliasElements.item(i); + String parsePluginId = aliasElem.getAttribute("name"); + String extensionId = aliasElem.getAttribute("extension-id"); if (LOG.isTraceEnabled()) { - LOG.trace("Found alias: plugin-id: " + parsePluginId + - ", extension-id: " + extensionId); + LOG.trace("Found alias: plugin-id: " + parsePluginId + + ", extension-id: " + extensionId); } if (parsePluginId != null && extensionId != null) { aliases.put(parsePluginId, extensionId); @@ -275,5 +274,5 @@ public class ParsePluginsReader { } return aliases; } - + }