Author: jnioche
Date: Mon Oct 7 10:08:43 2013
New Revision: 1529813
URL: http://svn.apache.org/r1529813
Log: NUTCH-1562
Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Mon Oct 7 10:08:43 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1562 Order of execution for scoring filters (jnioche, snagel) + * NUTCH-1640 Reuse ParseUtil instance in ParseSegment (Mitesh Singh Jat via jnioche) * NUTCH-1639 bin/crawl fails on mac os (various contributors via snagel) Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Mon Oct 7 10:08:43 2013 @@ -1172,11 +1172,10 @@ <property> <name>scoring.filter.order</name> <value></value> - <description>The order in which scoring filters are applied. - This may be left empty (in which case all available scoring - filters will be applied in the order defined in plugin-includes - and plugin-excludes), or a space separated list of implementation - classes. + <description>The order in which scoring filters are applied. This + may be left empty (in which case all available scoring filters will + be applied in system defined order), or a space separated list of + implementation classes. 
</description> </property> Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java (original) +++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFilters.java Mon Oct 7 10:08:43 2013 @@ -17,16 +17,12 @@ package org.apache.nutch.indexer; -import java.util.ArrayList; -import java.util.HashMap; - // Commons Logging imports import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.nutch.plugin.*; +import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.parse.Parse; -import org.apache.nutch.util.ObjectCache; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; @@ -42,65 +38,10 @@ public class IndexingFilters { private IndexingFilter[] indexingFilters; public IndexingFilters(Configuration conf) { - /* Get indexingfilter.order property */ - String order = conf.get(INDEXINGFILTER_ORDER); - ObjectCache objectCache = ObjectCache.get(conf); - this.indexingFilters = (IndexingFilter[]) objectCache - .getObject(IndexingFilter.class.getName()); - if (this.indexingFilters == null) { - /* - * If ordered filters are required, prepare array of filters based on - * property - */ - String[] orderedFilters = null; - if (order != null && !order.trim().equals("")) { - orderedFilters = order.trim().split("\\s+"); - } - try { - ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( - IndexingFilter.X_POINT_ID); - if (point == null) - throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found."); - Extension[] extensions = point.getExtensions(); - HashMap<String, IndexingFilter> filterMap = - new HashMap<String, IndexingFilter>(); - for (int i = 0; 
i < extensions.length; i++) { - Extension extension = extensions[i]; - IndexingFilter filter = (IndexingFilter) extension - .getExtensionInstance(); - LOG.info("Adding " + filter.getClass().getName()); - if (!filterMap.containsKey(filter.getClass().getName())) { - filterMap.put(filter.getClass().getName(), filter); - } - } - /* - * If no ordered filters required, just get the filters in an - * indeterminate order - */ - if (orderedFilters == null) { - objectCache.setObject(IndexingFilter.class.getName(), - filterMap.values().toArray( - new IndexingFilter[0])); - /* Otherwise run the filters in the required order */ - } else { - ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>(); - for (int i = 0; i < orderedFilters.length; i++) { - IndexingFilter filter = filterMap - .get(orderedFilters[i]); - if (filter != null) { - filters.add(filter); - } - } - objectCache.setObject(IndexingFilter.class.getName(), filters - .toArray(new IndexingFilter[filters.size()])); - } - } catch (PluginRuntimeException e) { - throw new RuntimeException(e); - } - this.indexingFilters = (IndexingFilter[]) objectCache - .getObject(IndexingFilter.class.getName()); - } - } + indexingFilters = (IndexingFilter[]) PluginRepository.get(conf) + .getOrderedPlugins(IndexingFilter.class, IndexingFilter.X_POINT_ID, + INDEXINGFILTER_ORDER); + } /** Run all defined filters. 
*/ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original) +++ nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Mon Oct 7 10:08:43 2013 @@ -17,17 +17,9 @@ package org.apache.nutch.net; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -import org.apache.nutch.plugin.Extension; -import org.apache.nutch.plugin.ExtensionPoint; -import org.apache.nutch.plugin.PluginRuntimeException; +import org.apache.hadoop.conf.Configuration; import org.apache.nutch.plugin.PluginRepository; -import org.apache.nutch.util.ObjectCache; -import org.apache.hadoop.conf.Configuration; /** Creates and caches {@link URLFilter} implementing plugins.*/ public class URLFilters { @@ -35,49 +27,9 @@ public class URLFilters { private URLFilter[] filters; public URLFilters(Configuration conf) { - String order = conf.get(URLFILTER_ORDER); - ObjectCache objectCache = ObjectCache.get(conf); - this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class.getName()); - - if (this.filters == null) { - String[] orderedFilters = null; - if (order != null && !order.trim().equals("")) { - orderedFilters = order.trim().split("\\s+"); - } - - try { - ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( - URLFilter.X_POINT_ID); - if (point == null) - throw new RuntimeException(URLFilter.X_POINT_ID + " not found."); - Extension[] extensions = point.getExtensions(); - Map<String, URLFilter> filterMap = new HashMap<String, URLFilter>(); - for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - URLFilter filter = (URLFilter) 
extension.getExtensionInstance(); - if (!filterMap.containsKey(filter.getClass().getName())) { - filterMap.put(filter.getClass().getName(), filter); - } - } - if (orderedFilters == null) { - objectCache.setObject(URLFilter.class.getName(), filterMap.values().toArray( - new URLFilter[0])); - } else { - ArrayList<URLFilter> filters = new ArrayList<URLFilter>(); - for (int i = 0; i < orderedFilters.length; i++) { - URLFilter filter = filterMap.get(orderedFilters[i]); - if (filter != null) { - filters.add(filter); - } - } - objectCache.setObject(URLFilter.class.getName(), filters - .toArray(new URLFilter[filters.size()])); - } - } catch (PluginRuntimeException e) { - throw new RuntimeException(e); - } - this.filters = (URLFilter[]) objectCache.getObject(URLFilter.class.getName()); - } + this.filters = (URLFilter[]) PluginRepository.get(conf) + .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID, + URLFILTER_ORDER); } /** Run all defined filters. Assume logical AND. */ Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Mon Oct 7 10:08:43 2013 @@ -17,12 +17,8 @@ package org.apache.nutch.parse; -import java.util.ArrayList; -import java.util.HashMap; - import org.apache.nutch.protocol.Content; -import org.apache.nutch.plugin.*; -import org.apache.nutch.util.ObjectCache; +import org.apache.nutch.plugin.PluginRepository; import org.apache.hadoop.conf.Configuration; import org.w3c.dom.DocumentFragment; @@ -35,59 +31,10 @@ public class HtmlParseFilters { public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order"; public HtmlParseFilters(Configuration conf) { - 
String order = conf.get(HTMLPARSEFILTER_ORDER); - ObjectCache objectCache = ObjectCache.get(conf); - this.htmlParseFilters = (HtmlParseFilter[]) objectCache.getObject(HtmlParseFilter.class.getName()); - if (htmlParseFilters == null) { - /* - * If ordered filters are required, prepare array of filters based on - * property - */ - String[] orderedFilters = null; - if (order != null && !order.trim().equals("")) { - orderedFilters = order.trim().split("\\s+"); - } - HashMap<String, HtmlParseFilter> filterMap = - new HashMap<String, HtmlParseFilter>(); - try { - ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(HtmlParseFilter.X_POINT_ID); - if (point == null) - throw new RuntimeException(HtmlParseFilter.X_POINT_ID + " not found."); - Extension[] extensions = point.getExtensions(); - for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - HtmlParseFilter parseFilter = (HtmlParseFilter) extension.getExtensionInstance(); - if (!filterMap.containsKey(parseFilter.getClass().getName())) { - filterMap.put(parseFilter.getClass().getName(), parseFilter); - } - } - HtmlParseFilter[] htmlParseFilters = filterMap.values().toArray(new HtmlParseFilter[filterMap.size()]); - /* - * If no ordered filters required, just get the filters in an - * indeterminate order - */ - if (orderedFilters == null) { - objectCache.setObject(HtmlParseFilter.class.getName(), htmlParseFilters); - } - /* Otherwise run the filters in the required order */ - else { - ArrayList<HtmlParseFilter> filters = new ArrayList<HtmlParseFilter>(); - for (int i = 0; i < orderedFilters.length; i++) { - HtmlParseFilter filter = filterMap - .get(orderedFilters[i]); - if (filter != null) { - filters.add(filter); - } - } - objectCache.setObject(HtmlParseFilter.class.getName(), filters - .toArray(new HtmlParseFilter[filters.size()])); - } - } catch (PluginRuntimeException e) { - throw new RuntimeException(e); - } - this.htmlParseFilters = (HtmlParseFilter[]) 
objectCache.getObject(HtmlParseFilter.class.getName()); - } - } + htmlParseFilters = (HtmlParseFilter[]) PluginRepository.get(conf) + .getOrderedPlugins(HtmlParseFilter.class, HtmlParseFilter.X_POINT_ID, + HTMLPARSEFILTER_ORDER); + } /** Run all defined filters. */ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) { Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java (original) +++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Mon Oct 7 10:08:43 2013 @@ -16,10 +16,12 @@ */ package org.apache.nutch.plugin; +import java.lang.reflect.Array; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.WeakHashMap; import java.util.List; @@ -29,6 +31,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.ObjectCache; /** * The plugin repositority is a registry of all plugins. @@ -371,6 +374,81 @@ public class PluginRepository { } return map; } + + /** + * Get ordered list of plugins. Filter and normalization plugins are applied + * in a configurable "pipeline" order, e.g., if one plugin depends on the + * output of another plugin. This method loads the plugins in the order + * defined by orderProperty. If orderProperty is empty or unset, all active + * plugins of the given interface and extension point are loaded. 
+ * + * @param clazz + * interface class implemented by required plugins + * @param xPointId + * extension point id of required plugins + * @param orderProperty + * property name defining plugin order + * @return array of plugin instances + */ + public synchronized Object[] getOrderedPlugins(Class<?> clazz, String xPointId, + String orderProperty) { + Object[] filters; + ObjectCache objectCache = ObjectCache.get(conf); + filters = (Object[]) objectCache.getObject(clazz.getName()); + + if (filters == null) { + String order = conf.get(orderProperty); + List<String> orderOfFilters = new ArrayList<String>(); + boolean userDefinedOrder = false; + if (order != null && !order.trim().isEmpty()) { + orderOfFilters = Arrays.asList(order.trim().split("\\s+")); + userDefinedOrder = true; + } + + try { + ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint( + xPointId); + if (point == null) + throw new RuntimeException(xPointId + " not found."); + Extension[] extensions = point.getExtensions(); + HashMap<String, Object> filterMap = new HashMap<String, Object>(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + Object filter = extension.getExtensionInstance(); + if (!filterMap.containsKey(filter.getClass().getName())) { + filterMap.put(filter.getClass().getName(), filter); + if (!userDefinedOrder) + orderOfFilters.add(filter.getClass().getName()); + } + } + List<Object> sorted = new ArrayList<Object>(); + for (String orderedFilter : orderOfFilters) { + Object f = filterMap.get(orderedFilter); + if (f == null) { + LOG.error(clazz.getSimpleName() + " : " + orderedFilter + + " declared in configuration property " + orderProperty + + " but not found in an active plugin - ignoring."); + continue; + } + sorted.add(f); + } + Object[] filter = (Object[]) Array.newInstance(clazz, sorted.size()); + for (int i = 0; i < sorted.size(); i++) { + filter[i] = sorted.get(i); + if (LOG.isTraceEnabled()) { + LOG.trace(clazz.getSimpleName() + " 
: filters[" + i + "] = " + + filter[i].getClass()); + } + } + objectCache.setObject(clazz.getName(), filter); + } catch (PluginRuntimeException e) { + throw new RuntimeException(e); + } + + filters = (Object[]) objectCache.getObject(clazz.getName()); + } + return filters; + } /** * Loads all necessary dependencies for a selected plugin, and then runs one Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?rev=1529813&r1=1529812&r2=1529813&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Mon Oct 7 10:08:43 2013 @@ -18,25 +18,19 @@ package org.apache.nutch.scoring; import java.util.Collection; -import java.util.HashMap; import java.util.List; import java.util.Map.Entry; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.io.Text; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Inlinks; import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.parse.ParseData; -import org.apache.nutch.plugin.Extension; -import org.apache.nutch.plugin.ExtensionPoint; -import org.apache.nutch.plugin.PluginRuntimeException; import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.protocol.Content; -import org.apache.nutch.util.ObjectCache; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.io.Text; /** * Creates and caches {@link ScoringFilter} implementing plugins. 
@@ -49,43 +43,9 @@ public class ScoringFilters extends Conf public ScoringFilters(Configuration conf) { super(conf); - ObjectCache objectCache = ObjectCache.get(conf); - String order = conf.get("scoring.filter.order"); - this.filters = (ScoringFilter[]) objectCache.getObject(ScoringFilter.class.getName()); - - if (this.filters == null) { - String[] orderedFilters = null; - if (order != null && !order.trim().equals("")) { - orderedFilters = order.trim().split("\\s+"); - } - - try { - ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(ScoringFilter.X_POINT_ID); - if (point == null) throw new RuntimeException(ScoringFilter.X_POINT_ID + " not found."); - Extension[] extensions = point.getExtensions(); - HashMap<String, ScoringFilter> filterMap = - new HashMap<String, ScoringFilter>(); - for (int i = 0; i < extensions.length; i++) { - Extension extension = extensions[i]; - ScoringFilter filter = (ScoringFilter) extension.getExtensionInstance(); - if (!filterMap.containsKey(filter.getClass().getName())) { - filterMap.put(filter.getClass().getName(), filter); - } - } - if (orderedFilters == null) { - objectCache.setObject(ScoringFilter.class.getName(), filterMap.values().toArray(new ScoringFilter[0])); - } else { - ScoringFilter[] filter = new ScoringFilter[orderedFilters.length]; - for (int i = 0; i < orderedFilters.length; i++) { - filter[i] = filterMap.get(orderedFilters[i]); - } - objectCache.setObject(ScoringFilter.class.getName(), filter); - } - } catch (PluginRuntimeException e) { - throw new RuntimeException(e); - } - this.filters = (ScoringFilter[]) objectCache.getObject(ScoringFilter.class.getName()); - } + this.filters = (ScoringFilter[]) PluginRepository.get(conf) + .getOrderedPlugins(ScoringFilter.class, ScoringFilter.X_POINT_ID, + "scoring.filter.order"); } /** Calculate a sort value for Generate. */