Dear Wiki user, You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.
The "WritingPluginExample" page has been changed by LewisJohnMcgibbney: http://wiki.apache.org/nutch/WritingPluginExample?action=diff&rev1=22&rev2=23 }}} == The Scoring Extension == - The following is the code for the Indexing Filter extension. If the document being indexed had a recommended meta tag this extension adds a lucene text field to the index called "recommended" with the content of that meta tag. + The following is the code for the URLMetaScoringFilter extension. If the document being indexed had a recommended meta tag this extension adds a lucene text field to the index called "recommended" with the content of that meta tag. {{{ + package org.apache.nutch.scoring.urlmeta; - // This code does not work for 1.x branch - package org.apache.nutch.parse.recommended; - // JDK import - import java.util.logging.Logger; + import java.util.Collection; + import java.util.Map.Entry; + import java.util.Iterator; + import java.util.List; - // Nutch imports - import org.apache.nutch.util.LogFormatter; - import org.apache.nutch.fetcher.FetcherOutput; + import org.apache.commons.logging.Log; + import org.apache.commons.logging.LogFactory; + import org.apache.hadoop.conf.Configuration; + import org.apache.hadoop.conf.Configured; + import org.apache.hadoop.io.Text; + import org.apache.nutch.crawl.CrawlDatum; + import org.apache.nutch.crawl.Inlinks; - import org.apache.nutch.indexer.IndexingFilter; + import org.apache.nutch.indexer.NutchDocument; - import org.apache.nutch.indexer.IndexingException; import org.apache.nutch.parse.Parse; + import org.apache.nutch.parse.ParseData; + import org.apache.nutch.protocol.Content; + import org.apache.nutch.scoring.ScoringFilter; + import org.apache.nutch.scoring.ScoringFilterException; - // Lucene imports - import org.apache.lucene.document.Field; - import org.apache.lucene.document.Document; + /** + * For documentation: + * + * @see URLMetaIndexingFilter + */ + public class URLMetaScoringFilter extends Configured implements 
ScoringFilter { - public class RecommendedIndexer implements IndexingFilter { - public static final Logger LOG - = LogFormatter.getLogger(RecommendedIndexer.class.getName()); + private static final Log LOG = LogFactory.getLog(URLMetaScoringFilter.class); + private static final String CONF_PROPERTY = "urlmeta.tags"; + private static String[] urlMetaTags; + private Configuration conf; - public RecommendedIndexer() { - } + /** + * This will take the metatags that you have listed in your "urlmeta.tags" + * property, and looks for them inside the parseData object. If they exist, + * this will be propagated into your 'targets' Collection's ["outlinks"] + * attributes. + * + * @see ScoringFilter#distributeScoreToOutlinks + */ + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { + if (urlMetaTags == null || targets == null || parseData == null) + return adjust; + Iterator<Entry<Text, CrawlDatum>> targetIterator = targets.iterator(); - public Document filter(Document doc, Parse parse, FetcherOutput fo) - throws IndexingException { - String recommendation = parse.getData().get("Recommended"); + while (targetIterator.hasNext()) { + Entry<Text, CrawlDatum> nextTarget = targetIterator.next(); - if (recommendation != null) { - Field recommendedField = - new Field("recommended", recommendation, Field.Store.YES, Field.Index.UN_TOKENIZED); - recommendedField.setBoost(5.0f); - doc.add(recommendedField); - LOG.info("Added " + recommendation + " to the recommended Field"); + for (String metatag : urlMetaTags) { + String metaFromParse = parseData.getMeta(metatag); + + if (metaFromParse == null) + continue; + + nextTarget.getValue().getMetaData().put(new Text(metatag), + new Text(metaFromParse)); - } + } + } + return adjust; + } + /** + * Takes the metadata, specified in your "urlmeta.tags" property, from the + * datum object and injects it into 
the content. This is transfered to the + * parseData object. + * + * @see ScoringFilter#passScoreBeforeParsing + * @see URLMetaScoringFilter#passScoreAfterParsing + */ + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) { + if (urlMetaTags == null || content == null || datum == null) + return; + + for (String metatag : urlMetaTags) { + Text metaFromDatum = (Text) datum.getMetaData().get(new Text(metatag)); + + if (metaFromDatum == null) + continue; + + content.getMetadata().set(metatag, metaFromDatum.toString()); + } + } + + /** + * Takes the metadata, which was lumped inside the content, and replicates it + * within your parse data. + * + * @see URLMetaScoringFilter#passScoreBeforeParsing + * @see ScoringFilter#passScoreAfterParsing + */ + public void passScoreAfterParsing(Text url, Content content, Parse parse) { + if (urlMetaTags == null || content == null || parse == null) + return; + + for (String metatag : urlMetaTags) { + String metaFromContent = content.getMetadata().get(metatag); + + if (metaFromContent == null) + continue; + + parse.getData().getParseMeta().set(metatag, metaFromContent); + } + } + + /** Boilerplate */ + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { + return initSort; + } + + /** Boilerplate */ + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { + return initScore; + } + + /** Boilerplate */ + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { - return doc; + return; + } + + /** Boilerplate */ + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { + return; + } + + /** Boilerplate */ + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List inlinked) throws ScoringFilterException { + return; + } + + /** + * handles conf 
assignment and pulls the value assignment from the + * "urlmeta.tags" property + */ + public void setConf(Configuration conf) { + super.setConf(conf); + + if (conf == null) + return; + + urlMetaTags = conf.getStrings(CONF_PROPERTY); + } + + /** Boilerplate */ + public Configuration getConf() { + return conf; } } }}} - {{{ - // This code works for Nutch 1.0 branch - package org.apache.nutch.parse.recommended; - // JDK import - import java.util.logging.Logger; - - // Nutch imports - import org.apache.nutch.util.LogFormatter; - import org.apache.nutch.fetcher.FetcherOutput; - import org.apache.nutch.indexer.IndexingFilter; - import org.apache.nutch.indexer.IndexingException; - import org.apache.nutch.parse.Parse; - - // Lucene imports - import org.apache.lucene.document.Field; - import org.apache.lucene.document.Document; - - public class RecommendedIndexer implements IndexingFilter { - public static final Logger LOG - = LogFormatter.getLogger(RecommendedIndexer.class.getName()); - - public RecommendedIndexer() { - } - - public Document filter(Document doc, Parse parse, FetcherOutput fo) - throws IndexingException { - - String recommendation = parse.getData().get("Recommended"); - - if (recommendation != null) { - doc.add(recommendedField,recommendation); - LOG.info("Added " + recommendation + " to the recommended Field"); - } - - return doc; - } - - public void addIndexBackendOptions(Configuration conf) { - - /////////////////////////// - // add lucene options // - /////////////////////////// - - LuceneWriter.addFieldOptions("recommended", recommendation, Field.Store.YES, Field.Index.UN_TOKENIZED); - - } - } - }}} - == The QueryFilter == - The QueryFilter gets called when the user does a search. We're bumping up the boost for the recommended field in order to increase its influence on the search results. 
- - {{{ - package org.apache.nutch.parse.recommended; - - import org.apache.nutch.searcher.FieldQueryFilter; - - import java.util.logging.Logger; - - import org.apache.nutch.util.LogFormatter; - - public class RecommendedQueryFilter extends FieldQueryFilter { - - private static final Logger LOG = LogFormatter - .getLogger(RecommendedParser.class.getName()); - - public RecommendedQueryFilter() { - super("recommended", 5f); - - LOG.info("Added a recommended query"); - } - - } - }}} == Getting Nutch to Use Your Plugin == In order to get Nutch to use your plugin, you need to edit your conf/nutch-site.xml file and add in a block like this: {{{ <property> <name>plugin.includes</name> - <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value> + <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|'''urlmeta''')|scoring-opic|urlnormalizer-(pass|regex|basic)</value> <description>Regular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By @@ -337, +381 @@ </description> </property> }}} - You'll want to edit the regular expression so that it includes the name of your plugin. + You'll want to edit the regular expression so that it includes the name of the '''urlmeta''' plugin. - {{{ - <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)|recommended</value> - }}} == Getting Ant to Compile Your Plugin == In order for ant to compile and deploy your plugin you need to edit the src/plugin/build.xml file (NOT the build.xml in the root of your checkout directory). You'll see a number of lines that look like @@ -351, +392 @@ Edit this block to add a line for your plugin before the </target> tag. 
{{{ - <ant dir="reccomended" target="deploy" /> + <ant dir="urlmeta" target="deploy" /> }}} - Running 'ant' in the root of your checkout directory should get everything compiled and jared up. The next time you run a crawl your parser and index filter should get used. + Running 'ant' in the root of your checkout directory should get everything compiled and jarred up. The next time you run a crawl both the scoring and indexing extensions will be used, which will enable us to search for meta tags within our Solr index. - You'll need to run 'ant war' to compile a new ROOT.war file. Once you've deployed that, your query filter should get used when searches are performed. - - ''This was written for Nutch 1.3 official release. There are various other plugin examples available at the [[http://wiki.apache.org/nutch/OldPluginCentral#Plugin_Tutorials|Old Plugin Central]] + ''This was written for the Nutch 1.3 official release. There are various other plugin examples available at the [[http://wiki.apache.org/nutch/OldPluginCentral#Plugin_Tutorials|Old Plugin Central]] for older implementations of previously used Nutch Plugins. <<< See also: HowToContribute