[Nutch Wiki] Trivial Update of "WritingPluginExample" by LewisJohnMcgibbney

Apache Wiki Tue, 16 Aug 2011 07:26:34 -0700

Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change 
notification.


The "WritingPluginExample" page has been changed by LewisJohnMcgibbney:
http://wiki.apache.org/nutch/WritingPluginExample?action=diff&rev1=22&rev2=23

  }}}
  
  == The Scoring Extension ==
- The following is the code for the Indexing Filter extension.  If the document 
being indexed had a recommended meta tag this extension adds a lucene text 
field to the index called "recommended" with the content of that meta tag.
+ The following is the code for the URLMetaScoringFilter extension.  For each 
meta tag listed in the "urlmeta.tags" property, this extension propagates the 
tag's value from a parsed document to the CrawlDatum of each of its outlinks, 
and carries the value from the datum through the content into the parse 
metadata.
  
  {{{
+ package org.apache.nutch.scoring.urlmeta;
- // This code does not work for 1.x branch
- package org.apache.nutch.parse.recommended;
  
- // JDK import
- import java.util.logging.Logger;
+ import java.util.Collection;
+ import java.util.Map.Entry;
+ import java.util.Iterator;
+ import java.util.List;
  
- // Nutch imports
- import org.apache.nutch.util.LogFormatter;
- import org.apache.nutch.fetcher.FetcherOutput;
+ import org.apache.commons.logging.Log;
+ import org.apache.commons.logging.LogFactory;
+ import org.apache.hadoop.conf.Configuration;
+ import org.apache.hadoop.conf.Configured;
+ import org.apache.hadoop.io.Text;
+ import org.apache.nutch.crawl.CrawlDatum;
+ import org.apache.nutch.crawl.Inlinks;
- import org.apache.nutch.indexer.IndexingFilter;
+ import org.apache.nutch.indexer.NutchDocument;
- import org.apache.nutch.indexer.IndexingException;
  import org.apache.nutch.parse.Parse;
+ import org.apache.nutch.parse.ParseData;
+ import org.apache.nutch.protocol.Content;
+ import org.apache.nutch.scoring.ScoringFilter;
+ import org.apache.nutch.scoring.ScoringFilterException;
  
- // Lucene imports
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Document;
+ /**
+  * For documentation:
+  * 
+  * @see URLMetaIndexingFilter
+  */
+ public class URLMetaScoringFilter extends Configured implements ScoringFilter 
{
  
- public class RecommendedIndexer implements IndexingFilter {
-   public static final Logger LOG
-     = LogFormatter.getLogger(RecommendedIndexer.class.getName());
+   private static final Log LOG = 
LogFactory.getLog(URLMetaScoringFilter.class);
+   private static final String CONF_PROPERTY = "urlmeta.tags";
+   private static String[] urlMetaTags;
+   private Configuration conf;
  
-   public RecommendedIndexer() {
-   }
+   /**
+    * This will take the metatags that you have listed in your "urlmeta.tags"
+    * property, and looks for them inside the parseData object. If they exist,
+    * this will be propagated into your 'targets' Collection's ["outlinks"]
+    * attributes.
+    * 
+    * @see ScoringFilter#distributeScoreToOutlinks
+    */
+   public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+       ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+       CrawlDatum adjust, int allCount) throws ScoringFilterException {
+     if (urlMetaTags == null || targets == null || parseData == null)
+       return adjust;
  
+     Iterator<Entry<Text, CrawlDatum>> targetIterator = targets.iterator();
-   public Document filter(Document doc, Parse parse, FetcherOutput fo)
-     throws IndexingException {
  
-     String recommendation = parse.getData().get("Recommended");
+     while (targetIterator.hasNext()) {
+       Entry<Text, CrawlDatum> nextTarget = targetIterator.next();
  
-         if (recommendation != null) {
-                         Field recommendedField =
-                          new Field("recommended", recommendation, 
Field.Store.YES, Field.Index.UN_TOKENIZED);
-                         recommendedField.setBoost(5.0f);
-                 doc.add(recommendedField);
-                         LOG.info("Added " + recommendation + " to the 
recommended Field");
+       for (String metatag : urlMetaTags) {
+         String metaFromParse = parseData.getMeta(metatag);
+ 
+         if (metaFromParse == null)
+           continue;
+ 
+         nextTarget.getValue().getMetaData().put(new Text(metatag),
+             new Text(metaFromParse));
-         }
+       }
+     }
+     return adjust;
+   }
  
+   /**
+    * Takes the metadata, specified in your "urlmeta.tags" property, from the
+    * datum object and injects it into the content. This is transferred to the
+    * parseData object.
+    * 
+    * @see ScoringFilter#passScoreBeforeParsing
+    * @see URLMetaScoringFilter#passScoreAfterParsing
+    */
+   public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content 
content) {
+     if (urlMetaTags == null || content == null || datum == null)
+       return;
+ 
+     for (String metatag : urlMetaTags) {
+       Text metaFromDatum = (Text) datum.getMetaData().get(new Text(metatag));
+ 
+       if (metaFromDatum == null)
+         continue;
+ 
+       content.getMetadata().set(metatag, metaFromDatum.toString());
+     }
+   }
+ 
+   /**
+    * Takes the metadata, which was lumped inside the content, and replicates 
it
+    * within your parse data.
+    * 
+    * @see URLMetaScoringFilter#passScoreBeforeParsing
+    * @see ScoringFilter#passScoreAfterParsing
+    */
+   public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+     if (urlMetaTags == null || content == null || parse == null)
+       return;
+ 
+     for (String metatag : urlMetaTags) {
+       String metaFromContent = content.getMetadata().get(metatag);
+ 
+       if (metaFromContent == null)
+         continue;
+ 
+       parse.getData().getParseMeta().set(metatag, metaFromContent);
+     }
+   }
+ 
+   /** Boilerplate */
+   public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+       throws ScoringFilterException {
+     return initSort;
+   }
+ 
+   /** Boilerplate */
+   public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+       CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+       throws ScoringFilterException {
+     return initScore;
+   }
+ 
+   /** Boilerplate */
+   public void initialScore(Text url, CrawlDatum datum)
+       throws ScoringFilterException {
-     return doc;
+     return;
+   }
+ 
+   /** Boilerplate */
+   public void injectedScore(Text url, CrawlDatum datum)
+       throws ScoringFilterException {
+     return;
+   }
+ 
+   /** Boilerplate */
+   public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+       List inlinked) throws ScoringFilterException {
+     return;
+   }
+ 
+   /**
+    * handles conf assignment and pulls the value assignment from the
+    * "urlmeta.tags" property
+    */
+   public void setConf(Configuration conf) {
+     super.setConf(conf);
+ 
+     if (conf == null)
+       return;
+ 
+     urlMetaTags = conf.getStrings(CONF_PROPERTY);
+   }
+ 
+   /** Boilerplate */
+   public Configuration getConf() {
+     return conf;
    }
  }
  }}}
- {{{
- // This code works for Nutch 1.0 branch
- package org.apache.nutch.parse.recommended;
  
- // JDK import
- import java.util.logging.Logger;
- 
- // Nutch imports
- import org.apache.nutch.util.LogFormatter;
- import org.apache.nutch.fetcher.FetcherOutput;
- import org.apache.nutch.indexer.IndexingFilter;
- import org.apache.nutch.indexer.IndexingException;
- import org.apache.nutch.parse.Parse;
- 
- // Lucene imports
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Document;
- 
- public class RecommendedIndexer implements IndexingFilter {
-   public static final Logger LOG
-     = LogFormatter.getLogger(RecommendedIndexer.class.getName());
- 
-   public RecommendedIndexer() {
-   }
- 
-   public Document filter(Document doc, Parse parse, FetcherOutput fo)
-     throws IndexingException {
- 
-     String recommendation = parse.getData().get("Recommended");
- 
-         if (recommendation != null) {
-                 doc.add(recommendedField,recommendation);
-                         LOG.info("Added " + recommendation + " to the 
recommended Field");
-         }
- 
-     return doc;
-   }
- 
-   public void addIndexBackendOptions(Configuration conf) {
- 
-     ///////////////////////////
-     //    add lucene options //
-     ///////////////////////////
- 
-     LuceneWriter.addFieldOptions("recommended", recommendation, 
Field.Store.YES, Field.Index.UN_TOKENIZED);
- 
-   }
- }
- }}}
- == The QueryFilter ==
- The QueryFilter gets called when the user does a search.  We're bumping up 
the boost for the recommended field in order to increase its influence on the 
search results.
- 
- {{{
- package org.apache.nutch.parse.recommended;
- 
- import org.apache.nutch.searcher.FieldQueryFilter;
- 
- import java.util.logging.Logger;
- 
- import org.apache.nutch.util.LogFormatter;
- 
- public class RecommendedQueryFilter extends FieldQueryFilter {
- 
-         private static final Logger LOG = LogFormatter
-     .getLogger(RecommendedParser.class.getName());
- 
-   public RecommendedQueryFilter() {
-     super("recommended", 5f);
- 
-         LOG.info("Added a recommended query");
-   }
- 
- }
- }}}
  == Getting Nutch to Use Your Plugin ==
  In order to get Nutch to use your plugin, you need to edit your 
conf/nutch-site.xml file and add in a block like this:
  
  {{{
  <property>
    <name>plugin.includes</name>
-   
<value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
+   
<value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|urlmeta)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
    <description>Regular expression naming plugin directory names to
    include.  Any plugin not matching this expression is excluded.
    In any case you need at least include the nutch-extensionpoints plugin. By
@@ -337, +381 @@

    </description>
  </property>
  }}}
- You'll want to edit the regular expression so that it includes the name of 
your plugin.
+ You'll want to edit the regular expression so that it includes the name of 
the '''urlmeta''' plugin.
  
- {{{
-   
<value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)|recommended</value>
- }}}
  == Getting Ant to Compile Your Plugin ==
  In order for ant to compile and deploy your plugin you need to edit the 
src/plugin/build.xml file (NOT the build.xml in the root of your checkout 
directory). You'll see a number of lines that look like
  
@@ -351, +392 @@

  Edit this block to add a line for your plugin before the </target> tag.
  
  {{{
-   <ant dir="reccomended" target="deploy" />
+   <ant dir="urlmeta" target="deploy" />
  }}}
- Running 'ant' in the root of your checkout directory should get everything 
compiled and jared up.  The next time you run a crawl your parser and index 
filter should get used.
+ Running 'ant' in the root of your checkout directory should get everything 
compiled and jarred up.  The next time you run a crawl, both the scoring and 
indexing extensions will be used, which will enable us to search for meta tags 
within our Solr index.
  
- You'll need to run 'ant war' to compile a new ROOT.war file.  Once you've 
deployed that, your query filter should get used when searches are performed.
- 
- ''This was written for Nutch 1.3 official release. There are various other 
plugin examples available at the 
[[http://wiki.apache.org/nutch/OldPluginCentral#Plugin_Tutorials|Old Plugin 
Central]] 
+ ''This was written for the Nutch 1.3 official release. There are various 
other plugin examples available at the 
[[http://wiki.apache.org/nutch/OldPluginCentral#Plugin_Tutorials|Old Plugin 
Central]] for older implementations of previously used Nutch Plugins.''
  
  <<< See also: HowToContribute

[Nutch Wiki] Trivial Update of "WritingPluginExample" by LewisJohnMcgibbney

Reply via email to