Hi,

I am a Nutch newbie and would like to ask a few questions. I would 
appreciate any assistance.


1.       I have written a plugin that extracts all the JavaScript URLs and 
returns them as outlinks. I would like to configure Nutch to take these 
outlinks and push the URLs into the crawldb. Is there a way I can do that? If 
so, I would like to know how.

2.       How do I invoke this plugin? My logs show that the plugin is not 
invoked. My setup (parse-plugins.xml, the plugin code and plugin.xml) is shown 
below my signature.

Any advice would be greatly appreciated. I am referring to 
http://florianhartl.com/nutch-how-it-works.html for what each job does.

Thanks,

Kartik

<parse-plugins>

  <!--  by default if the mimeType is set to *, or
        if it can't be determined, use parse-tika -->
  . . . . .

       <!-- Parser plugins associated with text/html content: the stock
            parse-html plugin followed by the custom localeextractor plugin.
            NOTE(review): the parsers are presumably tried in the order listed,
            with later entries used only if an earlier one fails; that would
            explain localeextractor never being invoked. Confirm against the
            Nutch parse-plugins documentation. -->
       <mimeType name="text/html">
              <plugin id="parse-html" />
              <plugin id="localeextractor" />
       </mimeType>
. . . .

<!-- Each alias maps a plugin id used in the mimeType entries above to the
     extension-id of the parser implementation. -->
<aliases>
              <alias name="parse-html"
                     extension-id="org.apache.nutch.parse.html.HtmlParser" />
              <alias name="parse-tika"
                     extension-id="org.apache.nutch.parse.tika.TikaParser" />
              <alias name="parse-ext" extension-id="ExtParser" />
              <alias name="parse-js" extension-id="JSParser" />
              <alias name="feed"
                     extension-id="org.apache.nutch.parse.feed.FeedParser" />
              <alias name="parse-swf"
                     extension-id="org.apache.nutch.parse.swf.SWFParser" />
              <alias name="parse-zip"
                     extension-id="org.apache.nutch.parse.zip.ZipParser" />
  <!-- This is my addition -->
              <!-- The extension-id here is the same string as the
                   implementation id declared in the plugin.xml shown later
                   in this message. -->
              <alias name="localeextractor"
                extension-id="LocaleExtractorFilter" />
       </aliases>
</parse-plugins>

#############
My plugin code
#############

public class LocaleExtractorFilter implements Parser {

  @Override
  public Parse getParse(String url, WebPage page) {
    // TODO Auto-generated method stub
    String stringContent = Bytes.toString(page.getContent());
    Set<Outlink> jsOutlinks = this.addUrlsToBeParsed(stringContent);
    return new Parse(
        page.getText().toString(), page.getTitle().toString(),
        jsOutlinks.toArray(new Outlink[0]), page.getParseStatus());
  }

  private static final Pattern PATTERN_WITH_ASCII_QUOTES =
      
Pattern.compile("^(?:.*?goto\\(&#39;(\\w+)&#39;\\).*|.*?OOLPopUp\\(&#39;(.+?&#39;\\)).*)$",
          Pattern.MULTILINE);

  private static final String REDIRECT = "/accounts/redirect.go?target=";
  /**
   * The implementation parses the URLs from the string content of HTML files. 
The URLs are of the
   * following format:
   * <ul>
   *   <li>{@code fsdgoto} links, Example
   *       {@code &lt;a name='bill_pay' 
href='javascript:goto(&#39;billpay&#39;);'&gt;Bill Pay
   *       &lt;/a&gt;}
   * </ul>
   *
   * @param stringContent from which multiple urls can be constructed
   */
  Set<Outlink> addUrlsToBeParsed(String stringContent) {
    Set<Outlink> outlinks = new TreeSet<Outlink>();
    Matcher matcher = PATTERN_WITH_ASCII_QUOTES.matcher(stringContent);
    while (matcher.find()) {
      String url = "";
      try {
        url = new StringBuilder(REDIRECT).append(
            matcher.group(1) != null ? matcher.group(1) : 
matcher.group(2)).toString();
        outlinks.add(new Outlink(url, ""));
      } catch (MalformedURLException mue) {
        LOG.warn("Error generating outlink urls for " + url, mue);
      }
    }
}


#############
Plugin.xml
#############

<!-- Plugin descriptor for the "localeextractor" plugin. -->
<plugin id="localeextractor" name="Locale extractor Filter" version="1.0.0"
  provider-name="nutch.org">

  <!-- Runtime library of the plugin; every name in it is exported. -->
  <runtime>
    <library name="localeextractor">
      <export name="*" />
    </library>
  </runtime>

  <!-- Depends on the nutch-extensionpoints plugin. -->
  <requires>
    <import plugin="nutch-extensionpoints" />
  </requires>

  <!-- Registers com.myproject.LocaleExtractorFilter at the
       org.apache.nutch.parse.Parser extension point. The implementation id
       "LocaleExtractorFilter" is the string used as extension-id by the
       localeextractor alias in the parse-plugins snippet earlier in this
       message. -->
  <extension id="com.bofa.ecom.search.localeextractor"
    name="LocaleExtractor"
    point="org.apache.nutch.parse.Parser">
    <implementation id="LocaleExtractorFilter"
      class="com.myproject.LocaleExtractorFilter" />
  </extension>

</plugin>

----------------------------------------------------------------------
This message, and any attachments, is for the intended recipient(s) only, may 
contain information that is privileged, confidential and/or proprietary and 
subject to important terms and conditions available at 
http://www.bankofamerica.com/emaildisclaimer.   If you are not the intended 
recipient, please delete this message.

Reply via email to