Hi,
I want to search against a field, but I am having difficulty getting that
field indexed in Lucene. To check whether the field is indexed or not, I am
using Luke as a tool.

Here is an example of our program. I want to store "recommended" as a field
that shows up in Luke, which may come from a tag such as
<meta name="rollno" value="5"> in our HTML page. So here I want to index a
field rollno which has the value 5 in the Lucene index.

I am using the entire program below; you can replace recommended with rollno.

Expecting your earliest reply.
thanks

package org.apache.nutch.parse.recommended;

//JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
import java.io.*;
//Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.metadata.Metadata;

//Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

//W3C imports
import org.w3c.dom.DocumentFragment;
import org.apache.nutch.util.NutchConfiguration;



/**
 * HTML parse filter that looks for a <code>recommended</code> entry among the
 * general meta tags of a page and, when present, copies its value into the
 * content metadata of the parse under {@link #META_RECOMMENDED_NAME}.
 *
 * NOTE(review): the general meta tags are presumably collected from
 * name/content attribute pairs, so a tag written with <code>value=</code>
 * instead of <code>content=</code> would not be seen here -- confirm against
 * HTMLMetaTags.
 */
public class RecommendedParser implements HtmlParseFilter {

  private static final Log LOG =
      LogFactory.getLog(RecommendedParser.class.getName());

  /** Name of the general meta tag this filter looks up. */
  private static final String META_TAG_NAME = "recommended";

  /** Parser id and content type used by {@link #main(String[])}. */
  private static final String HTML_PARSER_ID = "parse-html";
  private static final String HTML_CONTENT_TYPE = "text/html";

  private Configuration conf;

  /** The Recommended meta data attribute name */
  public static final String META_RECOMMENDED_NAME = "Recommended";

  /**
   * Looks up the recommended meta tag and records its value on the parse.
   *
   * @param content  the fetched content (not used by this filter)
   * @param parse    the parse whose content metadata receives the value
   * @param metaTags the meta tags extracted from the page
   * @param doc      the parsed DOM (not used by this filter)
   * @return the same <code>parse</code> instance that was passed in
   */
  public Parse filter(Content content, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    // A direct lookup replaces the scan over every property name.
    String recommendation =
        metaTags.getGeneralTags().getProperty(META_TAG_NAME);

    if (recommendation == null) {
      LOG.info("No Recommendation");
      return parse;
    }

    LOG.info("Found a Recommendation for " + recommendation);
    LOG.info("Adding Recommendation for " + recommendation);
    parse.getData().getContentMeta().set(META_RECOMMENDED_NAME,
        recommendation);
    return parse;
  }

  /**
   * Command-line driver: parses a local HTML file and prints the value stored
   * under {@link #META_RECOMMENDED_NAME}, or "null" when none was stored.
   *
   * @param ar <code>ar[0]</code> is the path of the file to parse and
   *           <code>ar[1]</code> the URL to associate with it; any further
   *           arguments are ignored
   * @throws Exception if the file cannot be read or parsing fails
   */
  public static void main(String ar[]) throws Exception {
    if (ar.length < 2) {
      System.err.println("Usage: " + RecommendedParser.class.getName()
          + " <file> <url>");
      return;
    }

    String url = ar[1];
    // Raw bytes are handed to the parser untouched; no charset round trip.
    byte[] bytes = readFully(ar[0]);

    Configuration conf = NutchConfiguration.create();
    Content content = new Content(url, url, bytes, HTML_CONTENT_TYPE,
        new Metadata(), conf);

    // NOTE(review): parseByExtensionId is assumed to resolve Parser
    // extensions, so the HTML parser is requested here rather than this
    // filter's own extension id. The filter itself only runs if the
    // "recommended" plugin is enabled in plugin.includes -- confirm.
    Parse parse =
        new ParseUtil(conf).parseByExtensionId(HTML_PARSER_ID, content);

    System.out.println(META_RECOMMENDED_NAME + ": "
        + parse.getData().getMeta(META_RECOMMENDED_NAME));
  }

  /** Reads the whole file at <code>path</code> into memory, closing the stream. */
  private static byte[] readFully(String path) throws IOException {
    InputStream in = new FileInputStream(path);
    try {
      ByteArrayOutputStream out = new ByteArrayOutputStream();
      byte[] buffer = new byte[1024];
      int count;
      while ((count = in.read(buffer)) != -1) {
        out.write(buffer, 0, count);
      }
      return out.toByteArray();
    } finally {
      in.close();
    }
  }

  /** Stores the configuration handed to this filter. */
  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  /** Returns the configuration previously set, or null if none was set. */
  public Configuration getConf() {
    return this.conf;
  }
}


public class RecommendedIndexer implements IndexingFilter {
    
          public static final Log LOG =
LogFactory.getLog(RecommendedIndexer.class.getName());
          
          private Configuration conf;
          
          public RecommendedIndexer() {
          }

          public Document filter(Document doc, Parse parse, UTF8 url, 
            CrawlDatum datum, Inlinks inlinks)
            throws IndexingException {

            String recommendation = parse.getData().getMeta("Recommended");

                if (recommendation != null) {
                    Field recommendedField = 
                        new Field("recommended", recommendation, 
                            Field.Store.YES, Field.Index.UN_TOKENIZED);
                    recommendedField.setBoost(5.0f);
                    doc.add(recommendedField);
                    LOG.info("Added " + recommendation + " to the recommended
Field");
                }

            return doc;
          }
          
          public void setConf(Configuration conf) {
            this.conf = conf;
          }

          public Configuration getConf() {
            return this.conf;
          }  
        }



/**
 * Query filter for the "recommended" field, registered with a boost of 5.
 */
public class RecommendedQueryFilter extends FieldQueryFilter {

  // Logger is keyed on this class so its messages are filed under
  // RecommendedQueryFilter rather than RecommendedParser.
  private static final Log LOG =
      LogFactory.getLog(RecommendedQueryFilter.class.getName());

  /** Registers the filter for the "recommended" field with a boost of 5. */
  public RecommendedQueryFilter() {
    super("recommended", 5f);
    LOG.info("Added a recommended query");
  }

}


This is the plugin.xml file:
----------------------------------------
----------------------------------------

<?xml version="1.0" encoding="UTF-8"?>
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- As defined in build.xml this plugin will end up bundled as
recommended.jar -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- The RecommendedParser extends the HtmlParseFilter to grab the
contents of
        any recommended meta tags -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                     
class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- TheRecommendedIndexer extends the IndexingFilter in order to add the
contents
        of the recommended meta tags (as found by the RecommendedParser) to
the lucene
        index. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended identifier filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                     
class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- The RecommendedQueryFilter gets called when you perform a search. It
runs a
        search for the user's query against the recommended fields.  In
order to get
        add this to the list of filters that gets run by default, you have
to use
        "fields=DEFAULT". -->   
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                     
class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
                      <parameter name="fields" value="recommended"/>
                      </implementation>
                      
   </extension>

</plugin>

I am not getting the desired result, since recommended is not stored as a
field in the index when I inspect it with Luke.
-- 
View this message in context: 
http://www.nabble.com/not-able-to-index-a-field-in-lucene-tf3462209.html#a9659562
Sent from the Nutch - User mailing list archive at Nabble.com.


-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys-and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Nutch-general mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/nutch-general

Reply via email to