Hi,
I updated an old, modified LuceneIndexTransformer that I made last year.
This LuceneIndexTransformer is more general (lower level) and can be used to index all kinds of resources (not only HTML pages).
 
A typical indexing pipeline would be:
resource -> XSLT -> LuceneIndexTransformer -> XML results
 
a sample :
 
<lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
create="true"
analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"
directory="d:/indexbase"
merge-factor="100">
    <lucene:document >
        <lucene:field name="tile" type="keyword">sqdqsdq</lucene:field>
        <lucene:field name="description" type="text"> bla bal blalael balbal </lucene:field>
        <lucene:field name="date" type="date" dateformat="MM/dd/yyyy">10/12/2002</lucene:field>
        (see the Java API class SimpleDateFormat for more explanation about the dateformat attribute)
        <lucene:field name="date" type="unstored" >just indexed information (not stored)</lucene:field>
        <lucene:field name="date" type="unindexed" >just stored information (not indexed)</lucene:field>
    </lucene:document>
    <lucene:document>
        <lucene:field name="author" type="keyword" boost="2">Mr Author </lucene:field> (boost the field for the search (see Lucene documentation))
        <lucene:field name="langage" type="keyword">french</lucene:field>
    </lucene:document>
</lucene:index>
 
To delete documents
<lucene:delete directory="d:/indexbase" >
    <lucene:document field="author" value="Mr Author"/> (delete all documents with the field author ="Mr Author")
    <lucene:document field="id" value="1E3RFE"/>
</lucene:delete>
 
Example of output source:
<lucene:index nbdocuments="2"/>
<lucene:delete nbdocuments="1"/>
 
Maybe someone will be interested...
 
Nicolas Maisonneuve
 
package org.apache.cocoon.transformation;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Map;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.component.ComponentException;
import org.apache.avalon.framework.component.ComponentManager;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.transformation.AbstractTransformer;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.*;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * A lucene index creation transformer.
 * @author Nicolas Maisonneuve
 *
 <p><strong>Example of input source:</strong></p>
 <p>
 <ul><li> to Index<br>
 &lt;lucene:index  xmlns:lucene=&quot;http://apache.org/cocoon/lucene/1.0&quot; <br/>
 create=&quot;true&quot; <br/>
 analyzer=&quot;org.apache.lucene.analysis.standard.StandardAnalyzer&quot;<br/>
 directory=&quot;d:/indexbase&quot;<br>
 merge-factor=&quot;100&quot;&gt;<br>
 <br/>
 &lt;lucene:document &gt;<br/>
 &lt;lucene:field name=&quot;tile&quot; type=&quot;keyword&quot;&gt;sqdqsdq&lt;/lucene:field&gt;<br>
 &lt;lucene:field name=&quot;description&quot; type=&quot;text&quot;&gt; bla bal blalael balbal &lt;/lucene:field&gt;<br>
 &lt;lucene:field name=&quot;date&quot; type=&quot;date&quot; dateformat=&quot;MM/dd/yyyy&quot;&gt;10/12/2002&lt;/lucene:field&gt;
 <em>(see the Java API class SimpleDateFormat for more explanation about the dateformat attribute)</em><br/>
 &lt;lucene:field name=&quot;date&quot; type=&quot;unstored&quot; &gt;just indexed information (not stored)&lt;/lucene:field&gt;<br>
 &lt;lucene:field name=&quot;date&quot; type=&quot;unindexed&quot; &gt;just stored information (not indexed)&lt;/lucene:field&gt;<br>
 &lt;/lucene:document&gt;<br>
 <p> &lt;lucene:document&gt;<br>
 &lt;lucene:field name=&quot;author&quot; type=&quot;keyword&quot; boost=&quot;2&quot;&gt;Mr Author  &lt;/lucene:field&gt; <em>(boost the field for the search (see Lucene documentation))</em><br/>
 &lt;lucene:field name=&quot;langage&quot; type=&quot;keyword&quot;&gt;french&lt;/lucene:field&gt;<br>
 &lt;/lucene:document&gt;<br>
 &lt;/lucene:index&gt;</p></li>
 <li>To delete <br/>
 <p>&lt;lucene:delete directory=&quot;d:/indexbase&quot; &gt;<br>
 &lt;lucene:document field=&quot;author&quot; value=&quot;Mr Author&quot;/&gt; <em> (delete
 all documents with the field author =&quot;Mr Author&quot;)</em><br>&lt;lucene:document
 field=&quot;id&quot; value=&quot;1E3RFE&quot;/&gt; <br>
 &lt;/lucene:delete&gt;</p>
 <p><strong>Example of Output Source</strong></p>
 <p>&lt;page xmlns:lucene=&quot;http://apache.org/cocoon/lucene/1.0&quot;&gt;<br>
 &lt;lucene:index nbdocuments=&quot;2&quot;/&gt;<br>
 &lt;lucene:delete nbdocuments=&quot;1&quot;/&gt;<br>
 &lt;/page&gt;</p>
 </li></ul>
 */

public class LuceneIndexTransformer
    extends AbstractTransformer
    implements Disposable, Recyclable,
    Configurable {

// Names of the component configuration children and of the sitemap
// parameters read in configure() and setup().
  public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
  public static final String ANALYZER_CLASSNAME_PARAMETER =
      "analyzer-classname";

  public static final String DIRECTORY_CONFIG = "directory";
  public static final String DIRECTORY_PARAMETER = "directory";

  public static final String MERGE_FACTOR_CONFIG = "merge-factor";
  public static final String MERGE_FACTOR_PARAMETER = "merge-factor";

// Built-in defaults used when neither configuration nor parameters are given.
  public static final String DIRECTORY_DEFAULT = "index";
  public static final int MERGE_FACTOR_DEFAULT = 20;
  public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";

// Namespace, element and attribute names recognised in the input stream.
  public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
  public static final String LUCENE_QUERY_ELEMENT = "index";
  public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
  public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
  public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
  public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE =
      "merge-factor";

  public static final String LUCENE_DELETE_ELEMENT = "delete";

  public static final String LUCENE_DOCUMENT_ELEMENT = "document";
  public static final String LUCENE_DOCUMENT_FIELD_ATTRIBUTE = "field";
  public static final String LUCENE_DOCUMENT_VALUE_ATTRIBUTE = "value";
  public static final String LUCENE_FIELD_ELEMENT = "field";
  public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name";
  public static final String LUCENE_FIELD_TYPE_ATTRIBUTE = "type";
  public static final String LUCENE_FIELD_DATEFORMAT_ATTRIBUTE = "dateformat";
  public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost";

// Internal codes for the value of the "type" attribute of <lucene:field>.
  public static final int TYPE_KEYWORD = 1;
  public static final int TYPE_TEXT = 2;
  public static final int TYPE_DATE = 3;
  public static final int TYPE_UNSTORED = 4;
  public static final int TYPE_UNINDEXED = 5;

// PROCESS STATE
// NO_PROCESSING          : outside of any lucene element
// INDEX_PROCESS          : inside <lucene:index>, waiting for a document
// DOCUMENT_INDEX_PROCESS : inside <lucene:index>/<lucene:document>
// FIELD_INDEX_PROCESS    : inside a <lucene:field>, collecting its text
// DELETE_PROCESS         : inside <lucene:delete>, waiting for a document
// DELETING_PROCESS       : inside <lucene:delete>/<lucene:document>
  public static final int NO_PROCESSING = 0;
  public static final int INDEX_PROCESS = 1;
  public static final int DOCUMENT_INDEX_PROCESS = 2;
  public static final int FIELD_INDEX_PROCESS = 4;
  public static final int DELETE_PROCESS = 5;
  public static final int DELETING_PROCESS = 6;

// Initialization time variables
// (workDir and action are not used by this class; they are kept because they
// are protected and may be referenced by subclasses)
  protected File workDir = null;
  protected int nbdocuments;
  protected int action;

// Declaration time parameters values
  private String analyzerClassnameDefault;
  private String directoryDefault;
  private int mergeFactorDefault;

// Invocation time parameters values
  private String analyzerClassname;
  private String directory;
  private int mergeFactor;

// Runtime variables
  private int processing;
  private IndexWriter writer;
  private IndexReader reader;

  private Term term;
  private Document bodyDocument;
  private String fieldname;
  private int fieldtype;
  private float fieldboost;
  private StringBuffer fieldvalue;
  private SimpleDateFormat df;

  /**
   * Read the component defaults (analyzer class name, merge factor and index
   * directory) from the component configuration, falling back to the
   * built-in <code>*_DEFAULT</code> constants.
   *
   * @param conf the component configuration
   * @throws ConfigurationException declared by the Configurable contract
   */
  public void configure(Configuration conf) throws ConfigurationException {
    this.analyzerClassnameDefault = conf.getChild(ANALYZER_CLASSNAME_CONFIG)
        .getValue(ANALYZER_CLASSNAME_DEFAULT);
    this.mergeFactorDefault = conf.getChild(MERGE_FACTOR_CONFIG)
        .getValueAsInteger(MERGE_FACTOR_DEFAULT);
    this.directoryDefault = conf.getChild(DIRECTORY_CONFIG)
        .getValue(DIRECTORY_DEFAULT);
  }

  /**
   * Setup the transformer: resolve the per-invocation analyzer class name,
   * index directory and merge factor from the sitemap parameters, using the
   * values read in {@link #configure(Configuration)} as defaults.
   * The resolver, object model and src arguments are not used.
   */
  public void setup(SourceResolver resolver, Map objectModel, String src,
                    Parameters parameters) throws ProcessingException,
      SAXException, IOException {
    this.analyzerClassname = parameters.getParameter(
        ANALYZER_CLASSNAME_PARAMETER, analyzerClassnameDefault);
    this.directory = parameters.getParameter(DIRECTORY_PARAMETER,
                                             directoryDefault);
    this.mergeFactor = parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER,
        mergeFactorDefault);
  }

  /**
   * Release the Lucene resources and reset every piece of per-request state
   * so that an aborted request cannot leak into the next one.
   */
  public void recycle() {
    closeWriter();
    closeReader();
    this.processing = NO_PROCESSING;
    this.nbdocuments = 0;
    this.bodyDocument = null;
    this.term = null;
    this.fieldname = null;
    this.fieldvalue = null;
    this.df = null;
    // let the pipeline base class drop its consumer references as well
    super.recycle();
  }

  /**
   * Release the Lucene writer and reader if they are still open.
   */
  public void dispose() {
    closeWriter();
    closeReader();
  }

  /**
   * Generate the unique key.
   * This key must be unique inside the space of this component.
   * NOTE(review): this class does not declare CacheableProcessingComponent,
   * so this method is presumably not called by the pipeline - confirm before
   * relying on it, since the transformer has side effects on the index.
   *
   * @return The generated key
   */
  public Serializable getKey() {
    return "1";
  }

  /**
   * Generate the validity object.
   *
   * @return The generated validity object or <code>null</code> if the
   *         component is currently not cacheable.
   */
  public SourceValidity getValidity() {
    return NOPValidity.SHARED_INSTANCE;
  }

  public void startDocument() throws SAXException {
    super.startDocument();
  }

  public void endDocument() throws SAXException {
    super.endDocument();
  }

  /**
   * Begin the scope of a prefix-URI Namespace mapping.
   * Mappings are only forwarded while no lucene element is being processed.
   *
   * @param prefix The Namespace prefix being declared.
   * @param uri The Namespace URI the prefix is mapped to.
   */
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if (processing == NO_PROCESSING) {
      super.startPrefixMapping(prefix, uri);
    }
  }

  /**
   * End the scope of a prefix-URI mapping.
   * Mappings are only forwarded while no lucene element is being processed.
   *
   * @param prefix The prefix that was being mapping.
   */
  public void endPrefixMapping(String prefix) throws SAXException {
    if (processing == NO_PROCESSING) {
      super.endPrefixMapping(prefix);
    }
  }

  /**
   * Handle the start of an element. Elements outside the lucene namespace
   * are forwarded unchanged; lucene elements drive the state machine
   * described by the <code>*_PROCESS</code> constants.
   *
   * @throws SAXException if a lucene element is unknown, misplaced, lacks a
   *         required attribute, or the index cannot be opened
   */
  public void startElement(String namespaceURI, String localName, String qName,
                           Attributes atts) throws SAXException {

    if (getLogger().isDebugEnabled()) {
      getLogger().debug("START processing: " + processing + " " + localName);
    }

    if (!LUCENE_URI.equals(namespaceURI)) {
      super.startElement(namespaceURI, localName, qName, atts);
      return;
    }

    switch (processing) {
      case NO_PROCESSING:
        if (LUCENE_QUERY_ELEMENT.equals(localName)) {
          // INDEX ACTION
          nbdocuments = 0;
          initIndex(atts);
          processing = INDEX_PROCESS;
        }
        else if (LUCENE_DELETE_ELEMENT.equals(localName)) {
          // DELETE ACTION
          nbdocuments = 0;
          initReader(atts);
          processing = DELETE_PROCESS;
        }
        else {
          throw new SAXException("element " + localName + " unknown ");
        }
        break;

      case INDEX_PROCESS:
        // NEW DOCUMENT TO INDEX
        if (!LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
          throw new SAXException("element " + localName + " is not a <lucene:document> element ");
        }
        this.bodyDocument = new Document();
        processing = DOCUMENT_INDEX_PROCESS;
        break;

      case DELETE_PROCESS:
        // DOCUMENT(S) TO DELETE
        if (!LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
          throw new SAXException("element " + localName + " is not a <lucene:document> element ");
        }
        startDeleteDocument(atts);
        processing = DELETING_PROCESS;
        break;

      case DOCUMENT_INDEX_PROCESS:
        if (!LUCENE_FIELD_ELEMENT.equals(localName)) {
          throw new SAXException("<lucene:" + LUCENE_FIELD_ELEMENT +
                                 "> was expected!");
        }
        startField(atts);
        processing = FIELD_INDEX_PROCESS;
        break;

      default:
        // FIELD_INDEX_PROCESS / DELETING_PROCESS: no lucene child is allowed;
        // fail here instead of at the matching end tag
        throw new SAXException("element <lucene:" + localName +
                               "> is not allowed here");
    }
  }

  /**
   * Handle the end of an element. Elements outside the lucene namespace are
   * forwarded unchanged; lucene end tags complete the current field,
   * document, index or delete action.
   *
   * @throws SAXException if the end tag does not match the current state or
   *         a Lucene operation fails
   */
  public void endElement(String namespaceURI, String localName, String qName) throws
      SAXException {

    if (getLogger().isDebugEnabled()) {
      getLogger().debug("END: processing: " + processing + " " + localName);
    }

    if (!LUCENE_URI.equals(namespaceURI)) {
      super.endElement(namespaceURI, localName, qName);
      return;
    }

    switch (processing) {
      case INDEX_PROCESS:
        // END OF THE INDEXATION
        if (!LUCENE_QUERY_ELEMENT.equals(localName)) {
          throw new SAXException("</lucene:" + LUCENE_QUERY_ELEMENT +
                                 "> was expected!");
        }
        try {
          this.writer.optimize();
        }
        catch (IOException e) {
          throw new SAXException(e);
        }
        finally {
          // never keep the index locked, even when optimize() failed
          closeWriter();
        }
        // report only once the index has been finalised
        emitResult(namespaceURI, localName, qName);
        this.processing = NO_PROCESSING;
        break;

      case DOCUMENT_INDEX_PROCESS:
        // END OF A DOCUMENT
        if (!LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
          throw new SAXException("element " + localName + " unknown!");
        }
        try {
          this.writer.addDocument(this.bodyDocument);
        }
        catch (IOException e) {
          throw new SAXException(e);
        }
        nbdocuments++;
        this.bodyDocument = null;
        this.processing = INDEX_PROCESS;
        break;

      case DELETE_PROCESS:
        // END OF THE DELETE
        if (!LUCENE_DELETE_ELEMENT.equals(localName)) {
          throw new SAXException("</lucene:" + LUCENE_DELETE_ELEMENT +
                                 "> was expected!");
        }
        closeReader();
        emitResult(namespaceURI, localName, qName);
        this.processing = NO_PROCESSING;
        break;

      case DELETING_PROCESS:
        if (!LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
          throw new SAXException("</lucene:" + LUCENE_DOCUMENT_ELEMENT +
                                 "> was expected!");
        }
        try {
          // delete() returns the number of documents matching the term
          nbdocuments += reader.delete(this.term);
        }
        catch (IOException e) {
          throw new SAXException(e);
        }
        this.term = null;
        this.processing = DELETE_PROCESS;
        break;

      case FIELD_INDEX_PROCESS:
        if (!LUCENE_FIELD_ELEMENT.equals(localName)) {
          throw new SAXException("</lucene:" + LUCENE_FIELD_ELEMENT +
                                 "> was expected!");
        }
        finishField();
        processing = DOCUMENT_INDEX_PROCESS;
        break;

      default:
        // NO_PROCESSING: nothing is open, nothing to do
        break;
    }
  }

  /**
   * Collect character data as the value of the current field, or forward it
   * when no field is being read.
   */
  public void characters(char[] ch, int start, int length) throws
      SAXException {

    if (processing == FIELD_INDEX_PROCESS) {
      this.fieldvalue.append(ch, start, length);
    }
    else {
      super.characters(ch, start, length);
    }
  }

  /**
   * Read the field/value attributes of a &lt;lucene:document&gt; inside
   * &lt;lucene:delete&gt; and build the term used for the deletion.
   *
   * @param atts attributes of the document element
   * @throws SAXException if the field or value attribute is missing
   */
  private void startDeleteDocument(Attributes atts) throws SAXException {
    String name = atts.getValue(LUCENE_DOCUMENT_FIELD_ATTRIBUTE);
    if (name == null || name.equals("")) {
      throw new SAXException(
          "<lucene:document> element must contain field attribute");
    }

    String value = atts.getValue(LUCENE_DOCUMENT_VALUE_ATTRIBUTE);
    if (value == null) {
      throw new SAXException(
          "<lucene:document> element must contain value attribute");
    }

    this.fieldname = name;
    this.term = new Term(name, value);
  }

  /**
   * Read the name, type, dateformat and boost attributes of a
   * &lt;lucene:field&gt; and prepare the buffer collecting its text.
   *
   * @param atts attributes of the field element
   * @throws SAXException if a required attribute is missing, the type is
   *         unknown, or the dateformat/boost value is invalid
   */
  private void startField(Attributes atts) throws SAXException {
    String name = atts.getValue(LUCENE_FIELD_NAME_ATTRIBUTE);
    if (name == null || name.equals("")) {
      throw new SAXException(
          "<lucene:field> element must contain name attribut");
    }

    String type = atts.getValue(LUCENE_FIELD_TYPE_ATTRIBUTE);
    if (type == null || type.equals("")) {
      throw new SAXException(
          "<lucene:field> element must contain a type attribut");
    }

    int typeCode;
    SimpleDateFormat format = null;
    if (type.equals("keyword")) {
      typeCode = TYPE_KEYWORD;
    }
    else if (type.equals("text")) {
      typeCode = TYPE_TEXT;
    }
    else if (type.equals("date")) {
      typeCode = TYPE_DATE;
      String pattern = atts.getValue(LUCENE_FIELD_DATEFORMAT_ATTRIBUTE);
      if (pattern == null || pattern.equals("")) {
        throw new SAXException(
            "<lucene:field type=\"date\"> element must contain a dateformat attribut");
      }
      try {
        format = new SimpleDateFormat(pattern);
      }
      catch (IllegalArgumentException e) {
        throw new SAXException("<lucene:field> dateformat \"" + pattern +
                               "\" is not a valid date pattern", e);
      }
    }
    else if (type.equals("unstored")) {
      typeCode = TYPE_UNSTORED;
    }
    else if (type.equals("unindexed")) {
      typeCode = TYPE_UNINDEXED;
    }
    else {
      // an unknown type used to keep the previous field's type silently
      throw new SAXException("<lucene:field> element has an unknown type \"" +
                             type + "\"");
    }

    float boost = 1.0f;
    String boostValue = atts.getValue(LUCENE_FIELD_BOOST_ATTRIBUTE);
    if (boostValue != null) {
      try {
        boost = Float.parseFloat(boostValue);
      }
      catch (NumberFormatException e) {
        throw new SAXException("<lucene:field> boost \"" + boostValue +
                               "\" is not a number", e);
      }
    }

    this.fieldname = name;
    this.fieldtype = typeCode;
    this.df = format;
    this.fieldboost = boost;
    this.fieldvalue = new StringBuffer();
  }

  /**
   * Build the Lucene field from the collected text and add it to the
   * document being indexed.
   *
   * @throws SAXException if a date value cannot be parsed or the field type
   *         code is not supported
   */
  private void finishField() throws SAXException {
    String value = fieldvalue.toString();
    Field f = null;
    switch (fieldtype) {
      case TYPE_KEYWORD:
        f = Field.Keyword(fieldname, value);
        break;
      case TYPE_TEXT:
        f = Field.Text(fieldname, value);
        break;
      case TYPE_DATE:
        try {
          f = Field.Keyword(fieldname,
                            DateField.dateToString(df.parse(value)));
        }
        catch (ParseException ex) {
          throw new SAXException(ex);
        }
        break;
      case TYPE_UNSTORED:
        f = Field.UnStored(fieldname, value);
        break;
      case TYPE_UNINDEXED:
        f = Field.UnIndexed(fieldname, value);
        break;
      default:
        throw new SAXException("unsupported field type code " + fieldtype);
    }
    if (fieldboost != 1.0f) {
      f.setBoost(fieldboost);
    }
    bodyDocument.add(f);
    this.fieldvalue = null;
  }

  /**
   * Send the empty result element carrying the <code>nbdocuments</code>
   * attribute to the next pipeline component and reset the counter.
   */
  private void emitResult(String namespaceURI, String localName, String qName)
      throws SAXException {
    AttributesImpl attrs = new AttributesImpl();
    // SAX uses the empty string, not null, for "no namespace"
    attrs.addAttribute("", "nbdocuments",
                       "nbdocuments", "CDATA",
                       Integer.toString(nbdocuments));

    super.startElement(namespaceURI, localName, qName, attrs);
    super.endElement(namespaceURI, localName, qName);
    nbdocuments = 0;
  }

  /**
   * Initialize the IndexReader used for deletions. The directory is taken
   * from the element's directory attribute, or from the value resolved in
   * setup() when the attribute is absent.
   *
   * @param atts attributes of the &lt;lucene:delete&gt; element
   * @throws SAXException if the index cannot be opened
   */
  private void initReader(Attributes atts) throws SAXException {
    // directory parameter
    String directoryName =
        atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
    if (directoryName == null) {
      directoryName = this.directory;
    }
    // do not leak a reader left over from a previous, unfinished action
    closeReader();
    try {
      reader = IndexReader.open(directoryName);
    }
    catch (IOException e) {
      throw new SAXException(e);
    }
  }

  /**
   * Initialize the indexWriter from the attributes of the
   * &lt;lucene:index&gt; element. Attributes that are absent fall back to
   * the values resolved in setup().
   *
   * @param atts attributes of the &lt;lucene:index&gt; element
   * @throws SAXException if an attribute is invalid or the index cannot be
   *         opened
   */
  private void initIndex(Attributes atts) throws SAXException {

    // create base parameter
    String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
    boolean bCreate = sCreate != null &&
        (sCreate.equalsIgnoreCase("yes") ||
         sCreate.equalsIgnoreCase("true"));

    // analyzer parameter: attribute, then configured value, then built-in
    String analyzerName =
        atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
    if (analyzerName == null) {
      analyzerName = this.analyzerClassname;
    }
    if (analyzerName == null) {
      analyzerName = ANALYZER_CLASSNAME_DEFAULT;
    }
    Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerName);
    if (analyzer == null) {
      // guard in case the helper signals failure by returning null
      throw new SAXException("analyzer " + analyzerName +
                             " could not be instantiated");
    }

    // mergeFactor parameter
    String sMergeFactor =
        atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
    int factor = this.mergeFactor;
    if (sMergeFactor != null) {
      try {
        factor = Integer.parseInt(sMergeFactor);
      }
      catch (NumberFormatException e) {
        throw new SAXException("<lucene:index> merge-factor \"" +
                               sMergeFactor + "\" is not an integer", e);
      }
    }

    // directory parameter
    String directoryName =
        atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
    if (directoryName == null) {
      directoryName = this.directory;
    }
    openWriter(directoryName, analyzer, bCreate, factor);
  }

  /**
   * Open a IndexWriter, closing any writer that is still open first.
   *
   * @param directoryName path of the index directory
   * @param analyzer analyzer used to tokenize the indexed fields
   * @param create true to create (or overwrite) the index
   * @param mergeFactor value assigned to the writer's mergeFactor
   * @throws SAXException if the writer cannot be opened
   */
  private void openWriter(String directoryName, Analyzer analyzer, boolean create, int mergeFactor) throws SAXException {
    if (isWriterOpen()) {
      closeWriter();
    }
    try {
      writer = new IndexWriter(directoryName, analyzer, create);
      writer.setUseCompoundFile(true);
      writer.mergeFactor = mergeFactor;
    }
    catch (IOException e) {
      throw new SAXException(e);
    }
  }

  private boolean isReaderOpen() {
    return reader != null;
  }

  private boolean isWriterOpen() {
    return writer != null;
  }

  /**
   * Close the IndexReader. A failure is logged, not propagated, so that
   * cleanup in recycle()/dispose() always completes.
   */
  private void closeReader() {
    if (isReaderOpen()) {
      try {
        this.reader.close();
      }
      catch (IOException ioe) {
        this.getLogger().error("Close Reader Error: " + ioe.getMessage(), ioe);
      }
      this.reader = null;
    }
  }

  /**
   * Close the IndexWriter. A failure is logged, not propagated, so that
   * cleanup in recycle()/dispose() always completes.
   */
  private void closeWriter() {
    if (isWriterOpen()) {
      try {
        this.writer.close();
      }
      catch (IOException ioe) {
        this.getLogger().error("Close writer Error: " + ioe.getMessage(), ioe);
      }
      this.writer = null;
    }
  }
}

Reply via email to