Hi,
I updated an old, modified LuceneIndexTransformer
that I made last year.
This LuceneIndexTransformer is more general (more
low-level) and can be used to index all kinds of resources (not only HTML pages).
A typical indexing pipeline would be:
resource -> XSLT ->
LuceneIndexTransformer -> XML results
A sample:
<lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
create="true"
analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"
directory="d:/indexbase"
merge-factor="100">
<lucene:document
>
<lucene:field name="tile"
type="keyword">sqdqsdq</lucene:field>
<lucene:field name="description" type="text"> bla bal blalael balbal
</lucene:field>
<lucene:field name="date" type="date"
dateformat="MM/dd/yyyy">10/12/2002</lucene:field>
(see the Java
API class SimpleDateFormat for more explanation about the dateformat
attribute)
<lucene:field name="date" type="unstored" >just indexed information (not
stored)</lucene:field>
<lucene:field name="date" type="unindexed" >just stored information (not
indexed)</lucene:field>
</lucene:document>
<lucene:document>
<lucene:field name="author" type="keyword" boost="2">Mr Author
</lucene:field> (boost the field for the search (see Lucene
documentation))
<lucene:field name="langage"
type="keyword">french</lucene:field>
</lucene:document>
</lucene:index>
To delete documents
<lucene:delete directory="d:/indexbase"
>
<lucene:document
field="author" value="Mr Author"/> (delete all documents with the field
author ="Mr Author")
<lucene:document
field="id" value="1E3RFE"/>
</lucene:delete>
Example of output:
<lucene:index nbdocuments="2"/>
<lucene:delete
nbdocuments="1"/>
Maybe someone would be
interested...
Nicolas Maisonneuve
|
package org.apache.cocoon.transformation; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Map;
import org.apache.avalon.excalibur.pool.Recyclable; import org.apache.avalon.framework.activity.Disposable; import org.apache.avalon.framework.component.ComponentException; import org.apache.avalon.framework.component.ComponentManager; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.avalon.framework.context.Context; import org.apache.avalon.framework.context.ContextException; import org.apache.avalon.framework.context.Contextualizable; import org.apache.avalon.framework.parameters.Parameters; import org.apache.cocoon.Constants; import org.apache.cocoon.ProcessingException; import org.apache.cocoon.caching.CacheableProcessingComponent; import org.apache.cocoon.components.search.LuceneCocoonHelper; import org.apache.cocoon.environment.SourceResolver; import org.apache.cocoon.transformation.AbstractTransformer; import org.apache.excalibur.source.SourceValidity; import org.apache.excalibur.source.impl.validity.NOPValidity; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.DateField; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.*; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; /** * A lucene index creation transformer. 
* @author Nicolas Maisonneuve * <p><strong>Example of input source:</strong></p> <p> <ul><li> to Index<br> <lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0" <br/> create="true" <br/> analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"<br/> directory="d:/indexbase"<br> merge-factor="100"><br> <br/> <lucene:document ><br/> <lucene:field name="tile" type="keyword">sqdqsdq</lucene:field><br> <lucene:field name="description" type="text"> bla bal blalael balbal </lucene:field><br> <lucene:field name="date" type="date" dateformat="MM/dd/yyyy">10/12/2002</lucene:field> <em>(see java API Class SimpleDateFormat for more explanation about the dateFormat attribut)</em><br/> <lucene:field name="date" type="unstored" >just indexed information (not stored)</lucene:field><br> <lucene:field name="date" type="unindexed" >just stored information (not indexed)</lucene:field><br> </lucene:document><br> <p> <lucene:document><br> <lucene:field name="author" type="keyword" boost="2">Mr Author </lucene:field> <em>(boost the field for the search (see Lucene documentation))</em><br/> <lucene:field name="langage" type="keyword">french</lucene:field><br> </lucene:document><br> < /lucene:index></p></li> <li>To delete <br/> <p><lucene:delete directory="d:/indexbase" ><br> <lucene:document field="author" value="Mr Author"/> <em> (delete all documents with the field author ="Mr Author")</em><br><lucene:document field="id" value="1E3RFE"/> <br> < /lucene:delete></p> <p><strong>Example of Output Source</strong></p> <p><page xmlns:lucene="http://apache.org/cocoon/lucene/1.0"><br> < lucene:index nbdocuments="2"/><br> < lucene:delete nbdocuments="1"/><br> </li></ul> */ public class LuceneIndexTransformer extends AbstractTransformer implements Disposable, Recyclable, Configurable { public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname"; public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname"; public static final String 
DIRECTORY_CONFIG = "directory"; public static final String DIRECTORY_PARAMETER = "directory"; public static final String MERGE_FACTOR_CONFIG = "merge-factor"; public static final String MERGE_FACTOR_PARAMETER = "merge-factor"; public static final String DIRECTORY_DEFAULT = "index"; public static final int MERGE_FACTOR_DEFAULT = 20; public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer"; public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0"; public static final String LUCENE_QUERY_ELEMENT = "index"; public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer"; public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory"; public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create"; public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor"; public static final String LUCENE_DELETE_ELEMENT = "delete"; public static final String LUCENE_DOCUMENT_ELEMENT = "document"; public static final String LUCENE_DOCUMENT_FIELD_ATTRIBUTE = "field"; public static final String LUCENE_DOCUMENT_VALUE_ATTRIBUTE = "value"; public static final String LUCENE_FIELD_ELEMENT = "field"; public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name"; public static final String LUCENE_FIELD_TYPE_ATTRIBUTE = "type"; public static final String LUCENE_FIELD_DATEFORMAT_ATTRIBUTE = "dateformat"; public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost"; public static final int TYPE_KEYWORD = 1; public static final int TYPE_TEXT = 2; public static final int TYPE_DATE = 3; public static final int TYPE_UNSTORED = 4; public static final int TYPE_UNINDEXED = 5; // PROCESS STATE public static final int NO_PROCESSING = 0; public static final int INDEX_PROCESS = 1; public static final int DOCUMENT_INDEX_PROCESS = 2; public static final int FIELD_INDEX_PROCESS = 4; public static final int DELETE_PROCESS = 5; public static final int DELETING_PROCESS = 6; // Initialization 
time variables protected File workDir = null; protected int nbdocuments; protected int action; // Declaration time parameters values private String analyzerClassnameDefault; private String directoryDefault; private int mergeFactorDefault; // Invocation time parameters values private String analyzerClassname; private String directory; private int mergeFactor; // Runtime variables private int processing; private IndexWriter writer; private IndexReader reader; private Term term; private Document bodyDocument; private String fieldname; private int fieldtype; private float fieldboost; private StringBuffer fieldvalue; private SimpleDateFormat df; public void configure(Configuration conf) throws ConfigurationException { this.analyzerClassnameDefault = conf.getChild(ANALYZER_CLASSNAME_CONFIG) .getValue(ANALYZER_CLASSNAME_DEFAULT); this.mergeFactorDefault = conf.getChild(MERGE_FACTOR_CONFIG) .getValueAsInteger(MERGE_FACTOR_DEFAULT); this.directoryDefault = conf.getChild(DIRECTORY_CONFIG) .getValue(DIRECTORY_DEFAULT); } /** * Setup the transformer. */ public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters) throws ProcessingException, SAXException, IOException { // We don't need all this stuff this.analyzerClassname = parameters.getParameter( ANALYZER_CLASSNAME_PARAMETER, analyzerClassnameDefault); this.directory = parameters.getParameter(DIRECTORY_PARAMETER, directoryDefault); this.mergeFactor = parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, mergeFactorDefault); } public void recycle() { this.processing = NO_PROCESSING; closeWriter(); closeReader(); this.bodyDocument = null; } public void dispose() { closeWriter(); closeReader(); } /** * Generate the unique key. * This key must be unique inside the space of this component. * * @return The generated key */ public Serializable getKey() { return "1"; } /** * Generate the validity object. 
* * @return The generated validity object or <code>null</code> if the * component is currently not cacheable. */ public SourceValidity getValidity() { return NOPValidity.SHARED_INSTANCE; } public void startDocument() throws SAXException { super.startDocument(); } public void endDocument() throws SAXException { super.endDocument(); } /** * Begin the scope of a prefix-URI Namespace mapping. * * @param prefix The Namespace prefix being declared. * @param uri The Namespace URI the prefix is mapped to. */ public void startPrefixMapping(String prefix, String uri) throws SAXException { if (processing == 0) { super.startPrefixMapping(prefix, uri); } } /** * End the scope of a prefix-URI mapping. * * @param prefix The prefix that was being mapping. */ public void endPrefixMapping(String prefix) throws SAXException { if (processing == 0) { super.endPrefixMapping(prefix); } } public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { System.out.println("START processing: " + processing + " " + localName); if (LUCENE_URI.equals(namespaceURI)) { if (processing == NO_PROCESSING) { // INDEX ACTION if (LUCENE_QUERY_ELEMENT.equals(localName)) { initIndex(atts); processing = INDEX_PROCESS; } // DELETE ACTION else if (LUCENE_DELETE_ELEMENT.equals(localName)) { initReader(atts); processing = DELETE_PROCESS; } else { throw new SAXException("element " + localName + " unknown "); } } else if (processing == INDEX_PROCESS) { // NEW DOCUMENT TO INDEX if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) { this.bodyDocument = new Document(); processing = DOCUMENT_INDEX_PROCESS; } else { throw new SAXException("element " + localName + " is not a <lucene:document> element "); } } else if (processing == DELETE_PROCESS) { // DOCUMENT TO DELETED if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) { fieldname = atts.getValue(LUCENE_DOCUMENT_FIELD_ATTRIBUTE); if (fieldname == null || fieldname.equals("")) { throw new SAXException( "<lucene:document> 
element must contain field attribute"); } String value=atts.getValue(LUCENE_DOCUMENT_VALUE_ATTRIBUTE); if (value==null) { throw new SAXException( "<lucene:document> element must contain value attribute"); } this.term = new Term(fieldname,value); processing = DELETING_PROCESS; } else { throw new SAXException("element " + localName + " is not a <lucene:document> element "); } } else if (processing == DOCUMENT_INDEX_PROCESS) { if (LUCENE_FIELD_ELEMENT.equals(localName)) { this.fieldname = atts.getValue(LUCENE_FIELD_NAME_ATTRIBUTE); if (this.fieldname == null || this.fieldname.equals("")) { throw new SAXException( "<lucene:field> element must contain name attribut"); } this.fieldvalue = new StringBuffer(); String fieldtype = atts.getValue(LUCENE_FIELD_TYPE_ATTRIBUTE); if (fieldtype == null || fieldtype.equals("")) { throw new SAXException( "<lucene:field> element must contain a type attribut"); } if (fieldtype.equals("keyword")) { this.fieldtype = TYPE_KEYWORD; } else if (fieldtype.equals("text")) { this.fieldtype = TYPE_TEXT; } else if (fieldtype.equals("date")) { this.fieldtype = TYPE_DATE; String pattern = atts.getValue(LUCENE_FIELD_DATEFORMAT_ATTRIBUTE); if (pattern == null || pattern.equals("")) { throw new SAXException( "<lucene:field type=\"date\"> element must contain a dateformat attribut"); } df = new SimpleDateFormat(pattern); } else if (fieldtype.equals("unstored")) { this.fieldtype = TYPE_UNSTORED; } else if (fieldtype.equals("unindexed")) { this.fieldtype = TYPE_UNINDEXED; } String fieldboostS = atts.getValue(LUCENE_FIELD_BOOST_ATTRIBUTE); if (fieldboostS == null) { this.fieldboost = 1.0f; } else { this.fieldboost = Float.parseFloat(fieldboostS); } processing = FIELD_INDEX_PROCESS; } else { throw new SAXException("</lucene:" + this.LUCENE_FIELD_ELEMENT + " was expected!"); } } } else { super.startElement(namespaceURI, localName, qName, atts); } } public void endElement(String namespaceURI, String localName, String qName) throws SAXException { 
System.out.println("END: processing: " + processing + " " + localName); if (LUCENE_URI.equals(namespaceURI)) { if (processing == INDEX_PROCESS) { if (LUCENE_QUERY_ELEMENT.equals(localName)) { // END OF THE INDEXATION AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(null, "nbdocuments", "nbdocuments", "CDATA", Integer.toString(nbdocuments)); super.startElement(namespaceURI, localName, qName, attrs); super.endElement(namespaceURI, localName, qName); nbdocuments = 0; try { this.writer.optimize(); } catch (IOException e) { throw new SAXException(e); } this.closeWriter(); this.processing = NO_PROCESSING; } } else if (processing == DOCUMENT_INDEX_PROCESS) { if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) { // END OF A DOCUMENT try { this.writer.addDocument(this.bodyDocument); nbdocuments++; this.bodyDocument = null; } catch (IOException e) { throw new SAXException(e); } this.processing = INDEX_PROCESS; } else { throw new SAXException("element " + localName + "unknown!"); } } else if (processing == DELETE_PROCESS) { if (LUCENE_DELETE_ELEMENT.equals(localName)) { // END OF THE DELETE AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(null, "nbdocuments", "nbdocuments", "CDATA", Integer.toString(nbdocuments)); super.startElement(namespaceURI, localName, qName, attrs); super.endElement(namespaceURI, localName, qName); nbdocuments = 0; closeReader(); this.processing = NO_PROCESSING; } else { throw new SAXException("</lucene:" + this.LUCENE_DELETE_ELEMENT + " was expected!"); } } else if (processing == DELETING_PROCESS) { if (LUCENE_DOCUMENT_ELEMENT.equals(localName)) { try { nbdocuments += reader.delete(this.term); } catch (IOException e) { throw new SAXException(e); } this.processing = DELETE_PROCESS; } else { throw new SAXException("</lucene:" + this.LUCENE_DOCUMENT_ELEMENT + " was expected!"); } } else if (processing == FIELD_INDEX_PROCESS) { if (LUCENE_FIELD_ELEMENT.equals(localName)) { Field f = null; // add Field switch (fieldtype) { case 
TYPE_KEYWORD: f = Field.Keyword(fieldname, fieldvalue.toString()); break; case TYPE_TEXT: f = Field.Text(fieldname, fieldvalue.toString()); break; case TYPE_DATE: try { f = Field.Keyword(fieldname, DateField.dateToString(df.parse(fieldvalue.toString()))); } catch (ParseException ex) { throw new SAXException(ex); } break; case TYPE_UNSTORED: f = Field.UnStored(fieldname, fieldvalue.toString()); break; case TYPE_UNINDEXED: f = Field.UnIndexed(fieldname, fieldvalue.toString()); break; } if (fieldboost != 1.0f) { f.setBoost(fieldboost); } bodyDocument.add(f); processing = DOCUMENT_INDEX_PROCESS; } else { throw new SAXException("</lucene:" + this.LUCENE_FIELD_ELEMENT + " was expected!"); } } } else { super.endElement(namespaceURI, localName, qName); } } public void characters(char[] ch, int start, int length) throws SAXException { if (processing == FIELD_INDEX_PROCESS) { this.fieldvalue.append(ch, start, length); } else { super.characters(ch, start, length); } } /** * Initialize the IndexReader * @param atts * @throws SAXException */ private void initReader(Attributes atts) throws SAXException { // directory parameter String directoryName = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE); if (directoryName == null) { directoryName = this.directory; } try { reader = IndexReader.open(directoryName); } catch (IOException e) { throw new SAXException(e); } } /** * Initialize the indexWriter * @param atts */ private void initIndex(Attributes atts) throws SAXException { // create base parameter String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE); boolean bCreate = sCreate != null && (sCreate.equalsIgnoreCase("yes") || sCreate.equalsIgnoreCase("true")); // analyzer parameter String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE); if (analyzerClassname == null) { analyzerClassname = this.ANALYZER_CLASSNAME_DEFAULT; } Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerClassname); // mergeFactor parameter String sMergeFactor = 
atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE); int mergeFactor = this.mergeFactor; if (sMergeFactor != null) { mergeFactor = Integer.parseInt(sMergeFactor); } // directory parameter String directoryName = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE); if (directoryName == null) { directoryName = this.directory; } openWriter(directoryName, analyzer, bCreate, mergeFactor); } /** * Open a IndexWriter * @param directoryName * @param analyzer * @param create * @param mergeFactor */ private void openWriter(String directoryName, Analyzer analyzer, boolean create, int mergeFactor) throws SAXException { if (isWriterOpen()) { closeWriter(); } try { writer = new IndexWriter(directoryName, analyzer, create); writer.setUseCompoundFile(true); writer.mergeFactor = mergeFactor; } catch (IOException e) { throw new SAXException(e); } } private boolean isReaderOpen() { return reader != null; } private boolean isWriterOpen() { return writer != null; } /** * Close the IndexReader */ private void closeReader() { if (isReaderOpen()) { try { this.reader.close(); } catch (IOException ioe) { this.getLogger().error("Close Reader Error: " + ioe.getMessage()); } this.reader = null; } } /** * CLose the IndexWriter */ private void closeWriter() { if (isWriterOpen()) { try { this.writer.close(); } catch (IOException ioe) { this.getLogger().error("Close writer Error: " + ioe.getMessage()); } this.writer = null; } } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]