A little bit intricate!
I propose a change for class TextContentIndexer.
To build a reader from the revisionContent, this code is used:
Reader reader = ((ContentExtractor)extractor.get(i)).extract(new
ByteArrayInputStream(revisionContent.getContentBytes()));
This seems a little bit intricate!
I propose :
Reader reader =
((ContentExtractor)extractor.get(i)).extract(revisionContent.streamContent());
This code is simpler and will save a huge amount of memory when the
revision content is built from a File (I use a FileInputStream).
Notice that I use a file (and not a byte array) to pass data from Slide to
Lucene, as proposed by Jimmy Monin several months ago, and I can index
very large files without an "OutOfMemory" exception.
Enjoy (as usual ......)
B DOREL
/*
* $Header:
/home/cvspublic/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v
1.9 2005/04/04 13:56:59 luetzkendorf Exp $
* $Revision: 1.9 $
* $Date: 2005/04/04 13:56:59 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* V1.1 BD/EADS le 30/04/07 :
* MOSS 0607-3637 Delai de restauration trop long passer par API
* Simplification "évidente" de l'appel à ContentExtractor.extract
*
*/
package org.apache.slide.index;
import org.apache.slide.search.IndexException;
import org.apache.slide.search.basic.IBasicExpressionFactory;
import org.apache.slide.util.logger.Logger;
import org.apache.slide.common.*;
import org.apache.slide.content.NodeRevisionDescriptors;
import org.apache.slide.content.NodeRevisionNumber;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.store.IndexStore;
import org.apache.slide.extractor.ExtractorManager;
import org.apache.slide.extractor.ExtractorException;
import org.apache.slide.extractor.ContentExtractor;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
import java.io.CharArrayReader;
import java.io.ByteArrayInputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
/**
* Lucene based IndexStore for indexing content.
* Apart from indexing the content as text field it adds
* indexes using the registered content extractor.
*/
public class TextContentIndexer extends XAServiceBase implements IndexStore {
private static final String INDEX_PATH = "indexpath";
private static final String INCLUDES = "includes";
private static final String ANALYZER = "analyzer";
public static final String URI_FIELD = "uri";
public static final String CONTENT_TEXT = "content";
private String indexpath = "";
private Collection includes;
private String analyzerClassName;
private Analyzer analyzer;
private boolean started = false;
/**
*
* Constructeur
*
*/
public TextContentIndexer(){
super();
}
/**
* Create Index, if not yet done.
*
* @param token a NamespaceAccessToken
*
* @throws org.apache.slide.common.ServiceInitializationFailedException
*
*/
public void initialize(NamespaceAccessToken token)
throws ServiceInitializationFailedException
{
initAnalyzer();
IndexWriter indexWriter = null;
try
{
indexWriter = new IndexWriter(indexpath, analyzer, false);
}
// will fail, if not yet exists
catch (IOException e)
{
try
{
// create index
indexWriter = new IndexWriter(indexpath, analyzer, true);
}
catch (IOException ex)
{
getLogger().log("Error while initializing the Lucene index " +
e.getMessage(), LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, ex);
}
}
try
{
indexWriter.close();
}
catch (IOException e)
{
getLogger().log("Error while initializing the Lucene index " +
e.getMessage(), LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException (this, e);
}
getLogger().log("Lucene is correctly initialized", LOG_CHANNEL,
Logger.INFO);
}
/**
* Index an object content.
*
* @param uri Uri
* @exception IndexException Error accessing the Data Source
*/
synchronized public void createIndex (Uri uri,
NodeRevisionDescriptor
revisionDescriptor,
NodeRevisionContent revisionContent)
throws IndexException
{
if (!isIncluded(uri.toString())) return;
IndexWriter indexWriter = null;
try
{
indexWriter = new IndexWriter(indexpath, analyzer, false);
// Create document
Document doc = new Document();
doc.add(Field.Keyword(URI_FIELD, uri.toString()));
doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor,
revisionContent)));
if ( revisionContent != null && revisionDescriptor != null ) {
List extractor =
ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(),
(NodeRevisionDescriptors)null, revisionDescriptor);
for ( int i = 0, l = extractor.size(); i < l; i++ ) {
// Reader reader =
((ContentExtractor)extractor.get(i)).extract(new
ByteArrayInputStream(revisionContent.getContentBytes()));
Reader reader =
((ContentExtractor)extractor.get(i)).extract(revisionContent.streamContent());
doc.add(Field.Text(CONTENT_TEXT, reader));
}
}
indexWriter.addDocument(doc);
indexWriter.optimize();
getLogger().log(
"Added '" + uri.toString() + " - " +
revisionDescriptor.getRevisionNumber().toString() + "' to index",
LOG_CHANNEL,
Logger.INFO);
}
catch (IOException e)
{
getLogger().log(
"Error creating an index with " + uri.toString() + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
catch( ExtractorException e)
{
getLogger().log(
"Error extracting content from " + uri.toString() + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
finally
{
try
{
if(indexWriter != null)
indexWriter.close();
}
catch(IOException ioe ) {}
}
}
/**
* Method updateIndex
*
* @param uri an Uri
* @param revisionDescriptor a NodeRevisionDescriptor
* @param revisionContent a NodeRevisionContent
*
* @throws IndexException
*
*/
synchronized public void updateIndex(Uri uri,
NodeRevisionDescriptor
revisionDescriptor,
NodeRevisionContent revisionContent)
throws IndexException
{
if (!isIncluded(uri.toString())) return;
IndexWriter indexWriter = null;
try
{
// Delete entries from index
IndexReader indexReader = IndexReader.open(indexpath);
Term term = new Term(URI_FIELD, uri.toString());
indexReader.delete(term);
indexReader.close();
indexWriter = new IndexWriter(indexpath, analyzer, false);
// Create document
Document doc = new Document();
doc.add(Field.Keyword(URI_FIELD, uri.toString()));
doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor,
revisionContent)));
if ( revisionContent != null && revisionDescriptor != null ) {
List extractor =
ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(),
(NodeRevisionDescriptors)null, revisionDescriptor);
for ( int i = 0, l = extractor.size(); i < l; i++ ) {
// Reader reader =
((ContentExtractor)extractor.get(i)).extract(new
ByteArrayInputStream(revisionContent.getContentBytes()));
Reader reader =
((ContentExtractor)extractor.get(i)).extract(revisionContent.streamContent());
doc.add(Field.Text(CONTENT_TEXT, reader));
}
}
indexWriter.addDocument(doc);
indexWriter.optimize();
if (getLogger().isEnabled(Logger.DEBUG)) {
getLogger().log(
"Updated '" + uri + " - " +
revisionDescriptor.getRevisionNumber() + "' to index",
LOG_CHANNEL,
Logger.DEBUG);
}
}
catch (IOException e)
{
getLogger().log(
"Error updating the index with " + uri + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
catch( ExtractorException e)
{
getLogger().log(
"Error extracting content from " + uri + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
finally
{
try
{
if(indexWriter != null)
indexWriter.close();
}
catch(IOException ioe ) {}
}
}
/**
* Drop an object revision from the index.
*
* @param uri Uri
* @exception IndexException
*/
synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
throws IndexException
{
if (!isIncluded(uri.toString())) return;
if (number == NodeRevisionNumber.HIDDEN_0_0) return;
IndexWriter indexWriter = null;
try
{
IndexReader indexReader = IndexReader.open(indexpath);
Term term = new Term(URI_FIELD, uri.toString());
indexReader.delete(term);
indexReader.close();
indexWriter = new IndexWriter(indexpath, analyzer, false);
indexWriter.optimize();
if (getLogger().isEnabled(Logger.DEBUG)) {
getLogger().log(
"Deleted '" + uri + "' from the index",
LOG_CHANNEL,
Logger.DEBUG);
}
}
catch (IOException e)
{
getLogger().log("Impossible to delete " + uri + " - " + number + "
from the Lucene index");
}
finally
{
try
{
if(indexWriter != null)
indexWriter.close();
}
catch(IOException ioe ) {}
}
}
/**
* Method getFactory
*
* @return an IBasicExpressionFactory
*
*/
public IBasicExpressionFactory getBasicExpressionFactory()
{
return new TextContainsExpressionFactory(indexpath, analyzer);
}
/**
* Connects to the underlying data source (if any is needed).
*
* @exception ServiceConnectionFailedException Connection failed
*/
public void connect() throws ServiceConnectionFailedException
{
getLogger().log(
"TextContentIndexer: connect",
LOG_CHANNEL,
Logger.INFO);
started = true;
}
/**
* This function tells whether or not the service is connected.
*
* @return boolean true if we are connected
* @exception ServiceAccessException Service access error
*/
public boolean isConnected() throws ServiceAccessException
{
return started;
}
/**
* Parametrize the service. This index store expects a parameter
* "indexpath" to contain the path to the directory to store the index.
* Another optional parameter "includes" lists the paths of resources
* that are to be indexed in a comma-separated format.
* Everything under an included path is indexed. If not specified all
* resources will be indexed.
*
* @param parameters Hashtable containing the parameters' names
* and associated values
* @exception ServiceParameterErrorException Incorrect service parameter
* @exception ServiceParameterMissingException Service parameter missing
*/
public void setParameters (Hashtable parameters) throws
ServiceParameterErrorException, ServiceParameterMissingException
{
indexpath = (String)parameters.get (INDEX_PATH);
if (indexpath == null || indexpath.length() == 0) {
throw new ServiceParameterMissingException (this, INDEX_PATH);
}
String includes = (String) parameters.get(INCLUDES);
if (includes != null && includes.length() > 0) {
StringTokenizer tokenizer = new StringTokenizer(includes, ",");
this.includes = new ArrayList(tokenizer.countTokens());
while (tokenizer.hasMoreTokens()) {
this.includes.add(tokenizer.nextToken());
}
}
analyzerClassName = (String)parameters.get (ANALYZER);
}
/**
* Disconnects from the underlying data source.
*
* @exception ServiceDisconnectionFailedException Disconnection failed
*/
public void disconnect() throws ServiceDisconnectionFailedException
{
getLogger().log(
"TextContentIndexer: disconnect",
LOG_CHANNEL,
Logger.INFO);
started = false;
}
/**
* Deletes service underlying data source, if possible (and meaningful).
*
* @exception ServiceResetFailedException Reset failed
*/
public void reset() throws ServiceResetFailedException
{
getLogger().log(
"TextContentIndexer: reset",
LOG_CHANNEL,
Logger.INFO);
}
protected Reader readContent(NodeRevisionDescriptor revisionDescriptor,
NodeRevisionContent revisionContent) throws
IOException {
return new CharArrayReader (revisionContent.getContent());
}
protected boolean isIncluded(String uri) {
if (includes == null) return true;
Iterator iter = includes.iterator();
while (iter.hasNext()) {
if (uri.startsWith((String) iter.next())) {
return true;
}
}
return false;
}
protected void initAnalyzer() throws ServiceInitializationFailedException {
if (analyzerClassName == null || analyzerClassName.length() == 0) {
getLogger().log("using Lucene StandardAnalyzer", LOG_CHANNEL,
Logger.INFO);
analyzer = new StandardAnalyzer();
} else {
getLogger().log("loading Lucene analyzer: " + analyzerClassName,
LOG_CHANNEL, Logger.INFO);
try {
Class analyzerClazz = Class.forName(analyzerClassName);
analyzer = (Analyzer)analyzerClazz.newInstance();
} catch (ClassNotFoundException cnfe) {
getLogger().log("Error while instantiating analyzer " +
analyzerClassName + cnfe.getMessage(),
LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, cnfe);
} catch (InstantiationException ie) {
getLogger().log("Error while instantiating analyzer " +
analyzerClassName + ie.getMessage(),
LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, ie);
} catch (IllegalAccessException iae) {
getLogger().log("Error while instantiating analyzer " +
analyzerClassName + iae.getMessage(),
LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, iae);
}
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]