A little bit intricate!
I propose a change for class TextContentIndexer.
To build a reader from the revisionContent, this code is used:
Reader reader = ((ContentExtractor)extractor.get(i)).extract(new
ByteArrayInputStream(revisionContent.getContentBytes()));
This seems a little bit intricate!
I propose :
Reader reader =
((ContentExtractor)extractor.get(i)).extract(revisionContent.streamContent());
This code is simpler and will save a huge amount of memory when the
revision content is built from a File (I use a FileInputStream).
Notice that I use a file (and not a byte array) to pass data from Slide to
Lucene, as proposed by Jimmy Monin several months ago, and I can index
very large files without an "OutOfMemory" exception.
Enjoy (as usual ......)
B DOREL
/*
* $Header:
/home/cvspublic/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v
1.9 2005/04/04 13:56:59 luetzkendorf Exp $
* $Revision: 1.9 $
* $Date: 2005/04/04 13:56:59 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* V1.1 BD/EADS le 30/04/07 :
* MOSS 0607-3637 Delai de restauration trop long passer par API
* Simplification "évidente" de l'appel à ContentExtractor.extract
*
*/
package org.apache.slide.index;
import org.apache.slide.search.IndexException;
import org.apache.slide.search.basic.IBasicExpressionFactory;
import org.apache.slide.util.logger.Logger;
import org.apache.slide.common.*;
import org.apache.slide.content.NodeRevisionDescriptors;
import org.apache.slide.content.NodeRevisionNumber;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.store.IndexStore;
import org.apache.slide.extractor.ExtractorManager;
import org.apache.slide.extractor.ExtractorException;
import org.apache.slide.extractor.ContentExtractor;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
import java.io.CharArrayReader;
import java.io.ByteArrayInputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
/**
* Lucene based IndexStore for indexing content.
* Apart from indexing the content as text field it adds
* indexes using the registered content extractor.
*/
public class TextContentIndexer extends XAServiceBase implements IndexStore {
private static final String INDEX_PATH = "indexpath";
private static final String INCLUDES = "includes";
private static final String ANALYZER = "analyzer";
public static final String URI_FIELD = "uri";
public static final String CONTENT_TEXT = "content";
private String indexpath = "";
private Collection includes;
private String analyzerClassName;
private Analyzer analyzer;
private boolean started = false;
/**
*
* Constructeur
*
*/
public TextContentIndexer(){
super();
}
/**
* Create Index, if not yet done.
*
* @param token a NamespaceAccessToken
*
* @throws org.apache.slide.common.ServiceInitializationFailedException
*
*/
public void initialize(NamespaceAccessToken token)
throws ServiceInitializationFailedException
{
initAnalyzer();
IndexWriter indexWriter = null;
try
{
indexWriter = new IndexWriter(indexpath, analyzer, false);
}
// will fail, if not yet exists
catch (IOException e)
{
try
{
// create index
indexWriter = new IndexWriter(indexpath, analyzer, true);
}
catch (IOException ex)
{
getLogger().log("Error while initializing the Lucene index " +
e.getMessage(), LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, ex);
}
}
try
{
indexWriter.close();
}
catch (IOException e)
{
getLogger().log("Error while initializing the Lucene index " +
e.getMessage(), LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException (this, e);
}
getLogger().log("Lucene is correctly initialized", LOG_CHANNEL,
Logger.INFO);
}
/**
* Index an object content.
*
* @param uri Uri
* @exception IndexException Error accessing the Data Source
*/
synchronized public void createIndex (Uri uri,
NodeRevisionDescriptor
revisionDescriptor,
NodeRevisionContent revisionContent)
throws IndexException
{
if (!isIncluded(uri.toString())) return;
IndexWriter indexWriter = null;
try
{
indexWriter = new IndexWriter(indexpath, analyzer, false);
// Create document
Document doc = new Document();
doc.add(Field.Keyword(URI_FIELD, uri.toString()));
doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor,
revisionContent)));
if ( revisionContent != null && revisionDescriptor != null ) {
List extractor =
ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(),
(NodeRevisionDescriptors)null, revisionDescriptor);
for ( int i = 0, l = extractor.size(); i < l; i++ ) {
// Reader reader =
((ContentExtractor)extractor.get(i)).extract(new
ByteArrayInputStream(revisionContent.getContentBytes()));
Reader reader =
((ContentExtractor)extractor.get(i)).extract(revisionContent.streamContent());
doc.add(Field.Text(CONTENT_TEXT, reader));
}
}
indexWriter.addDocument(doc);
indexWriter.optimize();
getLogger().log(
"Added '" + uri.toString() + " - " +
revisionDescriptor.getRevisionNumber().toString() + "' to index",
LOG_CHANNEL,
Logger.INFO);
}
catch (IOException e)
{
getLogger().log(
"Error creating an index with " + uri.toString() + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
catch( ExtractorException e)
{
getLogger().log(
"Error extracting content from " + uri.toString() + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
finally
{
try
{
if(indexWriter != null)
indexWriter.close();
}
catch(IOException ioe ) {}
}
}
/**
* Method updateIndex
*
* @param uri an Uri
* @param revisionDescriptor a NodeRevisionDescriptor
* @param revisionContent a NodeRevisionContent
*
* @throws IndexException
*
*/
synchronized public void updateIndex(Uri uri,
NodeRevisionDescriptor
revisionDescriptor,
NodeRevisionContent revisionContent)
throws IndexException
{
if (!isIncluded(uri.toString())) return;
IndexWriter indexWriter = null;
try
{
// Delete entries from index
IndexReader indexReader = IndexReader.open(indexpath);
Term term = new Term(URI_FIELD, uri.toString());
indexReader.delete(term);
indexReader.close();
indexWriter = new IndexWriter(indexpath, analyzer, false);
// Create document
Document doc = new Document();
doc.add(Field.Keyword(URI_FIELD, uri.toString()));
doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor,
revisionContent)));
if ( revisionContent != null && revisionDescriptor != null ) {
List extractor =
ExtractorManager.getInstance().getContentExtractors(uri.getNamespace().getName(),
(NodeRevisionDescriptors)null, revisionDescriptor);
for ( int i = 0, l = extractor.size(); i < l; i++ ) {
// Reader reader =
((ContentExtractor)extractor.get(i)).extract(new
ByteArrayInputStream(revisionContent.getContentBytes()));
Reader reader =
((ContentExtractor)extractor.get(i)).extract(revisionContent.streamContent());
doc.add(Field.Text(CONTENT_TEXT, reader));
}
}
indexWriter.addDocument(doc);
indexWriter.optimize();
if (getLogger().isEnabled(Logger.DEBUG)) {
getLogger().log(
"Updated '" + uri + " - " +
revisionDescriptor.getRevisionNumber() + "' to index",
LOG_CHANNEL,
Logger.DEBUG);
}
}
catch (IOException e)
{
getLogger().log(
"Error updating the index with " + uri + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
catch( ExtractorException e)
{
getLogger().log(
"Error extracting content from " + uri + " - " +
revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
}
finally
{
try
{
if(indexWriter != null)
indexWriter.close();
}
catch(IOException ioe ) {}
}
}
/**
* Drop an object revision from the index.
*
* @param uri Uri
* @exception IndexException
*/
synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
throws IndexException
{
if (!isIncluded(uri.toString())) return;
if (number == NodeRevisionNumber.HIDDEN_0_0) return;
IndexWriter indexWriter = null;
try
{
IndexReader indexReader = IndexReader.open(indexpath);
Term term = new Term(URI_FIELD, uri.toString());
indexReader.delete(term);
indexReader.close();
indexWriter = new IndexWriter(indexpath, analyzer, false);
indexWriter.optimize();
if (getLogger().isEnabled(Logger.DEBUG)) {
getLogger().log(
"Deleted '" + uri + "' from the index",
LOG_CHANNEL,
Logger.DEBUG);
}
}
catch (IOException e)
{
getLogger().log("Impossible to delete " + uri + " - " + number + "
from the Lucene index");
}
finally
{
try
{
if(indexWriter != null)
indexWriter.close();
}
catch(IOException ioe ) {}
}
}
/**
* Method getFactory
*
* @return an IBasicExpressionFactory
*
*/
public IBasicExpressionFactory getBasicExpressionFactory()
{
return new TextContainsExpressionFactory(indexpath, analyzer);
}
/**
* Connects to the underlying data source (if any is needed).
*
* @exception ServiceConnectionFailedException Connection failed
*/
public void connect() throws ServiceConnectionFailedException
{
getLogger().log(
"TextContentIndexer: connect",
LOG_CHANNEL,
Logger.INFO);
started = true;
}
/**
* This function tells whether or not the service is connected.
*
* @return boolean true if we are connected
* @exception ServiceAccessException Service access error
*/
public boolean isConnected() throws ServiceAccessException
{
return started;
}
/**
* Parametrize the service. This index store expects a parameter
* "indexpath" to contain the path to the directory to store the index.
* Another optional parameter "includes" lists the paths of resources
* that are to be indexed in a comma-separated format.
* Everything under an included path is indexed. If not specified all
* resources will be indexed.
*
* @param parameters Hashtable containing the parameters' names
* and associated values
* @exception ServiceParameterErrorException Incorrect service parameter
* @exception ServiceParameterMissingException Service parameter missing
*/
public void setParameters (Hashtable parameters) throws
ServiceParameterErrorException, ServiceParameterMissingException
{
indexpath = (String)parameters.get (INDEX_PATH);
if (indexpath == null || indexpath.length() == 0) {
throw new ServiceParameterMissingException (this, INDEX_PATH);
}
String includes = (String) parameters.get(INCLUDES);
if (includes != null && includes.length() > 0) {
StringTokenizer tokenizer = new StringTokenizer(includes, ",");
this.includes = new ArrayList(tokenizer.countTokens());
while (tokenizer.hasMoreTokens()) {
this.includes.add(tokenizer.nextToken());
}
}
analyzerClassName = (String)parameters.get (ANALYZER);
}
/**
* Disconnects from the underlying data source.
*
* @exception ServiceDisconnectionFailedException Disconnection failed
*/
public void disconnect() throws ServiceDisconnectionFailedException
{
getLogger().log(
"TextContentIndexer: disconnect",
LOG_CHANNEL,
Logger.INFO);
started = false;
}
/**
* Deletes service underlying data source, if possible (and meaningful).
*
* @exception ServiceResetFailedException Reset failed
*/
public void reset() throws ServiceResetFailedException
{
getLogger().log(
"TextContentIndexer: reset",
LOG_CHANNEL,
Logger.INFO);
}
protected Reader readContent(NodeRevisionDescriptor revisionDescriptor,
NodeRevisionContent revisionContent) throws
IOException {
return new CharArrayReader (revisionContent.getContent());
}
protected boolean isIncluded(String uri) {
if (includes == null) return true;
Iterator iter = includes.iterator();
while (iter.hasNext()) {
if (uri.startsWith((String) iter.next())) {
return true;
}
}
return false;
}
protected void initAnalyzer() throws ServiceInitializationFailedException {
if (analyzerClassName == null || analyzerClassName.length() == 0) {
getLogger().log("using Lucene StandardAnalyzer", LOG_CHANNEL,
Logger.INFO);
analyzer = new StandardAnalyzer();
} else {
getLogger().log("loading Lucene analyzer: " + analyzerClassName,
LOG_CHANNEL, Logger.INFO);
try {
Class analyzerClazz = Class.forName(analyzerClassName);
analyzer = (Analyzer)analyzerClazz.newInstance();
} catch (ClassNotFoundException cnfe) {
getLogger().log("Error while instantiating analyzer " +
analyzerClassName + cnfe.getMessage(),
LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, cnfe);
} catch (InstantiationException ie) {
getLogger().log("Error while instantiating analyzer " +
analyzerClassName + ie.getMessage(),
LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, ie);
} catch (IllegalAccessException iae) {
getLogger().log("Error while instantiating analyzer " +
analyzerClassName + iae.getMessage(),
LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, iae);
}
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]