Ryan Ackley wrote:

Thanks Sergiu,

You should also post to the Lucene Users list.

-Ryan



I did it from the beginning. But I want to report a bug in this code. My colleagues reported to me
that it is possible to get an OutOfMemoryError for a PPT file they have. I will try to debug this in the next days.


 Sergiu



----- Original Message ----- From: "Sergiu Gordea" <[EMAIL PROTECTED]>
To: "Lucene Users List" <[EMAIL PROTECTED]>;
<[EMAIL PROTECTED]>
Cc: "POI Users List" <[EMAIL PROTECTED]>
Sent: Friday, June 25, 2004 8:42 AM
Subject: Index MSOffice Documents





Hi all,

I'm working on a project in which we are building a knowledge
management platform. We are using Turbine/Velocity
as framework and we are using lucene for search.

We want the search to be able to index MS Office documents,
therefore I was looking for ways to extract the text from these
documents. I found some examples based on the POI library
(http://jakarta.apache.org/poi) and I adapted them to our needs.
I think the extraction of the text elements from XLS files is reliable
(the POI development community did a great job with the package that
works with XLS files). The examples that extract the text from DOC and
PPT files are not very general; I think they have problems with
documents written with special charsets, but they work well on the
documents I use. I hope someone who has more experience than I have
will improve this and will contribute better source code.

Congratulations to all the people involved in the development of the Jakarta
project and its subprojects,

Sergiu Gordea

PS: ExeConverterImpl uses an external stand-alone application (like
antiword or pdf2txt) to extract the text.





----------------------------------------------------------------------------
----




/* @(#) CWK 1.4 07.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/

package com.configworks.cwk.be.search.converters;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;

/**
* Class description
*
* @author sergiu
* @version 1.0
* @since CWK 1.5
*/
public class XLSConverterImpl extends JavaDocumentConverter {

   private Log logger = null;
   File dest = null;



public boolean extractText(InputStream reader, BufferedWriter writer)


throws FileNotFoundException,


       IOException {

       HSSFWorkbook workbook = new HSSFWorkbook(reader);

       for (int k = 0; k < workbook.getNumberOfSheets(); k++) {
           HSSFSheet sheet = workbook.getSheetAt(k);

if (sheet != null) {
int rows = sheet.getLastRowNum();
//I don't know why the last row = sheet.getRow(rows) and


first row = sheet.getRow(0)


               for (int r = 0; r <= rows; r++) {
               HSSFRow row = sheet.getRow(r);
               if (row != null) {
                   int cells = row.getLastCellNum();
                   for (int c = 0; c <= cells; c++) {
                   HSSFCell cell = row.getCell((short) c);
                   String value = null;
                   if (cell != null) {
                       switch (cell.getCellType()) {
                           case HSSFCell.CELL_TYPE_FORMULA:
                               value = cell.getCellFormula();
                               break;
                           case HSSFCell.CELL_TYPE_STRING:
                               value = cell.getStringCellValue();
                               break;
                           case HSSFCell.CELL_TYPE_NUMERIC:
                               value = "" + cell.getNumericCellValue();
                               break;
                           default:
                               value = cell.getStringCellValue();
                       }
                   }
                   if (value != null) {
                       writer.write(value + " ");
                   }
               }//cels
               }
           }//rows
           }
       }//sheets

//if no Exception was thrown consider that the conversion was


successful


       return true;
   }

   /**
    * @return Returns the logger.
    */
   public Log getLogger() {
       if (logger == null)
           logger = LogFactory.getLog(XLSConverterImpl.class);
       return logger;
   }

}







----------------------------------------------------------------------------
----




package com.configworks.cwk.be.search.converters;

import com.configworks.cwk.share.Utils;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


/** * Created by IntelliJ IDEA. * User: Kostya * Date: 12.09.2003 * Time: 11:39:25 * To change this template use Options | File Templates. */

public class ExeConverterImpl extends Converter {

private Log logger =


LogFactory.getLog(ExeConverterImpl.class.getName());


public Reader convertSource(File source) {
try {
// the type is not registered the file content will not be


added to the index


if (_config.getExecutionPath() == null) {
return null;
}
// else convert file into a temp dir and return contents of


the converted file


else {
// if no converter is specified the file will be added


withot conversion


               if (_config.getExecutionPath().length() == 0)
                   return new FileReader(source);

               String execPath = _config.getExecutionPath();

               String sourcePath = source.getAbsolutePath();
               // create tempdir if it doesn't exists
               new File(_config.getTempDirectory()).mkdirs();

String targetPath = _config.getTempDirectory() +


File.separator + source.getName()


                   + ".txt";

               String params = "";
               if(_config.getPathParam()!= null){
               //add HOME parameter
               params += _config.getPathParam();
               }

Process process = Utils.executeOSCommand(execPath,


sourcePath, targetPath, params);


process.waitFor();
if (logger.isTraceEnabled()) {
BufferedInputStream stream=null;
try {
stream = new


BufferedInputStream(process.getErrorStream());


                   int read = 0;
                   String outErrorString = "";
                   while ((read = stream.read()) > 0)
                       outErrorString += ((char) read);
                   stream.close();
                   if (outErrorString.length() > 0)
                       logger.error(outErrorString);
                   } finally {
                       if (stream!=null) {
                           stream.close();
                       }
                   }
               }
               File convertedSource = new File(targetPath);
               convertedSource.deleteOnExit();
               return new FileReader(convertedSource);
           }
       } catch (IOException ex) {
           if (logger.isErrorEnabled())
               logger.error("IOException: " + ex.getMessage());
       } catch (InterruptedException ex) {
           if (logger.isErrorEnabled())
               logger.error("InterruptedException: " + ex.getMessage());
       }

       return null;
   }
}





----------------------------------------------------------------------------
----




/* @(#) CWK 1.4 25.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/

package com.configworks.cwk.be.search.converters;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

/**
 * Base class for converters that extract text in-process (pure Java).
 *
 * convertSource() writes the text produced by extractText() to a temporary
 * file inside the configured temp directory and returns a reader on it.
 *
 * @author sergiu
 * @version 1.0
 *
 * @since CWK 1.5
 */
public abstract class JavaDocumentConverter extends Converter {

    /** Temporary text file written by the most recent call to convertSource(). */
    File dest = null;

    /**
     * Extracts the text of <code>source</code> into a temporary file and
     * returns a reader on that file.
     *
     * @param source the document to convert; may be <code>null</code>
     * @return a reader on the extracted text, or <code>null</code> when
     *         <code>source</code> is <code>null</code> or the conversion failed
     * @see com.configworks.cwk.be.search.converters.Converter#convertSource(java.io.File)
     */
    public Reader convertSource(File source) {
        if (source == null) {
            return null;
        }

        File target = null;
        InputStream inputStream = null;
        BufferedWriter writer = null;
        boolean extracted = false;
        try {
            String filename = source.getName().replace('.', '_') + ".txt";
            File tmpDir = new File(_config.getTempDirectory());
            tmpDir.mkdirs();
            target = new File(tmpDir.getPath(), filename);
            dest = target;
            target.createNewFile();

            // create the input and output streams
            writer = new BufferedWriter(new FileWriter(target));
            inputStream = new FileInputStream(source);

            extractText(inputStream, writer);

            // flush and close before the file is opened for reading
            writer.close();
            writer = null;
            extracted = true;
        } catch (Exception e) {
            getLogger().error("JavaDocumentConverter cannot convert the source file: "
                    + source.getAbsolutePath(), e);
        } finally {
            // close each stream on its own so a failure on one cannot leak the other
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException ex) {
                    if (getLogger().isErrorEnabled()) {
                        getLogger().error("Cannot close the stream: " + ex);
                    }
                }
            }
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException ex) {
                    if (getLogger().isErrorEnabled()) {
                        getLogger().error("Cannot close the stream: " + ex);
                    }
                }
            }
        }

        if (!extracted) {
            // do not leave a partially written file behind
            if (target != null) {
                target.delete();
            }
            return null;
        }

        if (!target.exists()) {
            return null;
        }
        target.deleteOnExit();
        try {
            return new BufferedReader(new FileReader(target));
        } catch (IOException e) {
            getLogger().error("JavaDocumentConverter cannot convert the source file: "
                    + source.getAbsolutePath(), e);
            return null;
        }
    }

    /**
     * Extracts the text of the document read from <code>inputStream</code>
     * and writes it to <code>writer</code>.
     *
     * @param inputStream stream on the source document
     * @param writer destination of the extracted text
     * @return implementations in this package return <code>true</code> when
     *         no exception occurred; convertSource() ignores the value
     * @throws IOException if the document cannot be read or the text cannot
     *         be written
     * @since CWK 1.4.1
     */
    public abstract boolean extractText(InputStream inputStream,
            BufferedWriter writer) throws IOException;

}





----------------------------------------------------------------------------
----




/* @(#) CWK 1.5 23.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/

package com.configworks.cwk.be.search.converters;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.util.LittleEndian;

/**
* Class description
*
* @author sergiu
* @version 1.0
* @since CWK 1.5
*/
public class PPTConverterImpl extends JavaDocumentConverter {

static final String lineSeparator =


System.getProperty("line.separator");


/**
* Extract the text from a number of presentations.
*/
public boolean extractText(InputStream reader, BufferedWriter writer)


throws IOException{


   POIFSReader r = new POIFSReader();

/* Register a listener for *all* documents. */
MyPOIFSReaderListener listener = new MyPOIFSReaderListener(writer);
r.registerListener(listener);
r.read(reader);
//if no exception was trown, consider that the conversion was


successful


   return true;
   }

   class MyPOIFSReaderListener implements POIFSReaderListener{
   private BufferedWriter writer = null;

   public MyPOIFSReaderListener(BufferedWriter writer){
   this.writer = writer;
   }

   public void processPOIFSReaderEvent(POIFSReaderEvent event) {
   PropertySet ps = null;

   try{

   org.apache.poi.poifs.filesystem.DocumentInputStream dis=null;
   if(!event.getName().equalsIgnoreCase("PowerPoint Document"))
   return;

   dis=event.getStream();

   byte btoWrite[]= new byte[12];
   dis.read(btoWrite);

   btoWrite = new byte[dis.available()];
   dis.read(btoWrite, 0, dis.available());

   //StringBuffer buff = new StringBuffer("");

   for(int i=0; i<btoWrite.length-20; i++){

   long type=LittleEndian.getUShort(btoWrite,i+2);
   long size=LittleEndian.getUInt(btoWrite,i+4);
   if (type==4008){

   int offset = i+4+1;
   int length = (int)size+3;
   int end = offset + length;

   byte[] textBytes = new byte[length];

   for (int j = offset; j < end; j++) {
   byte b = btoWrite[j];
   writer.write((char) b);
   }

   if(i < (end -1))
   i = end -1;
   }

   }

   PropertySetFactory.create(event.getStream());
   }catch (Exception e){
   String msg = "Cannot index ppt file: \n";
       if(getLogger().isErrorEnabled())
       getLogger().error(msg + e);
   }
   }
   }
}








----------------------------------------------------------------------------
----




/* @(#) CWK 1.4 24.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/

package com.configworks.cwk.be.search.converters;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.LittleEndian;

/**
* Class description
*
* @author sergiu
* @version 1.0
* @since CWK 1.5
*/
public class WordConverterImpl extends JavaDocumentConverter {

public boolean extractText(InputStream in, BufferedWriter writer)


throws IOException{


   ArrayList text = new ArrayList();
   POIFSFileSystem fsys = new POIFSFileSystem(in);

DocumentEntry headerProps =
(DocumentEntry)fsys.getRoot().getEntry("WordDocument");
DocumentInputStream din =


fsys.createDocumentInputStream("WordDocument");


   byte[] header = new byte[headerProps.getSize()];

   din.read(header);
   din.close();
   // Prende le informazioni dall'header del documento
   int info = LittleEndian.getShort(header, 0xa);

   boolean useTable1 = (info & 0x200) != 0;

   // Prende informazioni dalla piece table
   int complexOffset = LittleEndian.getInt(header, 0x1a2);


String tableName = null; if (useTable1) tableName = "1Table"; else tableName = "0Table";

DocumentEntry table =


(DocumentEntry)fsys.getRoot().getEntry(tableName);


   byte[] tableStream = new byte[table.getSize()];

   din = fsys.createDocumentInputStream(tableName);

   din.read(tableStream);
   din.close();

   din = null;
   fsys = null;
   table = null;
   headerProps = null;

   int multiple = findText(tableStream, complexOffset, text);

   StringBuffer sb = new StringBuffer();
   int size = text.size();
   tableStream = null;

   for (int x = 0; x < size; x++){
   WordTextPiece nextPiece = (WordTextPiece)text.get(x);
   int start = nextPiece.getStart();
   int length = nextPiece.getLength();

   boolean unicode = nextPiece.usesUnicode();
   String toStr = null;
   if (unicode)
   toStr = new String(header, start, length * multiple, "UTF-16LE");
   else
   toStr = new String(header, start, length , "ISO-8859-1");

//sb.append(toStr).append(" ");
toStr += " ";
writer.write(toStr);
}
//if no exeption occured we say that the conversion was successfully


realized


   return true;
   }

private static int findText(byte[] tableStream, int complexOffset,
ArrayList text) throws IOException{
//actual text
int pos = complexOffset;
int multiple = 2;
//skips through the prms before we reach the piece table. These


contain data


   //for actual fast saved files
   while(tableStream[pos] == 1){
   pos++;
   int skip = LittleEndian.getShort(tableStream, pos);
   pos += 2 + skip;
   }

   if(tableStream[pos] != 2){
   throw new IOException("corrupted Word file");
   }else{
   //parse out the text pieces
   int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
   pos += 4;
   int pieces = (pieceTableSize - 4) / 12;
   for (int x = 0; x < pieces; x++){
   int filePos = LittleEndian.getInt(tableStream, pos + ((pieces
   + 1) * 4) +
   (x * 8) + 2);
   boolean unicode = false;
   if ((filePos & 0x40000000) == 0){
   unicode = true;
   }else{
   unicode = false;
   multiple = 1;
   filePos &= ~(0x40000000);//gives me FC in doc stream
   filePos /= 2;
   }

   int totLength = LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
   - LittleEndian.getInt(tableStream, pos + (x * 4));

   WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);

   text.add(piece);
   }
   }
   return multiple;
   }



}







----------------------------------------------------------------------------
----




/* @(#) CWK 1.4 07.06.2004
*
* Copyright 2003-2005 ConfigWorks Informationssysteme & Consulting GmbH
* Universitätsstr. 94/7 9020 Klagenfurt Austria
* www.configworks.com
* All rights reserved.
*/

package com.configworks.cwk.be.search.converters;

/**
* Class description
*
* @author sergiu
* @version 1.0
*
* @since CWK 1.4
*/
class WordTextPiece{
private int _fcStart;
private boolean _usesUnicode;
private int _length;

public WordTextPiece(int start, int length, boolean unicode){
_usesUnicode = unicode;
_length = length;
_fcStart = start;
}
public boolean usesUnicode(){
return _usesUnicode;
}

public int getStart(){
return _fcStart;
}
public int getLength(){
return _length;
}

}







----------------------------------------------------------------------------
----




package com.configworks.cwk.be.search.converters;

import java.io.File;
import java.io.Reader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Base class of all converters: turns a source file into a reader on its
 * textual content.
 *
 * Created by IntelliJ IDEA.
 * User: Kostya
 * Date: 11.09.2003
 * Time: 19:24:56
 */

public abstract class Converter {
    /** Configuration of this converter; assigned by Initialize(). */
    protected ConverterConfig _config;
    private static Log logger = null;

    /**
     * Converts <code>source</code> to text.
     *
     * @param source the file to convert
     * @return a reader on the text of the file; the implementations in this
     *         package return <code>null</code> when no text is available
     */
    public abstract Reader convertSource(File source);

    /**
     * Stores the configuration used by this converter.
     *
     * @param config the configuration to use
     */
    protected void Initialize(ConverterConfig config) {
        _config = config;
    }

    /**
     * @return Returns the logger shared by all converters, created on first
     *         use under the category of this base class.
     */
    public Log getLogger() {
        if (logger == null) {
            // the category is this class, not one particular subclass
            logger = LogFactory.getLog(Converter.class);
        }
        return logger;
    }
}






----------------------------------------------------------------------------
----




---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]





---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]






--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]



Reply via email to