Hello all, my first contribution to the Jakarta team was http://wiki.apache.org/jakarta-lucene-data/attachments/PowerPoint/attachments/PPT2Text.java and it seems that another expert (Ryan Rhodes - [EMAIL PROTECTED]) has already started working on it, based on that first contribution.
That sounds great to me. So In order to increase the development process for Powerpoint extractor, I just wanted to contribute Our team efforts in developing the Powerpoint extractor Authors :- Sudhakar Chavali ([EMAIL PROTECTED]) and Hari Shanker Goud ([EMAIL PROTECTED]) Have a look on the below source codes Regards Sudhakar ____________________________________________________________________________________ /** * Title: DocumentParserException class * Description: This is root Exceptional class for throwing the runtime errors that can be raised by different parsers * @author Sudhakar * @version 1.0 */ public class DocumentParserException extends Exception { /** * Constructs a new exception with null as its detail message. */ public DocumentParserException() { } /** * Constructs a new exception with the specified detail message. * @param message */ public DocumentParserException(String message) { super(message); } /** * Constructs a new exception with the specified detail message. * @param message * @param cause */ public DocumentParserException(String message, Throwable cause) { super(message, cause); } } _____________________________________________________________________________________ import java.io.*; /** * * Title: Summary Base * Description: A Generic one that reads the document's summary information and returns it through different internal methods * @author Sudhakar Chavali * @version 1.0 */ public interface SummaryBase { /** * A method returns the Document's Author * @return String */ public String getDocAuthor(); /** * A method that returns the Document Created Date * @return String */ public String getDocCreatedDate(); /** * A method that returns the Document's Key words * @return String */ public String getDocKeywords(); /** * A method that returns the Document's comments * @return String */ public String getDocComments(); /** * A method that returns the Document Name * @return String */ public String getDocName(); /** * A method that returns the 
Document's Subject * @return String */ public String getDocSubject(); /** * A method that returns the Document's title */ public String getDocTitle(); /** * A method that reads the document's Summary Information * @throws DocumentParserException */ public void read() throws DocumentParserException; /** * A method that writes the Document's summary information as an XML into the file * @param strXMLFile * @throws DocumentParserException */ public void write(String strXMLFile) throws DocumentParserException; /** * A method that writes the document's summary information as an XML into OutputStream Object * @param out * @throws DocumentParserException */ public void write(OutputStream out) throws DocumentParserException; /** * A method that returns the Document's summary as an XML String * @return String * @throws DocumentParserException */ public String getSummaryAsXML() throws DocumentParserException; /** * A method that returns document's summary information as normal text * @return String * @throws DocumentParserException */ public String getSummaryAsText() throws DocumentParserException; } ______________________________________________________________________________________ import java.io.*; /** * A generic document that reads the document's text and parses it into normal Ascii text using the different methods. */ public interface Document { /** * A method that returns the document's text after parsing. This method should be called after calling the read method * @return String * @see #read() * @throws DocumentParserException */ public abstract String getText() throws DocumentParserException; /** * A method that returns the parsed text as byte array. This method should be called after calling the read method * @return byte[] * @throws DocumentParserException */ public abstract byte[] getBytes() throws DocumentParserException; /** * A method that writes the parsed text into the OutputStream object. 
This method should be called after calling the read method * @param out * @throws DocumentParserException */ public abstract void write(OutputStream out) throws DocumentParserException, Exception; /** * A method that reads and parses the document into Normal text * @throws DocumentParserException */ public abstract void read() throws DocumentParserException, Exception; /** * A method that returns the document summary information as Normal Text * @return String */ public abstract String getDocumentSummaryAsText(); /** * A method that returns the document summary information as an XML String * @return String */ public abstract String getDocumentSummaryAsXML(); /** * A method that returns the InputStream which contains the document summary information as a Normal Text * @return InputStream * @throws DocumentParserException */ public abstract InputStream getDocumentTextStream() throws DocumentParserException; ; /** * A method that returns the InputStream which contains the document summary information as an XML String * @return InputStream * @throws DocumentParserException */ public abstract InputStream getDocumentSummaryStream() throws DocumentParserException; ; } ______________________________________________________________________________________ /** * <p>Title: PPTSlide</p> * <p>Description: A class that holds the Powerpoint slide</p> * @author Hari Shanker, Sudhakar Chavali * @version 1.0 */ import java.util.*; import java.io.*; class PPTSlide { /** * Holds the Slide Number */ protected long slideNumber; /** * Holds the contents of the Slide */ protected Vector contents; /** * Initialise the Object for holding the contents of Power Point Slide * @param number */ public PPTSlide(long number) { slideNumber = number; contents = new Vector(); } /** * Add the Content of Slide to this Object * @param content */ public void addContent(String content) { contents.addElement(content); } /** * returns the contents of slide as a vector object * @return Vector */ public 
Vector getContent() { return contents; } /** * returns the slide value * @return long */ public long getSlideNumber() { return slideNumber; } } _____________________________________________________________________________________ import java.io.OutputStream; import java.io.InputStream; import java.io.*; /** * <p>Title: Parsers</p> * <p>Description: Class that parses the Power Point Document Content and it's summary to text</p> * @author Sudhakar Chavali * @version 1.0 */ public class PPTDocument implements Document { /** * Checks whether the summary information was already read or not */ private boolean bRead = false; /** * Holds the Powerpoint Document Summary Content */ private SummaryBase summary = null; /** * Holds the Powerpoint Document Summary Content */ private String docName = null; /** * Holds the Powerpoint Document Text */ private String pptText = ""; /** * Constructor for setting the PowerPoint document path for initialising the POI object * @param docName */ public PPTDocument(String docName) { this.docName = docName; summary = new MSDocumentSummary(docName); } /** * returns the parsed Powerpoint Document text * @return String */ public String getText() { return pptText; } /** * returns the Powerpoint Document text as bytes * @return byte[] */ public byte[] getBytes() { return getText().getBytes(); } /** * Writes the Powerpoint Document Text into OutputStream object * @param out * @throws MSPowerPointDocumentParserException */ public void write(OutputStream out) throws MSPowerPointDocumentParserException { try { out.write(getText().getBytes()); } catch (Throwable _docError) { throw new MSPowerPointDocumentParserException( "Error Raised while writing the text into OutputStream Object \nError Cause : " + _docError, _docError); } } /** * Reads the Powerpoint document for getting the text from it. 
* @throws MSPowerPointDocumentParserException */ public void read() throws MSPowerPointDocumentParserException { PPT2Text ppt2Text = new PPT2Text(docName); ppt2Text.read(); pptText = ppt2Text.getText(); pptText = pptText.replaceAll("\r", "\n"); pptText = pptText.replaceAll("\n", "\r\n"); } /** * returns the document summary as tab delimited text * @return */ public String getDocumentSummaryAsText() { try { if (!bRead) { summary.read(); //read the summary object bRead = true; //ensure that summary information was read only one time } return summary.getSummaryAsText(); //Build and return the Summary as Normal text } catch (Exception ex) { return ""; } } /** * returns the document summary as xml * @return */ public String getDocumentSummaryAsXML() { try { if (!bRead) { summary.read(); //read the summary information of a document bRead = true; //ensure that summary information was read only one time } return summary.getSummaryAsXML(); //build annd return the summary as XML string } catch (Exception ex) { return ""; } } /** * returns the document text as Stream Object * @return InputStream * @throws MSPowerPointDocumentParserException */ public InputStream getDocumentTextStream() throws MSPowerPointDocumentParserException { try { ByteArrayInputStream in = new ByteArrayInputStream(this.getBytes()); //Write the Document Text in InputStream Object return (InputStream) in; //return InputStream Object } catch (Throwable _documentError) { //EXCEPTION RAISED WHILE CREATING THE InputStream OBJECT throw new MSPowerPointDocumentParserException( "Unable to return the document text as an InputStream\n\tException Root :" + _documentError); } } /** * returns the document summary as stream object * @return InputStream * @throws MSPowerPointDocumentParserException */ public InputStream getDocumentSummaryStream() throws MSPowerPointDocumentParserException { try { /* A method that reads the Document Summary Stream in InputStream Object */ ByteArrayInputStream in = new 
ByteArrayInputStream( getDocumentSummaryAsText().getBytes()); //Write the Summary Information into the InputStream Object return (InputStream) in; //return InputStream Object } catch (Throwable _documentError) { //error raised while creating the document summary info stream object // throw it throw new MSPowerPointDocumentParserException( "Unable to get Document Summary Information as Stream\n\tException Root: " + _documentError); } } } ________________________________________________________________________________________________ /** * <p>Title: PPTConstants</p> * <p>Description: A class that holds the Powerpoint constants</p> * @author Sudhakar Chavali * @version 1.0 */ public interface PPTConstants { public static final int PPT_SLIDEPERSISTANT_ATOM = 1011; public static final int PPT_DIAGRAMGROUP_ATOM = 61448; public static final int PPT_TEXTCHAR_ATOM = 4000; public static final int PPT_TEXTBYTE_ATOM = 4008; public static final int PPT_USEREDIT_ATOM = 4085; public static final int PPT_MASTERSLIDE = 1024; } _________________________________________________________________________________________________ /** * <p>Title: PPTClientTextBox</p> * <p>Description: A class that holds the Powrpoint Client Text box content</p> * @author Hari Shanker, Sudhakar Chavali * @version 1.0 */ import java.util.*; import java.io.*; public class PPTClientTextBox { /** * Holds the current id of a client text box */ protected long currentID; /** * holds the content of client text box */ protected String content; /** * Instantiates the client text box object * @param number */ public PPTClientTextBox(long number) { currentID = number; this.content = ""; } /** * Instantiates the client text box object * @param number * @param content */ public PPTClientTextBox(long number, String content) { currentID = number; this.content = content; } /** * Sets the content of a client text box * @param content */ public void setContent(String content) { this.content = content; } /** * returns the 
content of a client text box * @return */ public String getContent() { return content; } /** * returns the current client text box id * @return long */ public long getID() { return currentID; } } _________________________________________________________________________________________________ import java.io.*; import java.util.*; import org.apache.poi.hpsf.*; import org.apache.poi.poifs.eventfilesystem.*; import org.apache.poi.util.HexDump; import org.apache.poi.util.LittleEndian; import org.apache.poi.hdf.extractor.*; /** * <p>Title: PPT2Text</p> * <p>Description: A class that parses the Powerpoint document content to text </p> * @author Hari Shanker, Sudhakar Chavali * @version 1.0 */ public class PPT2Text implements PPTConstants { /** * * <p>Title: PPTListener</p> * <p>Description: Class that used to handle the Power Point Events</p> * @author Hari Shanker,Sudhakar Chavali * @version 1.0 */ class PPTListener implements POIFSReaderListener { public void processPOIFSReaderEvent(POIFSReaderEvent event) { try { org.apache.poi.poifs.filesystem.DocumentInputStream _documentStream = null; // Checking for PowerPoint Document Stream if (!event.getName().startsWith("PowerPoint Document")) { return; } _documentStream = event.getStream(); byte pptdata[] = new byte[_documentStream.available()]; _documentStream.read(pptdata, 0, _documentStream.available()); int sNum = 0; long offset = 0, offsetEnd = 0; long offsetPD = 0, oldoffsetPD = 0, docRef = 0, maxPresist = 0; // Traverse Bytearray to get CurrentUserEditAtom // Call to extract the Text in all PlaceHolders // To hold PPTClientTextBox objects for mapping into Slide Objects java.util.Hashtable _containerClientTextBox = new java.util.Hashtable(); // Traverse ByteArray to identiy edit paths of ClientTextBoxes for (long i = 0; i < pptdata.length - 20; i++) { long type = LittleEndian.getUShort(pptdata, (int) i + 2); long size = LittleEndian.getUInt(pptdata, (int) i + 4); if (type == PPT_USEREDIT_ATOM) { // Checking the Record 
Header (UserEditAtom) long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8); long version = LittleEndian.getUInt(pptdata, (int) i + 12); offset = LittleEndian.getUInt(pptdata, (int) i + 16); offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20); // Call to extract ClientTextBox text in each UserEditAtom _containerClientTextBox = extractClientTextBoxes( _containerClientTextBox, offset, pptdata, offsetPD); } } Vector slides = extractPlaceHoders(offset, pptdata, offsetPD); if (slides.size() == 0) { slides.addElement(new PPTSlide(256)); } PPTSlide _slide = (PPTSlide) slides.get(slides.size() - 1); for (Enumeration enum = _containerClientTextBox.elements(); enum.hasMoreElements(); ) { PPTClientTextBox _clientTextBox = (PPTClientTextBox) enum.nextElement(); _slide.addContent(_clientTextBox.getContent()); } //Merging ClientTextBox data with Slide Data // Printing the text from Slides vector object (need further modification) for (int i = 0; i < slides.size(); i++) { _slide = (PPTSlide) slides.get(i); Vector scontent = _slide.getContent(); // StringBuffer _stringbuffer = new StringBuffer(); for (int j = 0; j < scontent.size(); j++) { pptTextBuffer.append(scontent.get(j).toString()); } } } catch (Throwable ex) { return; } } } /** * Method that returns the client text boxes of a slide * @param clientTextBoxContainer * @param offset * @param pptBytes * @param offsetPD * @return Hashtable * @throws MSPowerPointDocumentParserException */ public java.util.Hashtable extractClientTextBoxes(java.util.Hashtable _containerClientTextBox, long offset, byte[] pptdata, long offsetPD) throws Throwable { //To hold temparary data ByteArrayOutputStream _outStream = new ByteArrayOutputStream(); PPTClientTextBox _clientTextBox = null; // Traversing the bytearray upto Presist directory position for (long i = offset; i < offsetPD - 20; i++) { try { long rinfo = LittleEndian.getUShort(pptdata, (int) i); long type = LittleEndian.getUShort(pptdata, (int) i + 2); // Record Type long size = 
LittleEndian.getUInt(pptdata, (int) i + 4); // Record Size if (type == PPT_DIAGRAMGROUP_ATOM) { //Record type is of Drawing Group long shapeCount = LittleEndian.getUInt(pptdata, (int) i + 8); // Total number of objects long _currentID = LittleEndian.getInt(pptdata, (int) i + 12); // Group ID+number of objects _currentID = ( (int) (_currentID / 1024)) * 1024; if (_currentID == PPT_MASTERSLIDE) { // Ignore Master Slide objects i++; continue; } //Check for the ClientTextBox GroupID existence if (!_containerClientTextBox.containsKey(new Long(_currentID))) { _clientTextBox = new PPTClientTextBox(_currentID); _containerClientTextBox.put(new Long(_currentID), _clientTextBox); } else { // If exists get Client Textbox Group _clientTextBox = (PPTClientTextBox) _containerClientTextBox.get(new Long(_currentID)); _clientTextBox.setContent(""); } // Iterating the bytearray for TextCharAtoms and TextBytesAtom for (long j = i + 8; j < offsetPD - 20; j++) { try { long nrinfo = LittleEndian.getUShort(pptdata, (int) j); long ntype = LittleEndian.getUShort(pptdata, (int) j + 2); //Record Type long nsize = LittleEndian.getUInt(pptdata, (int) j + 4); // Record size if (ntype == PPT_DIAGRAMGROUP_ATOM) { // Break the loop if next GroupID found i = j - 1; break; } else if (ntype == PPT_TEXTBYTE_ATOM) { //TextByteAtom record _outStream = new ByteArrayOutputStream(); long ii = 0; for (ii = j + 6; ii <= j + 6 + nsize; ii++) { // For loop to changed to a function short ch = Utils.convertBytesToShort(pptdata, (int) ii + 2); if (ch == 0 || ch == 16 || ch == 13 || ch == 10) { _outStream.write( (byte) '\r'); } else if (ch == 0x201c) { // for left double quote _outStream.write( (byte) 147); } else if (ch == 0x201d) { // for right double quote _outStream.write( (byte) 148); } else if (ch == 0x2019) { // for right single quote _outStream.write( (byte) 146); } else if (ch == 0x2018) { // for left single quote _outStream.write( (byte) 145); } else if (ch == 0x2013) { // for '-' character 
_outStream.write( (byte) 150); } else { _outStream.write( (byte) ch); } } // Setting the identified text for Current groupID _clientTextBox.setContent(_clientTextBox.getContent() + new String(_outStream.toByteArray())); } else if (ntype == PPT_TEXTCHAR_ATOM) { // TextCharAtom record _outStream = new ByteArrayOutputStream(); String strTempContent = new String(pptdata, (int) j + 6, (int) (nsize) + 2); byte bytes[] = strTempContent.getBytes(); for (int ii = 0; ii < bytes.length - 1; ii += 2) { // For loop to changed to a function short ch = Utils.convertBytesToShort(bytes, ii); if (ch == 0 || ch == 16 || ch == 13 || ch == 10) { _outStream.write( (byte) '\r'); } else if (ch == 0x201c) { _outStream.write( (byte) 147); } else if (ch == 0x201d) { _outStream.write( (byte) 148); } else if (ch == 0x2019) { _outStream.write( (byte) 146); } else if (ch == 0x2018) { _outStream.write( (byte) 145); } else if (ch == 0x2013) { // for - character _outStream.write( (byte) 150); } else { _outStream.write( (byte) ch); } } // Setting the identified text for Current groupID _clientTextBox.setContent(_clientTextBox.getContent() + new String(_outStream.toByteArray())); } } catch (Throwable e) { break; } } } } catch (Throwable ee) { return _containerClientTextBox; } } return _containerClientTextBox; } /** * Method that returns the Powerpoint place holders * @param offset * @param pptBytes * @param offsetPD * @return Vector * @throws MSPowerPointDocumentParserException */ public Vector extractPlaceHoders(long offset, byte[] pptdata, long offsetPD) throws Throwable { int sNum = 0; Vector slides = new Vector(); // To All Slides data PPTSlide currentSlide = null; // Object to hold current slide data ByteArrayOutputStream _outStream = new ByteArrayOutputStream(); // To store data found in TextCharAtoms and TextBytesAtoms for (long i = offset; i < pptdata.length - 20; i++) { try { long rinfo = LittleEndian.getUShort(pptdata, (int) i); long type = LittleEndian.getUShort(pptdata, (int) i + 2); long 
size = LittleEndian.getUInt(pptdata, (int) i + 4); if (type == PPT_TEXTBYTE_ATOM) { //TextByteAtom record _outStream = new ByteArrayOutputStream(); long ii = 0; for (ii = i + 6; ii <= i + 6 + size; ii++) { short ch = Utils.convertBytesToShort(pptdata, (int) ii + 2); if (ch == 0 || ch == 16 || ch == 13 || ch == 10) { _outStream.write( (byte) '\r'); } else if (ch == 0x201c) { // for left double quote _outStream.write( (byte) 147); } else if (ch == 0x201d) { // for right double quote _outStream.write( (byte) 148); } else if (ch == 0x2019) { // for right single quote _outStream.write( (byte) 146); } else if (ch == 0x2018) { // for left single quote _outStream.write( (byte) 145); } else if (ch == 0x2013) { // for '-' character _outStream.write( (byte) 150); } else { _outStream.write(ch); } } // Setting the identified text for Current Slide currentSlide.addContent(_outStream.toString()); } else if (type == PPT_TEXTCHAR_ATOM) { //TextCharAtom record _outStream = new ByteArrayOutputStream(); String strTempContent = new String(pptdata, (int) i + 6, (int) (size) + 2); byte bytes[] = strTempContent.getBytes(); for (int ii = 0; ii < bytes.length - 1; ii += 2) { short ch = Utils.convertBytesToShort(bytes, ii); if (ch == 0 || ch == 16 || ch == 13 || ch == 10) { _outStream.write( (byte) '\r'); } else if (ch == 0x201c) { _outStream.write( (byte) 147); } else if (ch == 0x201d) { _outStream.write( (byte) 148); } else if (ch == 0x2019) { _outStream.write( (byte) 146); } else if (ch == 0x2018) { _outStream.write( (byte) 145); } else if (ch == 0x2013) { // for - character _outStream.write( (byte) 150); } else { _outStream.write( (byte) ch); } } // Setting the identified text for Current Slide currentSlide.addContent(_outStream.toString()); } else if (type == PPT_SLIDEPERSISTANT_ATOM) { // SlidePresistAtom Record type if (sNum != 0) { _outStream = new ByteArrayOutputStream(); long slideID = LittleEndian.getUInt(pptdata, (int) i + 20); currentSlide = new PPTSlide(slideID); 
//currentSlide.addContent(_outStream.toString()); slides.addElement(currentSlide); } sNum++; } else if (type == PPT_DIAGRAMGROUP_ATOM) { //DG break; } } catch (Throwable ee) { } /*******************************************************************/ } return slides; } /** * Constructor that takes a Powerpoint document name as an argument for getting the text * @param fileName */ public PPT2Text(String fileName) { this.docName = fileName; } /** * Method that reads the Powerpoint document for parsing the text * @throws MSPowerPointDocumentParserException */ public void read() throws MSPowerPointDocumentParserException { try { POIFSReader reader = new POIFSReader(); reader.registerListener(new PPTListener()); reader.read(new FileInputStream(docName)); } catch (Throwable _docError) { throw new MSPowerPointDocumentParserException( "Unable to read the PPT Document \nError Cause : " + _docError, _docError); } } /** * returns the PowerPoint text * @return String */ public String getText() { return pptTextBuffer.toString(); } /** * Holds the Powerpoint document name */ private String docName; /** * Holds the parsed Powerpoint Text */ private StringBuffer pptTextBuffer = new StringBuffer(); } _________________________________________________________________________________________________ /** * Title: MSPowerPointDocumentParserException * Description: This is An Exception class and used for catching the runtime exceptions for the Powerpoint Document class * @author Sudhakar Chavali * @see microsoft.powerpoint.PowerPointDocument * @version 1.0 */ public class MSPowerPointDocumentParserException extends DocumentParserException { /** * A constructor that builds the Exception object * @param message */ public MSPowerPointDocumentParserException(String message) { super(message); } /** * A constructor that builds the Exception object * @param message * @param cause */ public MSPowerPointDocumentParserException(String message, Throwable cause) { super(message, cause); } } ===== "No 
one can earn a million dollars honestly."- William Jennings Bryan (1860-1925) "Make everything as simple as possible, but not simpler."- Albert Einstein (1879-1955) "It is dangerous to be sincere unless you are also stupid."- George Bernard Shaw (1856-1950) __________________________________ Do you Yahoo!? New and Improved Yahoo! Mail - 100MB free storage! http://promotions.yahoo.com/new_mail --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]