Author: dflorey Date: Tue Feb 14 01:28:53 2006 New Revision: 377676 URL: http://svn.apache.org/viewcvs?rev=377676&view=rev Log: Fixed a bug so that many MS PowerPoint documents can be indexed concurrently.
Modified: jakarta/slide/trunk/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java Modified: jakarta/slide/trunk/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java URL: http://svn.apache.org/viewcvs/jakarta/slide/trunk/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java?rev=377676&r1=377675&r2=377676&view=diff ============================================================================== --- jakarta/slide/trunk/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java (original) +++ jakarta/slide/trunk/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java Tue Feb 14 01:28:53 2006 @@ -39,84 +39,67 @@ /** * Content extractor for Microsoft Powerpoint documents. */ -public class MSPowerPointExtractor extends AbstractContentExtractor implements POIFSReaderListener{ +public class MSPowerPointExtractor extends AbstractContentExtractor { - static final String CONTENT_TYPE_POWERPOINT_1 = "application/mspowerpoint"; - static final String CONTENT_TYPE_POWERPOINT_2 = "application/vnd.ms-powerpoint"; - static final String CONTENT_TYPE_POWERPOINT_ALL_CSV = CONTENT_TYPE_POWERPOINT_1+","+CONTENT_TYPE_POWERPOINT_2; - - private ByteArrayOutputStream writer = new ByteArrayOutputStream(); - - public MSPowerPointExtractor(String uri, String contentType, String namespace) { - super(uri, contentType, namespace); - } - - public Reader extract(InputStream content) throws ExtractorException { - try { - POIFSReader reader = new POIFSReader(); - reader.registerListener(this); - reader.read(content); - - return new InputStreamReader(new ByteArrayInputStream(writer.toByteArray())); - } - catch(Exception e) { - throw new ExtractorException(e.getMessage()); - } - } - - public void processPOIFSReaderEvent(POIFSReaderEvent event) - { - try{ - if(!event.getName().equalsIgnoreCase("PowerPoint Document")) - return; - - DocumentInputStream input = event.getStream(); - - byte[] buffer = new byte[input.available()]; - input.read(buffer, 0, 
input.available()); - - for(int i=0; i<buffer.length-20; i++) - { - long type = LittleEndian.getUShort(buffer,i+2); - long size = LittleEndian.getUInt(buffer,i+4); - - if(type==4008) - { - writer.write(buffer, i + 4 + 1, (int) size +3); - i = i + 4 + 1 + (int) size - 1; - - } - } - } - catch (Exception e) - { - - } - } - - public static void main(String[] args) throws Exception - { - FileInputStream in = new FileInputStream(args[0]); - - MSPowerPointExtractor ex = new MSPowerPointExtractor(null, null, null); - - Reader reader = ex.extract(in); - - int c; - do - { - c = reader.read(); - - System.out.print((char)c); - } - while( c != -1 ); - } - - /* (non-Javadoc) + static final String CONTENT_TYPE_POWERPOINT_1 = "application/mspowerpoint"; + + static final String CONTENT_TYPE_POWERPOINT_2 = "application/vnd.ms-powerpoint"; + + static final String CONTENT_TYPE_POWERPOINT_ALL_CSV = CONTENT_TYPE_POWERPOINT_1 + + "," + CONTENT_TYPE_POWERPOINT_2; + + public MSPowerPointExtractor(String uri, String contentType, + String namespace) { + super(uri, contentType, namespace); + } + + public Reader extract(InputStream content) throws ExtractorException { + try { + final ByteArrayOutputStream writer = new ByteArrayOutputStream(); + + POIFSReader reader = new POIFSReader(); + reader.registerListener(new POIFSReaderListener() { + public void processPOIFSReaderEvent(POIFSReaderEvent event) { + try { + if (!event.getName().equalsIgnoreCase( + "PowerPoint Document")) + return; + + DocumentInputStream input = event.getStream(); + + byte[] buffer = new byte[input.available()]; + input.read(buffer, 0, input.available()); + + for (int i = 0; i < buffer.length - 20; i++) { + long type = LittleEndian.getUShort(buffer, i + 2); + long size = LittleEndian.getUInt(buffer, i + 4); + + if (type == 4008) { + writer.write(buffer, i + 4 + 1, (int) size + 3); + i = i + 4 + 1 + (int) size - 1; + } + } + } catch (Exception e) { + + } + } + }); + reader.read(content); + + return new 
InputStreamReader(new ByteArrayInputStream(writer + .toByteArray())); + } catch (Exception e) { + throw new ExtractorException(e.getMessage()); + } + } + + /* + * (non-Javadoc) + * * @see org.apache.slide.extractor.Extractor#getContentType() */ public String getContentType() { - if(super.getContentType()==null){ + if (super.getContentType() == null) { return CONTENT_TYPE_POWERPOINT_ALL_CSV; } return super.getContentType(); --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]