Author: jukka
Date: Sun Nov 25 14:52:09 2007
New Revision: 598075
URL: http://svn.apache.org/viewvc?rev=598075&view=rev
Log: TIKA-102 - Parser implementations loading a large amount of content into a single String could be problematic - Patch by Niall Pemberton
Modified: incubator/tika/trunk/CHANGES.txt incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Modified: incubator/tika/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/CHANGES.txt (original) +++ incubator/tika/trunk/CHANGES.txt Sun Nov 25 14:52:09 2007 @@ -130,3 +130,5 @@ 59. TIKA-101 - Improve site and build (mattmann) +60. 
TIKA-102 - Parser implementations loading a large amount of content + into a single String could be problematic (Niall Pemberton) Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/ExcelParser.java Sun Nov 25 14:52:09 2007 @@ -33,31 +33,29 @@ return "application/vnd.ms-excel"; } - protected String extractText(POIFSFileSystem filesystem) throws IOException{ - StringBuilder builder = new StringBuilder(); + protected void extractText(POIFSFileSystem filesystem, Appendable builder) throws IOException{ extractText(new HSSFWorkbook(filesystem), builder); - return builder.toString(); } - private void extractText(HSSFWorkbook book, StringBuilder builder) { + private void extractText(HSSFWorkbook book, Appendable builder) throws IOException { for (int i = 0; book != null && i < book.getNumberOfSheets(); i++) { extractText(book.getSheetAt(i), builder); } } - private void extractText(HSSFSheet sheet, StringBuilder builder) { + private void extractText(HSSFSheet sheet, Appendable builder) throws IOException { for (int i = 0; sheet != null && i <= sheet.getLastRowNum(); i++) { extractText(sheet.getRow(i), builder); } } - private void extractText(HSSFRow row, StringBuilder builder) { + private void extractText(HSSFRow row, Appendable builder) throws IOException { for (short i = 0; row != null && i < row.getLastCellNum(); i++) { extractText(row.getCell(i), builder); } } - private void extractText(HSSFCell cell, StringBuilder builder) { + private void extractText(HSSFCell cell, Appendable builder) throws IOException { if (cell != 
null) { switch (cell.getCellType()) { case HSSFCell.CELL_TYPE_STRING: @@ -73,14 +71,11 @@ } } - private void addText(String text, StringBuilder builder) { + private void addText(String text, Appendable builder) throws IOException { if (text != null) { text = text.trim(); if (text.length() > 0) { - if (builder.length() > 0) { - builder.append(' '); - } - builder.append(text); + builder.append(text).append(' '); } } } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Sun Nov 25 14:52:09 2007 @@ -29,6 +29,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.AppendableAdaptor; import org.apache.tika.sax.XHTMLContentHandler; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -56,7 +57,9 @@ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - xhtml.element("p", extractText(filesystem)); + xhtml.startElement("p"); + extractText(filesystem, new AppendableAdaptor(xhtml)); + xhtml.endElement("p"); xhtml.endDocument(); } @@ -70,7 +73,7 @@ /** * Extracts the text content from a Microsoft document input stream. 
*/ - protected abstract String extractText(POIFSFileSystem filesystem) + protected abstract void extractText(POIFSFileSystem filesystem, Appendable appendable) throws IOException, TikaException; private void getMetadata( @@ -177,4 +180,4 @@ } } -} \ No newline at end of file +} Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointExtractor.java Sun Nov 25 14:52:09 2007 @@ -32,12 +32,12 @@ static Logger LOG = Logger.getRootLogger(); /** Buffer holding the content of the file */ - private final StringBuilder builder; + private final Appendable builder; /** * Constructs Listener to get content of PowerPoint file. 
*/ - public PowerPointExtractor(StringBuilder builder) { + public PowerPointExtractor(Appendable builder) { this.builder = builder; } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/PowerPointParser.java Sun Nov 25 14:52:09 2007 @@ -35,8 +35,7 @@ return "application/vnd.ms-powerpoint"; } - protected String extractText(POIFSFileSystem filesystem) throws IOException { - StringBuilder builder = new StringBuilder(); + protected void extractText(POIFSFileSystem filesystem, Appendable builder) throws IOException { InputStream stream = filesystem.createDocumentInputStream(POWERPOINT); try { @@ -44,8 +43,6 @@ } finally { stream.close(); } - - return builder.toString(); } } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/Word6Extractor.java Sun Nov 25 14:52:09 2007 @@ -33,8 +33,11 @@ class Word6Extractor { - public Word6Extractor() + private final Appendable appendable; + + public Word6Extractor(Appendable appendable) { + this.appendable = appendable; } /** @@ -45,7 +48,7 @@ * @return The text from the document * @throws Exception If there are any 
unexpected exceptions. */ - public String extractText(byte[] mainStream) throws IOException { + public void extractText(byte[] mainStream) throws IOException { int fcMin = LittleEndian.getInt(mainStream, 0x18); int fcMax = LittleEndian.getInt(mainStream, 0x1C); @@ -58,7 +61,7 @@ List textRuns = chpTable.getTextRuns(); // iterate through the - WordTextBuffer finalTextBuf = new WordTextBuffer(); + WordTextBuffer finalTextBuf = new WordTextBuffer(appendable); Iterator runsIt = textRuns.iterator(); while(runsIt.hasNext()) { @@ -76,8 +79,6 @@ } } } - - return finalTextBuf.toString(); } /** Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordParser.java Sun Nov 25 14:52:09 2007 @@ -47,7 +47,7 @@ * * @param in The InputStream representing the Word file. */ - public String extractText(POIFSFileSystem fsys) + public void extractText(POIFSFileSystem fsys, Appendable appendable) throws IOException, TikaException { // load our POIFS document streams. DocumentEntry headerProps = @@ -74,8 +74,8 @@ case 103: case 104: // this is a Word 6.0 doc send it to the extractor for that version. 
- Word6Extractor oldExtractor = new Word6Extractor(); - return oldExtractor.extractText(header); + Word6Extractor oldExtractor = new Word6Extractor(appendable); + oldExtractor.extractText(header); } //get the location of the piece table @@ -123,7 +123,7 @@ int currentTextStart = currentPiece.getStart(); int currentTextEnd = currentPiece.getEnd(); - WordTextBuffer finalTextBuf = new WordTextBuffer(); + WordTextBuffer finalTextBuf = new WordTextBuffer(appendable); // iterate through all text runs extract the text only if they haven't been // deleted @@ -157,7 +157,7 @@ runStart = currentTextStart; currentTextEnd = currentPiece.getEnd (); } else { - return finalTextBuf.toString(); + return; } } String str = currentPiece.substring(0, runEnd - currentTextStart); @@ -172,7 +172,6 @@ finalTextBuf.append(str); } } - return finalTextBuf.toString(); } /** Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/WordTextBuffer.java Sun Nov 25 14:52:09 2007 @@ -22,16 +22,16 @@ */ public class WordTextBuffer { - StringBuffer _buf; + Appendable _buf; boolean _hold; - public WordTextBuffer() + public WordTextBuffer(Appendable appendable) { - _buf = new StringBuffer(); + _buf = appendable; _hold = false; } - public void append(String text) + public void append(String text) throws java.io.IOException { char[] letters = text.toCharArray(); for (int x = 0; x < letters.length; x++) @@ -55,11 +55,6 @@ break; } } - } - - public String toString() - { - return _buf.toString(); } } Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/opendocument/OpenOfficeParser.java Sun Nov 25 14:52:09 2007 @@ -31,6 +31,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; import org.apache.tika.parser.xml.XMLParser; +import org.apache.tika.sax.AppendableAdaptor; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.log4j.Logger; @@ -101,7 +102,9 @@ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - xhtml.element("p", xp.concatOccurrence(xmlDoc, "//*", " ")); + xhtml.startElement("p"); + xp.concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml)); + xhtml.endElement("p"); xhtml.endDocument(); } Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java?rev=598075&r1=598074&r2=598075&view=diff ============================================================================== --- incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java (original) +++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/xml/XMLParser.java Sun Nov 25 14:52:09 2007 @@ -25,6 +25,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.AppendableAdaptor; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.Utils; @@ -70,13 +71,14 @@ XHTMLContentHandler 
xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - xhtml.element("p", concatOccurrence(xmlDoc, "//*", " ")); + xhtml.startElement("p"); + concatOccurrence(xmlDoc, "//*", " ", new AppendableAdaptor(xhtml)); + xhtml.endElement("p"); xhtml.endDocument(); } - public String concatOccurrence(Object xmlDoc, String xpath, String concatSep) { + public void concatOccurrence(Object xmlDoc, String xpath, String concatSep, Appendable chaineConcat) throws IOException { - StringBuilder chaineConcat = new StringBuilder(); try { JDOMXPath xp = new JDOMXPath(xpath); List ls = xp.selectNodes(xmlDoc); @@ -108,7 +110,7 @@ if (StringUtils.isNotEmpty(text)) { chaineConcat.append(text); if (ls.size() == 1) { - return chaineConcat.toString().trim(); + return; } else { if (ls.size() != j) { chaineConcat.append(' ') @@ -121,7 +123,6 @@ } catch (JaxenException j) { logger.error(j.getMessage()); } - return chaineConcat.toString().trim(); } public List getAllDocumentNs(org.jdom.Document doc) {