[ https://issues.apache.org/jira/browse/TIKA-1356?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Tyler Palsulich updated TIKA-1356: ---------------------------------- Description: If OOXMLParser is used with WriteOutContentHandler as the destination of the result, performance can degrade. The reason for this problem is that SAXException is ignored in org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.SheetTextAsHTML.endRow() and other methods of this class. For example: the source doc has many empty rows at the end of the table (about 1000000). When WriteOutContentHandler is full, WriteLimitReachedException is raised many times. Below is the stack trace of the long-running process {code} org.apache.tika.sax.ContentHandlerDecorator.ignorableWhitespace(ContentHandlerDecorator.java:157) org.apache.tika.sax.SafeContentHandler.access$101(SafeContentHandler.java:46) org.apache.tika.sax.SafeContentHandler$2.write(SafeContentHandler.java:94) org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:140) org.apache.tika.sax.SafeContentHandler.ignorableWhitespace(SafeContentHandler.java:293) org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:242) org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:275) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$SheetTextAsHTML.cell(XSSFExcelExtractorDecorator.java:203) org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.endElement(XSSFSheetXMLHandler.java:295) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$XSSFSheetInterestingPartsCapturer.endElement(XSSFExcelExtractorDecorator.java:287) org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source) org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanEndElement(Unknown Source) org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source) org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source) org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) org.apache.xerces.parsers.XML11Configuration.parse(Unknown 
Source) org.apache.xerces.parsers.XMLParser.parse(Unknown Source) org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source) org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown Source) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.processSheet(XSSFExcelExtractorDecorator.java:164) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.buildXHTML(XSSFExcelExtractorDecorator.java:120) org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:105) org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.parse(OOXMLExtractorFactory.java:112) org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:82) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) org.elasticsearch.index.mapper.attachment.AttachmentMapper$RecursiveMetadataParser.parse(AttachmentMapper.java:104) org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72) org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:102) org.apache.tika.parser.pkg.PackageParser.parseEntry(PackageParser.java:169) org.apache.tika.parser.pkg.PackageParser.parse(PackageParser.java:135) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) {code} was: If use OOXMLParser with WriteOutContentHandler as destination of result, we can recieve degraded performance. 
The reason for this problem is that SAXException is ignored in org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.SheetTextAsHTML.endRow() and other methods of this class. For example: the source doc has many empty rows at the end of the table (about 1000000). When WriteOutContentHandler is full, WriteLimitReachedException is raised many times. Below is the stack trace of the long-running process org.apache.tika.sax.ContentHandlerDecorator.ignorableWhitespace(ContentHandlerDecorator.java:157) org.apache.tika.sax.SafeContentHandler.access$101(SafeContentHandler.java:46) org.apache.tika.sax.SafeContentHandler$2.write(SafeContentHandler.java:94) org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:140) org.apache.tika.sax.SafeContentHandler.ignorableWhitespace(SafeContentHandler.java:293) org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:242) org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:275) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$SheetTextAsHTML.cell(XSSFExcelExtractorDecorator.java:203) org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.endElement(XSSFSheetXMLHandler.java:295) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$XSSFSheetInterestingPartsCapturer.endElement(XSSFExcelExtractorDecorator.java:287) org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source) org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanEndElement(Unknown Source) org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source) org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source) org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) org.apache.xerces.parsers.XMLParser.parse(Unknown Source) org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source) org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown 
Source) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.processSheet(XSSFExcelExtractorDecorator.java:164) org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.buildXHTML(XSSFExcelExtractorDecorator.java:120) org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:105) org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.parse(OOXMLExtractorFactory.java:112) org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:82) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) org.elasticsearch.index.mapper.attachment.AttachmentMapper$RecursiveMetadataParser.parse(AttachmentMapper.java:104) org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72) org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:102) org.apache.tika.parser.pkg.PackageParser.parseEntry(PackageParser.java:169) org.apache.tika.parser.pkg.PackageParser.parse(PackageParser.java:135) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) > degraded performance OOXMLParser with WriteOutContentHandler > ------------------------------------------------------------ > > Key: TIKA-1356 > URL: https://issues.apache.org/jira/browse/TIKA-1356 > Project: Tika > Issue Type: Bug > Components: parser > Affects Versions: 1.4, 1.5 > Reporter: Timofeev > > If use OOXMLParser with WriteOutContentHandler as destination of result, we > can recieve degraded performance. 
The reason for this problem is that > SAXException is ignored in > org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.SheetTextAsHTML.endRow() > and other methods of this class. > For example: the source doc has many empty rows at the end of the table (about > 1000000). When WriteOutContentHandler is full, WriteLimitReachedException > is raised many times. > Below is the stack trace of the long-running process > {code} > org.apache.tika.sax.ContentHandlerDecorator.ignorableWhitespace(ContentHandlerDecorator.java:157) > > org.apache.tika.sax.SafeContentHandler.access$101(SafeContentHandler.java:46) > > org.apache.tika.sax.SafeContentHandler$2.write(SafeContentHandler.java:94) > > org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:140) > > org.apache.tika.sax.SafeContentHandler.ignorableWhitespace(SafeContentHandler.java:293) > > org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:242) > > org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:275) > > org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$SheetTextAsHTML.cell(XSSFExcelExtractorDecorator.java:203) > > org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.endElement(XSSFSheetXMLHandler.java:295) > > org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$XSSFSheetInterestingPartsCapturer.endElement(XSSFExcelExtractorDecorator.java:287) > org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source) > > org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanEndElement(Unknown > Source) > > org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown > Source) > > org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown > Source) > org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) > org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source) > org.apache.xerces.parsers.XMLParser.parse(Unknown Source) > 
org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source) > org.apache.xerces.jaxp.SAXParserImpl$JAXPSAXParser.parse(Unknown > Source) > > org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.processSheet(XSSFExcelExtractorDecorator.java:164) > > org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator.buildXHTML(XSSFExcelExtractorDecorator.java:120) > > org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor.getXHTML(AbstractOOXMLExtractor.java:105) > > org.apache.tika.parser.microsoft.ooxml.OOXMLExtractorFactory.parse(OOXMLExtractorFactory.java:112) > > org.apache.tika.parser.microsoft.ooxml.OOXMLParser.parse(OOXMLParser.java:82) > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) > > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) > org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) > > org.elasticsearch.index.mapper.attachment.AttachmentMapper$RecursiveMetadataParser.parse(AttachmentMapper.java:104) > org.apache.tika.parser.DelegatingParser.parse(DelegatingParser.java:72) > > org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor.parseEmbedded(ParsingEmbeddedDocumentExtractor.java:102) > > org.apache.tika.parser.pkg.PackageParser.parseEntry(PackageParser.java:169) > org.apache.tika.parser.pkg.PackageParser.parse(PackageParser.java:135) > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) > org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:242) > > org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) > org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:91) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)