[ 
https://issues.apache.org/jira/browse/TIKA-3722?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

sagi shechter updated TIKA-3722:
--------------------------------
    Description: 
The file is ~3 MB; parsing it fails with Tika app 2.3.0:
{code:java}
java.lang.OutOfMemoryError: Java heap space
    at java.base/java.util.Arrays.copyOf(Arrays.java:3817)
    at java.base/java.util.BitSet.ensureCapacity(BitSet.java:338)
    at java.base/java.util.BitSet.expandTo(BitSet.java:353)
    at java.base/java.util.BitSet.set(BitSet.java:448)
    at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.characters(BoilerpipeHTMLContentHandler.java:267)
    at org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler.characters(BoilerpipeContentHandler.java:165)
    at org.apache.tika.sax.TeeContentHandler.characters(TeeContentHandler.java:97)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.SecureContentHandler.characters(SecureContentHandler.java:253)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.SafeContentHandler.access$201(SafeContentHandler.java:47)
    at org.apache.tika.sax.SafeContentHandler.lambda$new$0(SafeContentHandler.java:57)
    at org.apache.tika.sax.SafeContentHandler$$Lambda$515/0x0000000800506c40.write(Unknown Source)
    at org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:106)
    at org.apache.tika.sax.SafeContentHandler.characters(SafeContentHandler.java:250)
    at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:270)
    at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:295)
    at org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$SheetTextAsHTML.cell(XSSFExcelExtractorDecorator.java:473)
    at org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.outputCell(XSSFSheetXMLHandler.java:444)
    at org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.endElement(XSSFSheetXMLHandler.java:317)
    at org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$XSSFSheetInterestingPartsCapturer.endElement(XSSFExcelExtractorDecorator.java:561)
    at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:132)
    at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
    at org.apache.xerces.impl.XMLNSDocumentScannerImpl.scanEndElement(Unknown Source)
    at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source)
    at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
    at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
    at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
    at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
    at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
{code}
 

  was:
 
{code:java}
The full exception stack trace is included below:
java.lang.OutOfMemoryError: Java heap space
    at java.base/java.util.Arrays.copyOf(Arrays.java:3817)
    at java.base/java.util.BitSet.ensureCapacity(BitSet.java:338)
    at java.base/java.util.BitSet.expandTo(BitSet.java:353)
    at java.base/java.util.BitSet.set(BitSet.java:448)
    at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.characters(BoilerpipeHTMLContentHandler.java:267)
    at org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler.characters(BoilerpipeContentHandler.java:165)
    at org.apache.tika.sax.TeeContentHandler.characters(TeeContentHandler.java:97)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.SecureContentHandler.characters(SecureContentHandler.java:253)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
    at org.apache.tika.sax.SafeContentHandler.access$201(SafeContentHandler.java:47)
    at org.apache.tika.sax.SafeContentHandler.lambda$new$0(SafeContentHandler.java:57)
    at org.apache.tika.sax.SafeContentHandler$$Lambda$515/0x0000000800506c40.write(Unknown Source)
    at org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:106)
    at org.apache.tika.sax.SafeContentHandler.characters(SafeContentHandler.java:250)
    at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:270)
    at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:295)
    at org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$SheetTextAsHTML.cell(XSSFExcelExtractorDecorator.java:473)
    at org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.outputCell(XSSFSheetXMLHandler.java:444)
    at org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.endElement(XSSFSheetXMLHandler.java:317)
    at org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$XSSFSheetInterestingPartsCapturer.endElement(XSSFExcelExtractorDecorator.java:561)
    at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:132)
    at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
    at org.apache.xerces.impl.XMLNSDocumentScannerImpl.scanEndElement(Unknown Source)
    at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source)
    at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
    at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
    at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
    at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
    at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
{code}
 


> OOM exception on xlsx parsing
> -----------------------------
>
>                 Key: TIKA-3722
>                 URL: https://issues.apache.org/jira/browse/TIKA-3722
>             Project: Tika
>          Issue Type: Bug
>            Reporter: sagi shechter
>            Priority: Major
>
> The file is ~3 MB; parsing it fails with Tika app 2.3.0:
> {code:java}
> java.lang.OutOfMemoryError: Java heap space
>     at java.base/java.util.Arrays.copyOf(Arrays.java:3817)
>     at java.base/java.util.BitSet.ensureCapacity(BitSet.java:338)
>     at java.base/java.util.BitSet.expandTo(BitSet.java:353)
>     at java.base/java.util.BitSet.set(BitSet.java:448)
>     at de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler.characters(BoilerpipeHTMLContentHandler.java:267)
>     at org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler.characters(BoilerpipeContentHandler.java:165)
>     at org.apache.tika.sax.TeeContentHandler.characters(TeeContentHandler.java:97)
>     at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
>     at org.apache.tika.sax.SecureContentHandler.characters(SecureContentHandler.java:253)
>     at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
>     at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
>     at org.apache.tika.sax.ContentHandlerDecorator.characters(ContentHandlerDecorator.java:141)
>     at org.apache.tika.sax.SafeContentHandler.access$201(SafeContentHandler.java:47)
>     at org.apache.tika.sax.SafeContentHandler.lambda$new$0(SafeContentHandler.java:57)
>     at org.apache.tika.sax.SafeContentHandler$$Lambda$515/0x0000000800506c40.write(Unknown Source)
>     at org.apache.tika.sax.SafeContentHandler.filter(SafeContentHandler.java:106)
>     at org.apache.tika.sax.SafeContentHandler.characters(SafeContentHandler.java:250)
>     at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:270)
>     at org.apache.tika.sax.XHTMLContentHandler.characters(XHTMLContentHandler.java:295)
>     at org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$SheetTextAsHTML.cell(XSSFExcelExtractorDecorator.java:473)
>     at org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.outputCell(XSSFSheetXMLHandler.java:444)
>     at org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.endElement(XSSFSheetXMLHandler.java:317)
>     at org.apache.tika.parser.microsoft.ooxml.XSSFExcelExtractorDecorator$XSSFSheetInterestingPartsCapturer.endElement(XSSFExcelExtractorDecorator.java:561)
>     at org.apache.tika.sax.ContentHandlerDecorator.endElement(ContentHandlerDecorator.java:132)
>     at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
>     at org.apache.xerces.impl.XMLNSDocumentScannerImpl.scanEndElement(Unknown Source)
>     at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl$FragmentContentDispatcher.dispatch(Unknown Source)
>     at org.apache.xerces.impl.XMLDocumentFragmentScannerImpl.scanDocument(Unknown Source)
>     at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
>     at org.apache.xerces.parsers.XML11Configuration.parse(Unknown Source)
>     at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
>     at org.apache.xerces.parsers.AbstractSAXParser.parse(Unknown Source)
> {code}
>  



--
This message was sent by Atlassian Jira
(v8.20.7#820007)

Reply via email to