[jira] [Comment Edited] (TIKA-4228) Tika parser crashes JVM when it gets metadata and embedded objects from pdf

Tim Allison (Jira) Wed, 27 Mar 2024 13:26:05 -0700


    [ 
https://issues.apache.org/jira/browse/TIKA-4228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17831518#comment-17831518
 ]


Tim Allison edited comment on TIKA-4228 at 3/27/24 8:25 PM:
------------------------------------------------------------

As I think about it, this code wouldn't extract all of the embedded images in 
the PDF...so that's not a concern...you'd have to turn on extractInlineImages.

I can run getMetadata() with -Xmx256m with no problems with the current 
branch_2x.

If I roll back to PDFBox 2.0.29, which we used in Tika 2.9.0, and run Java 
Corretto 21, I'm still not able to reproduce any crashes with metadata or file 
extraction, even if I multithread it and run continuous loops.


was (Author: talli...@mitre.org):
As I think about it, this code wouldn't extract all of the embedded images in 
the PDF...so that's not a concern...you'd have to turn on extractInlineImages.

I can run getMetadata() with -Xmx256m with no problems with the current 
branch_2x.

> Tika parser crashes JVM when it gets metadata and embedded objects from pdf
> ---------------------------------------------------------------------------
>
>                 Key: TIKA-4228
>                 URL: https://issues.apache.org/jira/browse/TIKA-4228
>             Project: Tika
>          Issue Type: Bug
>            Reporter: Xiaohong Yang
>            Priority: Major
>         Attachments: tika-config-and-sample-file.zip
>
>
> [^tika-config-and-sample-file.zip]
>  
> We use org.apache.tika.parser.AutoDetectParser to get metadata and embedded 
> objects from pdf documents.  And we found out that it crashes the program (or 
> the JVM) when it gets metadata and embedded files from the sample pdf file.
>  
> Following is the sample code and attached is the tika-config.xml and the 
> sample pdf file. Note that the sample file crashes the JVM in 1 out of 10 
> runs in our production environment.  Sometimes it happens when it gets 
> metadata and sometimes it happens when it extracts embedded files (the 
> chances are about 50/50).
>  
> The operating system is Ubuntu 20.04. Java version is 21.  Tika version is 
> 2.9.0 and POI version is 5.2.3.   
>  
>  
> import org.apache.pdfbox.io.IOUtils;
> import org.apache.poi.poifs.filesystem.DirectoryEntry;
> import org.apache.poi.poifs.filesystem.DocumentEntry;
> import org.apache.poi.poifs.filesystem.DocumentInputStream;
> import org.apache.poi.poifs.filesystem.POIFSFileSystem;
> import org.apache.tika.config.TikaConfig;
> import org.apache.tika.detect.Detector;
> import org.apache.tika.extractor.EmbeddedDocumentExtractor;
> import org.apache.tika.io.FilenameUtils;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.metadata.TikaCoreProperties;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.AutoDetectParser;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.apache.tika.sax.BodyContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> import org.xml.sax.helpers.DefaultHandler;
>  
> import java.io.*;
> import java.net.URL;
> import java.nio.file.Files;
> import java.nio.file.Path;
> import java.nio.file.Paths;
>  
> /**
>  * Sample driver from the issue report: parses one PDF with Tika's
>  * {@link AutoDetectParser}, first to collect metadata and body text, then to
>  * write every embedded document into an output directory.
>  *
>  * <p>All locations are hard-coded; the sample is meant to be run as-is on the
>  * reporter's machine.
>  */
> public class ProcessPdf {
>     /** Tika configuration file used by every parse in this sample. */
>     private static final String TIKA_CONFIG_PATH =
>             "/home/ubuntu/testdirs/testdir_pdf/tika-config.xml";
>
>     private final Path inputFile =
>             new File("/home/ubuntu/testdirs/testdir_pdf/sample.pdf").toPath();
>     private final Path outputDir =
>             new File("/home/ubuntu/testdirs/testdir_pdf/tika_output/").toPath();
>
>     private Parser parser;
>     private ParseContext context;
>
>     /**
>      * Runs the metadata pass followed by the embedded-file extraction pass.
>      * Any exception is printed rather than propagated.
>      */
>     public static void main(String[] args) {
>         try {
>             System.out.println("Start");
>             ProcessPdf processPdf = new ProcessPdf();
>             System.out.println("Get metadata");
>             processPdf.getMataData();
>             System.out.println("Extract embedded files");
>             processPdf.extract();
>             System.out.println("End");
>         } catch (Exception ex) {
>             ex.printStackTrace();
>         }
>     }
>
>     public ProcessPdf() {
>     }
>
>     /**
>      * Parses the input file with an unlimited {@link BodyContentHandler} and a
>      * fresh {@link Metadata} object. The results are not returned; the method
>      * only exercises the parse.
>      *
>      * <p>The misspelt name is kept so existing callers keep compiling.
>      *
>      * @throws Exception if the configuration cannot be loaded or the parse fails
>      */
>     public void getMataData() throws Exception {
>         BodyContentHandler handler = new BodyContentHandler(-1);
>         Metadata metadata = new Metadata();
>
>         TikaConfig config = new TikaConfig(TIKA_CONFIG_PATH);
>         Parser autoDetectParser = new AutoDetectParser(config);
>         ParseContext parseContext = new ParseContext();
>         parseContext.set(TikaConfig.class, config);
>
>         // A Path-backed TikaInputStream replaces the plain FileInputStream.
>         // NOTE(review): Tika documents TikaInputStream as the preferred input
>         // for file-based parsing - confirm against the Tika version in use.
>         try (InputStream inputData = TikaInputStream.get(inputFile)) {
>             autoDetectParser.parse(inputData, handler, metadata, parseContext);
>         }
>     }
>
>     /**
>      * Parses the input file with a {@link FileEmbeddedDocumentExtractor}
>      * registered in the parse context, so each embedded document is written
>      * to {@code outputDir}.
>      *
>      * @throws Exception if the configuration cannot be loaded, the output
>      *                   directory cannot be created, or the parse fails
>      */
>     public void extract() throws Exception {
>         // Loaded once here and shared with the extractor, instead of being
>         // re-read from disk for every embedded document.
>         TikaConfig config = new TikaConfig(TIKA_CONFIG_PATH);
>
>         // Without the directory every FileOutputStream below would fail.
>         Files.createDirectories(outputDir);
>
>         parser = new AutoDetectParser(config);
>         context = new ParseContext();
>         context.set(Parser.class, parser);
>         context.set(TikaConfig.class, config);
>         context.set(EmbeddedDocumentExtractor.class,
>                 new FileEmbeddedDocumentExtractor(config));
>
>         URL url = inputFile.toUri().toURL();
>         Metadata metadata = new Metadata();
>         try (InputStream input = TikaInputStream.get(url, metadata)) {
>             ContentHandler handler = new DefaultHandler();
>             parser.parse(input, handler, metadata, context);
>         }
>     }
>
>     /**
>      * Writes every embedded document it is handed to a file under the
>      * enclosing instance's {@code outputDir}.
>      */
>     private class FileEmbeddedDocumentExtractor implements
>             EmbeddedDocumentExtractor {
>         private final TikaConfig config;
>         // Counter for generated names when an embedded document has no usable one.
>         private int count = 0;
>
>         FileEmbeddedDocumentExtractor(TikaConfig config) {
>             this.config = config;
>         }
>
>         /** Accepts every embedded document. */
>         @Override
>         public boolean shouldParseEmbedded(Metadata metadata) {
>             return true;
>         }
>
>         /**
>          * Saves one embedded document to disk.
>          *
>          * <p>The file name comes from the resource-name metadata key, or is
>          * generated as {@code file<N>}. If the name has no dot, the extension
>          * registered for the detected media type is appended. Write failures
>          * are printed and do not abort the surrounding parse.
>          *
>          * @param inputStream    content of the embedded document
>          * @param contentHandler unused
>          * @param metadata       metadata of the embedded document
>          * @param outputHtml     unused
>          */
>         @Override
>         public void parseEmbedded(InputStream inputStream,
>                                   ContentHandler contentHandler,
>                                   Metadata metadata,
>                                   boolean outputHtml)
>                 throws SAXException, IOException {
>             String fullFileName =
>                     metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
>             if (fullFileName == null) {
>                 fullFileName = "file" + count++;
>             }
>
>             // Detection runs unconditionally, as in the original sample.
>             // NOTE(review): a detector may attach an opened container to a
>             // TikaInputStream, which the write path below checks for -
>             // confirm before making this call conditional.
>             Detector detector = config.getDetector();
>             MediaType contentType = detector.detect(inputStream, metadata);
>             if (fullFileName.indexOf('.') == -1 && contentType != null) {
>                 fullFileName += extensionFor(contentType);
>             }
>
>             String fileName = lastPathSegment(fullFileName);
>             if (fileName.isEmpty()) {
>                 // e.g. a resource name of "/" - previously this crashed with
>                 // ArrayIndexOutOfBoundsException on the result of split("/").
>                 fileName = "file" + count++;
>             }
>             File outputFile = new File(outputDir.toFile(),
>                     FilenameUtils.normalize(fileName));
>             System.out.println("Extracting '" + fileName + " to " + outputFile);
>
>             try (OutputStream os = new FileOutputStream(outputFile)) {
>                 if (inputStream instanceof TikaInputStream tin
>                         && tin.getOpenContainer() instanceof DirectoryEntry sourceRoot) {
>                     // The stream carries an already-opened POIFS directory:
>                     // rebuild it as a new file system and write that out.
>                     try (POIFSFileSystem fs = new POIFSFileSystem()) {
>                         copy(sourceRoot, fs.getRoot());
>                         fs.writeFilesystem(os);
>                     }
>                 } else {
>                     IOUtils.copy(inputStream, os);
>                 }
>             } catch (IOException | RuntimeException ex) {
>                 // Best effort: one unwritable attachment must not stop the
>                 // extraction of the remaining ones.
>                 ex.printStackTrace();
>             }
>         }
>
>         /**
>          * Looks up the extension registered for a media type in the
>          * configuration's MIME repository.
>          *
>          * @return the extension, or an empty string if none is available
>          */
>         private String extensionFor(MediaType contentType) {
>             try {
>                 String extension = config.getMimeRepository()
>                         .forName(contentType.toString())
>                         .getExtension();
>                 return extension == null ? "" : extension;
>             } catch (org.apache.tika.mime.MimeTypeException ex) {
>                 ex.printStackTrace();
>                 return "";
>             }
>         }
>
>         /**
>          * Returns the text after the last {@code '/'}, ignoring trailing
>          * slashes; empty if the name is empty or consists only of slashes.
>          */
>         private String lastPathSegment(String name) {
>             int end = name.length();
>             while (end > 0 && name.charAt(end - 1) == '/') {
>                 end--;
>             }
>             int start = name.lastIndexOf('/', end - 1) + 1;
>             return name.substring(start, end);
>         }
>
>         /**
>          * Recursively copies every entry of {@code sourceDir} into
>          * {@code destDir}, creating sub-directories as needed.
>          *
>          * @throws IOException if an entry cannot be read or created
>          */
>         protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
>                 throws IOException {
>             for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
>                 if (entry instanceof DirectoryEntry childDir) {
>                     // Need to recurse
>                     DirectoryEntry newDir =
>                             destDir.createDirectory(entry.getName());
>                     copy(childDir, newDir);
>                 } else {
>                     // Copy entry
>                     try (InputStream contents =
>                                  new DocumentInputStream((DocumentEntry) entry)) {
>                         destDir.createDocument(entry.getName(), contents);
>                     }
>                 }
>             }
>         }
>     }
> }
>  ^^ 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

[jira] [Comment Edited] (TIKA-4228) Tika parser crashes JVM when it gets metadata and embedded objects from pdf

Reply via email to