Xiaohong Yang created TIKA-4228:
-----------------------------------

             Summary: Tika parser crashes JVM when it gets metadata and 
embedded objects from pdf
                 Key: TIKA-4228
                 URL: https://issues.apache.org/jira/browse/TIKA-4228
             Project: Tika
          Issue Type: Bug
            Reporter: Xiaohong Yang
         Attachments: tika-config-and-sample-file.zip

[^tika-config-and-sample-file.zip]

 

We use org.apache.tika.parser.AutoDetectParser to get metadata and embedded 
objects from PDF documents, and we found that it crashes the program (the 
entire JVM) while it gets the metadata or the embedded files.

 

The sample code follows; the tika-config.xml and the sample PDF file are 
attached. Note that the sample file crashes the JVM in about 1 out of 10 runs 
in our production environment.  Sometimes the crash happens while getting the 
metadata and sometimes while extracting the embedded files (the chances are 
about 50/50).

 

The operating system is Ubuntu 20.04. Java version is 21.  Tika version is 
2.9.0 and POI version is 5.2.3.   

 

 

import org.apache.pdfbox.io.IOUtils;

import org.apache.poi.poifs.filesystem.DirectoryEntry;

import org.apache.poi.poifs.filesystem.DocumentEntry;

import org.apache.poi.poifs.filesystem.DocumentInputStream;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import org.apache.tika.config.TikaConfig;

import org.apache.tika.detect.Detector;

import org.apache.tika.extractor.EmbeddedDocumentExtractor;

import org.apache.tika.io.FilenameUtils;

import org.apache.tika.io.TikaInputStream;

import org.apache.tika.metadata.Metadata;

import org.apache.tika.metadata.TikaCoreProperties;

import org.apache.tika.mime.MediaType;

import org.apache.tika.parser.AutoDetectParser;

import org.apache.tika.parser.ParseContext;

import org.apache.tika.parser.Parser;

import org.apache.tika.sax.BodyContentHandler;

import org.xml.sax.ContentHandler;

import org.xml.sax.SAXException;

import org.xml.sax.helpers.DefaultHandler;

 

import java.io.*;

import java.net.URL;

import java.nio.file.Files;

import java.nio.file.Path;

import java.nio.file.Paths;

 

public class ProcessPdf {

    private final Path inputFile = new 
File("/home/ubuntu/testdirs/testdir_pdf/sample.pdf").toPath();

    private final Path outputDir = new 
File("/home/ubuntu/testdirs/testdir_pdf/tika_output/").toPath();

 

    private Parser parser;

    private ParseContext context;

 

 

    public static void main(String args[]) {

        try {

            System.out.println("Start");

            ProcessPdf processPdf = new ProcessPdf();

            System.out.println("Get metadata");

            processPdf.getMataData();

            System.out.println("Extract embedded files");

            processPdf.extract();

            System.out.println("End");

        }

        catch(Exception ex) {

            ex.printStackTrace();

        }

    }

 

    public ProcessPdf() {

    }

 

    public void getMataData() throws Exception {

        BodyContentHandler handler = new BodyContentHandler(-1);

 

        Metadata metadata = new Metadata();

        try (FileInputStream inputData = new 
FileInputStream(inputFile.toString())) {

            TikaConfig config = new 
TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");

            Parser autoDetectParser = new AutoDetectParser(config);

            ParseContext context = new ParseContext();

            context.set(TikaConfig.class, config);

            autoDetectParser.parse(inputData, handler, metadata, context);

        }

 

        String content = handler.toString();

    }

 

    public void extract() throws Exception {

        TikaConfig config = new 
TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");

        ProcessPdf.FileEmbeddedDocumentExtractor fileEmbeddedDocumentExtractor 
= new ProcessPdf.FileEmbeddedDocumentExtractor();

 

        parser = new AutoDetectParser(config);

        context = new ParseContext();

        context.set(Parser.class, parser);

        context.set(TikaConfig.class, config);

        context.set(EmbeddedDocumentExtractor.class, 
fileEmbeddedDocumentExtractor);

 

        URL url = inputFile.toUri().toURL();

        Metadata metadata = new Metadata();

        try (InputStream input = TikaInputStream.get(url, metadata)) {

            ContentHandler handler = new DefaultHandler();

            parser.parse(input, handler, metadata, context);

        }

    }

 

    private class FileEmbeddedDocumentExtractor implements 
EmbeddedDocumentExtractor {

        private int count = 0;

 

        public boolean shouldParseEmbedded(Metadata metadata) {

            return true;

        }

 

        public void parseEmbedded(InputStream inputStream, ContentHandler 
contentHandler, Metadata metadata,

                                  boolean outputHtml) throws SAXException, 
IOException {

            String fullFileName = 
metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);

            if (fullFileName == null) {

                fullFileName = "file" + count++;

            }

 

            TikaConfig config = null;

            try {

                config = new 
TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml");

            } catch (Exception ex) {

                ex.printStackTrace();

            }

            if (config == null) {

                return;

            }

 

            Detector detector = config.getDetector();;

            MediaType contentType = detector.detect(inputStream, metadata);

            String tikaExtension = null;

            if(fullFileName.indexOf('.') == -1 && contentType != null){

                try {

                    tikaExtension = 
config.getMimeRepository().forName(contentType.toString()).getExtension();

                } catch (Exception ex) {

                    ex.printStackTrace();

                }

 

                if (tikaExtension != null && !tikaExtension.isEmpty() ) {

                    fullFileName += tikaExtension;

                }

            }

 

            String[] fileNameSplit = fullFileName.split("/");

            String fileName = fileNameSplit[fileNameSplit.length - 1];

            File outputFile = new File(outputDir.toFile(), 
FilenameUtils.normalize(fileName));

            System.out.println("Extracting '" + fileName + " to " + outputFile);

            FileOutputStream os = null;

            try {

                os = new FileOutputStream(outputFile);

                if (inputStream instanceof TikaInputStream tin) {

                    if (tin.getOpenContainer() instanceof DirectoryEntry) {

                        try(POIFSFileSystem fs = new POIFSFileSystem()){

                            copy((DirectoryEntry) tin.getOpenContainer(), 
fs.getRoot());

                            fs.writeFilesystem(os);

                        }

                    } else {

                        IOUtils.copy(inputStream, os);

                    }

                } else {

                    IOUtils.copy(inputStream, os);

                }

            } catch (Exception ex) {

                ex.printStackTrace();

            } finally {

                if (os != null) {

                    os.flush();

                    os.close();

                }

            }

        }

 

        protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) 
throws IOException {

            for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {

                if (entry instanceof DirectoryEntry) {

                    // Need to recurse

                    DirectoryEntry newDir = 
destDir.createDirectory(entry.getName());

                    copy((DirectoryEntry) entry, newDir);

                } else {

                    // Copy entry

                    try (InputStream contents = new 
DocumentInputStream((DocumentEntry) entry)) {

                        destDir.createDocument(entry.getName(), contents);

                    }

                }

            }

        }

    }

}

 ^^ 



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to