[ https://issues.apache.org/jira/browse/TIKA-4228?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17831524#comment-17831524 ]
Tim Allison edited comment on TIKA-4228 at 3/27/24 8:29 PM: ------------------------------------------------------------ What's the exit code? -Are you on a system with an OOM killer or other process killer-, and if so, do the logs suggest that the OS killed the process? Sorry, Ubuntu, right. Anything in the logs? https://www.baeldung.com/linux/what-killed-a-process was (Author: talli...@mitre.org): What's the exit code? Are you on a system with an OOM killer or other process killer, and if so, what do its logs say? > Tika parser crashes JVM when it gets metadata and embedded objects from pdf > --------------------------------------------------------------------------- > > Key: TIKA-4228 > URL: https://issues.apache.org/jira/browse/TIKA-4228 > Project: Tika > Issue Type: Bug > Reporter: Xiaohong Yang > Priority: Major > Attachments: tika-config-and-sample-file.zip > > > [^tika-config-and-sample-file.zip] > > We use org.apache.tika.parser.AutoDetectParser to get metadata and embedded > objects from PDF documents. And we found out that it crashes the program (or > the JVM) when it gets metadata and embedded files from the sample PDF file. > > Following is the sample code and attached are the tika-config.xml and the > sample PDF file. Note that the sample file crashes the JVM in 1 out of 10 > runs in our production environment. Sometimes it happens when it gets > metadata and sometimes it happens when it extracts embedded files (the > chances are about 50/50). > > The operating system is Ubuntu 20.04. Java version is 21. Tika version is > 2.9.0 and POI version is 5.2.3. 
> > > import org.apache.pdfbox.io.IOUtils; > import org.apache.poi.poifs.filesystem.DirectoryEntry; > import org.apache.poi.poifs.filesystem.DocumentEntry; > import org.apache.poi.poifs.filesystem.DocumentInputStream; > import org.apache.poi.poifs.filesystem.POIFSFileSystem; > import org.apache.tika.config.TikaConfig; > import org.apache.tika.detect.Detector; > import org.apache.tika.extractor.EmbeddedDocumentExtractor; > import org.apache.tika.io.FilenameUtils; > import org.apache.tika.io.TikaInputStream; > import org.apache.tika.metadata.Metadata; > import org.apache.tika.metadata.TikaCoreProperties; > import org.apache.tika.mime.MediaType; > import org.apache.tika.parser.AutoDetectParser; > import org.apache.tika.parser.ParseContext; > import org.apache.tika.parser.Parser; > import org.apache.tika.sax.BodyContentHandler; > import org.xml.sax.ContentHandler; > import org.xml.sax.SAXException; > import org.xml.sax.helpers.DefaultHandler; > > import java.io.*; > import java.net.URL; > import java.nio.file.Files; > import java.nio.file.Path; > import java.nio.file.Paths; > > public class ProcessPdf { > private final Path inputFile = new > File("/home/ubuntu/testdirs/testdir_pdf/sample.pdf").toPath(); > private final Path outputDir = new > File("/home/ubuntu/testdirs/testdir_pdf/tika_output/").toPath(); > > private Parser parser; > private ParseContext context; > > > public static void main(String args[]) { > try > { System.out.println("Start"); ProcessPdf processPdf > = new ProcessPdf(); System.out.println("Get metadata"); > processPdf.getMataData(); System.out.println("Extract embedded > files"); processPdf.extract(); > System.out.println("End"); } > catch(Exception ex) > { ex.printStackTrace(); } > } > > public ProcessPdf() > { } > > public void getMataData() throws Exception { > BodyContentHandler handler = new BodyContentHandler(-1); > > Metadata metadata = new Metadata(); > try (FileInputStream inputData = new > FileInputStream(inputFile.toString())) > { 
TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml"); > Parser autoDetectParser = new AutoDetectParser(config); > ParseContext context = new ParseContext(); > context.set(TikaConfig.class, config); > autoDetectParser.parse(inputData, handler, metadata, context); } > > String content = handler.toString(); > } > > public void extract() throws Exception { > TikaConfig config = new > TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml"); > ProcessPdf.FileEmbeddedDocumentExtractor > fileEmbeddedDocumentExtractor = new > ProcessPdf.FileEmbeddedDocumentExtractor(); > > parser = new AutoDetectParser(config); > context = new ParseContext(); > context.set(Parser.class, parser); > context.set(TikaConfig.class, config); > context.set(EmbeddedDocumentExtractor.class, > fileEmbeddedDocumentExtractor); > > URL url = inputFile.toUri().toURL(); > Metadata metadata = new Metadata(); > try (InputStream input = TikaInputStream.get(url, metadata)) > { ContentHandler handler = new DefaultHandler(); > parser.parse(input, handler, metadata, context); } > } > > private class FileEmbeddedDocumentExtractor implements > EmbeddedDocumentExtractor { > private int count = 0; > > public boolean shouldParseEmbedded(Metadata metadata) > { return true; } > > public void parseEmbedded(InputStream inputStream, ContentHandler > contentHandler, Metadata metadata, > boolean outputHtml) throws SAXException, > IOException { > String fullFileName = > metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); > if (fullFileName == null) > { fullFileName = "file" + count++; } > > TikaConfig config = null; > try > { config = new > TikaConfig("/home/ubuntu/testdirs/testdir_pdf/tika-config.xml"); } > catch (Exception ex) > { ex.printStackTrace(); } > if (config == null) \{ return; } > > Detector detector = config.getDetector();; > MediaType contentType = detector.detect(inputStream, metadata); > String tikaExtension = null; > if(fullFileName.indexOf('.') == -1 && 
contentType != null){ > try \{ tikaExtension = > config.getMimeRepository().forName(contentType.toString()).getExtension(); > } catch (Exception ex) \{ > ex.printStackTrace(); } > > if (tikaExtension != null && !tikaExtension.isEmpty() ) \{ > fullFileName += tikaExtension; } > } > > String[] fileNameSplit = fullFileName.split("/"); > String fileName = fileNameSplit[fileNameSplit.length - 1]; > File outputFile = new File(outputDir.toFile(), > FilenameUtils.normalize(fileName)); > System.out.println("Extracting '" + fileName + " to " + > outputFile); > FileOutputStream os = null; > try { > os = new FileOutputStream(outputFile); > if (inputStream instanceof TikaInputStream tin) { > if (tin.getOpenContainer() instanceof DirectoryEntry) { > try(POIFSFileSystem fs = new POIFSFileSystem())\{ > copy((DirectoryEntry) tin.getOpenContainer(), > fs.getRoot()); fs.writeFilesystem(os); > } > } else \{ > IOUtils.copy(inputStream, os); } > } else \{ IOUtils.copy(inputStream, os); > } > } catch (Exception ex) \{ ex.printStackTrace(); > } > finally { > if (os != null) > { os.flush(); os.close(); > } > } > } > > protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) > throws IOException { > for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) { > if (entry instanceof DirectoryEntry) > { // Need to recurse DirectoryEntry > newDir = destDir.createDirectory(entry.getName()); > copy((DirectoryEntry) entry, newDir); } > else { > // Copy entry > try (InputStream contents = new > DocumentInputStream((DocumentEntry) entry)) > { destDir.createDocument(entry.getName(), contents); > } > } > } > } > } > } > ^^ -- This message was sent by Atlassian Jira (v8.20.10#820010)