This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit da055766a892254199ff8253834b871548a9fbef Author: tallison <[email protected]> AuthorDate: Tue Mar 23 17:43:00 2021 -0400 TIKA-3334 -- fix thread safety bug in handling embedded docs in open office parser --- CHANGES.txt | 2 ++ .../apache/tika/parser/odf/OpenDocumentParser.java | 30 +++++++++-------- .../org/apache/tika/parser/odf/ODFParserTest.java | 39 ++++++++++++++++++++++ 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 37dcabe..c16f37b 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Release 1.26 - ??/??/???? + * Fix thread safety bug in OpenOffice parser (TIKA-3334). + * The "writeLimit" header now pertains to the combined characters written per container document (and embedded documents) in the /rmeta endpoint in tika-server (TIKA-3325); it no longer functions only diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java index babaac2..fafefd6 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java @@ -65,7 +65,7 @@ public class OpenDocumentParser extends AbstractParser { private static final long serialVersionUID = -6410276875438618287L; private static final Set<MediaType> SUPPORTED_TYPES = - Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList( + Collections.unmodifiableSet(new HashSet<>(Arrays.asList( MediaType.application("vnd.sun.xml.writer"), MediaType.application("vnd.oasis.opendocument.text"), MediaType.application("vnd.oasis.opendocument.graphics"), @@ -103,8 +103,6 @@ public class OpenDocumentParser extends AbstractParser { private static final String META_NAME = "meta.xml"; private static final String MANIFEST_NAME = "META-INF/manifest.xml"; - private EmbeddedDocumentUtil embeddedDocumentUtil; - private Parser meta = new OpenDocumentMetaParser(); private Parser content = new OpenDocumentContentParser(); @@ -136,7 +134,7 @@ public class OpenDocumentParser extends AbstractParser { Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - embeddedDocumentUtil = new EmbeddedDocumentUtil(context); + EmbeddedDocumentUtil embeddedDocumentUtil = new EmbeddedDocumentUtil(context); // Open the Zip stream // Use a File if we can, and an already open zip is even better @@ -167,14 +165,14 @@ public class OpenDocumentParser extends AbstractParser { try { if (zipFile != null) { try { - handleZipFile(zipFile, metadata, context, handler); + handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil); } finally { //Do we want to close silently == catch an exception here? zipFile.close(); } } else { try { - handleZipStream(zipStream, metadata, context, handler); + handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil); } finally { //Do we want to close silently == catch an exception here? zipStream.close(); @@ -200,7 +198,9 @@ public class OpenDocumentParser extends AbstractParser { private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, - EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException { + EndDocumentShieldingContentHandler handler, + EmbeddedDocumentUtil embeddedDocumentUtil) throws IOException, + TikaException, SAXException { ZipEntry entry = zipStream.getNextEntry(); if (entry == null) { throw new IOException("No entries found in ZipInputStream"); @@ -208,7 +208,7 @@ public class OpenDocumentParser extends AbstractParser { List<SAXException> saxExceptions = new ArrayList<>(); do { try { - handleZipEntry(entry, zipStream, metadata, context, handler); + handleZipEntry(entry, zipStream, metadata, context, handler, embeddedDocumentUtil); } catch (SAXException e) { if (e.getCause() instanceof EncryptedDocumentException) { throw (EncryptedDocumentException)e.getCause(); @@ -225,12 +225,14 @@ public class OpenDocumentParser extends AbstractParser { } private void handleZipFile(ZipFile zipFile, Metadata metadata, - ParseContext context, EndDocumentShieldingContentHandler handler) + ParseContext context, EndDocumentShieldingContentHandler handler, + EmbeddedDocumentUtil embeddedDocumentUtil) throws IOException, TikaException, SAXException { ZipEntry entry = zipFile.getEntry(MANIFEST_NAME); if (entry != null) { - handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); + handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler, + embeddedDocumentUtil); } // If we can, process the metadata first, then the // rest of the file afterwards (TIKA-1353) @@ -238,19 +240,21 @@ public class OpenDocumentParser extends AbstractParser { entry = zipFile.getEntry(META_NAME); if (entry != null) { handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, - handler); + handler, embeddedDocumentUtil); } Enumeration<? extends ZipEntry> entries = zipFile.entries(); while (entries.hasMoreElements()) { entry = entries.nextElement(); if (!META_NAME.equals(entry.getName())) { - handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler); + handleZipEntry(entry, zipFile.getInputStream(entry), metadata, context, handler, + embeddedDocumentUtil); } } } private void handleZipEntry(ZipEntry entry, InputStream zip, Metadata metadata, - ParseContext context, ContentHandler handler) + ParseContext context, ContentHandler handler, + EmbeddedDocumentUtil embeddedDocumentUtil) throws IOException, SAXException, TikaException { if (entry == null) { return; diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java index 0affa14..6006548 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java @@ -24,6 +24,11 @@ import java.io.InputStream; import java.nio.file.Path; import java.nio.file.Paths; import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -652,4 +657,38 @@ public class ODFParserTest extends TikaTest { parseContext.set(Parser.class, new EmptyParser()); return parseContext; } + + @Test + public void testMultiThreaded() throws Exception { + int numThreads = 10; + ExecutorService executorService = Executors.newFixedThreadPool(numThreads); + ExecutorCompletionService<Integer> executorCompletionService = + new ExecutorCompletionService<>(executorService); + + for (int i = 0; i < numThreads; i++) { + executorCompletionService.submit(new Callable<Integer>() { + @Override + public Integer call() throws Exception { + for (int i = 0; i < 10; i++) { + List<Metadata> metadataList = getRecursiveMetadata("testODTEmbedded.odt"); + assertEquals(3, metadataList.size()); + assertEquals("THUMBNAIL", + metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE)); + } + return 1; + } + }); + } + + try { + int finished = 0; + while (finished < numThreads) { + Future<Integer> future = executorCompletionService.take(); + future.get(); + finished++; + } + } finally { + executorService.shutdownNow(); + } + } }
