This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4399 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5053c7dff439d0e74100e9f81a1e588a38fb1134 Author: tallison <[email protected]> AuthorDate: Tue Apr 8 13:20:43 2025 -0400 TIKA-4399 -- require TikaInputStream for embedded documents --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 14 +++---- .../tika/extractor/EmbeddedDocumentExtractor.java | 4 +- .../tika/extractor/EmbeddedDocumentUtil.java | 5 +-- .../ParsingEmbeddedDocumentExtractor.java | 20 +++------- .../apache/tika/extractor/RUnpackExtractor.java | 20 ++++------ .../java/org/apache/tika/io/TikaInputStream.java | 45 ++++++++++++++++++++++ .../org/apache/tika/renderer/RenderResult.java | 3 +- .../org/apache/tika/parser/mock/MockParser.java | 4 +- .../apache/tika/example/ExtractEmbeddedFiles.java | 3 +- .../tika/parser/apple/AppleSingleFileParser.java | 18 +++++++-- .../parser/iwork/iwana/IWork13PackageParser.java | 30 ++++++++------- .../org/apache/tika/parser/crypto/TSDParser.java | 4 +- .../org/apache/tika/parser/html/HtmlHandler.java | 20 +++++----- .../apache/tika/parser/jdbc/JDBCTableReader.java | 6 +-- .../tika/parser/mail/MailContentHandler.java | 8 ++-- .../org/apache/tika/parser/mbox/MboxParser.java | 5 ++- .../apache/tika/parser/microsoft/EMFParser.java | 8 ++-- .../tika/parser/microsoft/JackcessExtractor.java | 6 +-- .../apache/tika/parser/microsoft/OfficeParser.java | 9 ++--- .../tika/parser/microsoft/OutlookExtractor.java | 11 +++--- .../tika/parser/microsoft/chm/ChmParser.java | 9 ++--- .../tika/parser/microsoft/libpst/EmailVisitor.java | 5 +-- .../microsoft/ooxml/xps/XPSExtractorDecorator.java | 13 ++++--- .../parser/microsoft/pst/PSTMailItemParser.java | 6 +-- .../parser/odf/FlatOpenDocumentMacroHandler.java | 5 +-- .../tika/parser/odf/OpenDocumentBodyHandler.java | 5 +-- .../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 38 ++++++++---------- .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 5 +-- .../java/org/apache/tika/parser/pdf/PDFParser.java | 4 +- .../apache/tika/parser/pdf/PDFRenderingTest.java | 11 +++--- .../apache/tika/parser/pkg/CompressorParser.java | 5 ++- .../java/org/apache/tika/parser/pkg/RarParser.java | 4 +- .../org/apache/tika/parser/pkg/UnrarParser.java | 4 +- .../org/apache/tika/parser/http/HttpParser.java | 11 +++--- .../org/apache/tika/parser/wacz/WACZParser.java | 4 +- .../org/apache/tika/parser/warc/WARCParser.java | 2 +- .../apache/tika/parser/xml/FictionBookParser.java | 8 ++-- .../org/apache/tika/parser/pkg/ZipParserTest.java | 3 +- .../server/core/resource/UnpackerResource.java | 8 ++-- 39 files changed, 214 insertions(+), 179 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 74c3cbf44..448feeff3 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -1086,12 +1086,10 @@ public class TikaCLI { return true; } - public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { + @Override + public void parseEmbedded(TikaInputStream tis, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { - if (!inputStream.markSupported()) { - inputStream = TikaInputStream.get(inputStream); - } - MediaType contentType = detector.detect(inputStream, metadata); + MediaType contentType = detector.detect(tis, metadata); String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); File outputFile = null; @@ -1110,12 +1108,12 @@ public class TikaCLI { System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile); try (FileOutputStream os = new FileOutputStream(outputFile)) { - if (embeddedStreamTranslator.shouldTranslate(inputStream, metadata)) { - try (InputStream translated = embeddedStreamTranslator.translate(inputStream, metadata)) { + if (embeddedStreamTranslator.shouldTranslate(tis, metadata)) { + try (InputStream translated = embeddedStreamTranslator.translate(tis, metadata)) { IOUtils.copy(translated, os); } } else { - IOUtils.copy(inputStream, os); + IOUtils.copy(tis, os); } } catch (Exception e) { // diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java index f80420868..3f977e3db 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java @@ -18,11 +18,11 @@ package org.apache.tika.extractor; import java.io.IOException; -import java.io.InputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; public interface EmbeddedDocumentExtractor { @@ -39,6 +39,6 @@ public interface EmbeddedDocumentExtractor { * @throws java.io.IOException */ void parseEmbedded( - InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) + TikaInputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException; } diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java index d6e2c28a8..4d73545c1 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java @@ -18,7 +18,6 @@ package org.apache.tika.extractor; import java.io.IOException; -import java.io.InputStream; import java.io.Serializable; import org.xml.sax.ContentHandler; @@ -219,9 +218,9 @@ public class EmbeddedDocumentUtil implements Serializable { return embeddedDocumentExtractor; } - public void parseEmbedded(InputStream inputStream, ContentHandler handler, Metadata metadata, + public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata, boolean outputHtml) throws IOException, SAXException { - embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml); + embeddedDocumentExtractor.parseEmbedded(tis, handler, metadata, outputHtml); } /** diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java index 4f2331359..21117b33b 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java @@ -21,9 +21,7 @@ import static org.apache.tika.sax.XHTMLContentHandler.XHTML; import java.io.File; import java.io.FilenameFilter; import java.io.IOException; -import java.io.InputStream; -import org.apache.commons.io.input.CloseShieldInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -31,7 +29,6 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -82,7 +79,7 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract @Override public void parseEmbedded( - InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) + TikaInputStream tis, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { if (outputHtml) { AttributesImpl attributes = new AttributesImpl(); @@ -99,16 +96,9 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract } // Use the delegate parser to parse this entry - try (TemporaryResources tmp = new TemporaryResources()) { - final TikaInputStream newStream = - TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata); - if (stream instanceof TikaInputStream) { - final Object container = ((TikaInputStream) stream).getOpenContainer(); - if (container != null) { - newStream.setOpenContainer(container); - } - } - DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)), + try { + tis.setCloseShield(); + DELEGATING_PARSER.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(handler)), metadata, context); } catch (EncryptedDocumentException ede) { recordException(ede, context); @@ -118,6 +108,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract throw new IOException(e); } catch (TikaException e) { recordException(e, context); + } finally { + tis.removeCloseShield(); } if (outputHtml) { diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java index 0e5928845..cbd560c50 100644 --- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java +++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java @@ -35,7 +35,6 @@ import org.apache.tika.exception.CorruptedFileException; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; import org.apache.tika.io.BoundedInputStream; -import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -68,7 +67,7 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { @Override public void parseEmbedded( - InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) + TikaInputStream tis, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { if (outputHtml) { AttributesImpl attributes = new AttributesImpl(); @@ -85,20 +84,13 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { } // Use the delegate parser to parse this entry - try (TemporaryResources tmp = new TemporaryResources()) { - final TikaInputStream newStream = - TikaInputStream.get(CloseShieldInputStream.wrap(stream), tmp, metadata); - if (stream instanceof TikaInputStream) { - final Object container = ((TikaInputStream) stream).getOpenContainer(); - if (container != null) { - newStream.setOpenContainer(container); - } - } + try { EmbeddedDocumentBytesHandler bytesHandler = context.get(EmbeddedDocumentBytesHandler.class); + tis.setCloseShield(); if (bytesHandler != null) { - parseWithBytes(newStream, handler, metadata); + parseWithBytes(tis, handler, metadata); } else { - parse(newStream, handler, metadata); + parse(tis, handler, metadata); } } catch (EncryptedDocumentException ede) { recordException(ede, context); @@ -108,6 +100,8 @@ public class RUnpackExtractor extends ParsingEmbeddedDocumentExtractor { throw new IOException(e); } catch (TikaException e) { recordException(e, context); + } finally { + tis.removeCloseShield(); } if (outputHtml) { diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 77e09226a..ea48487a0 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -119,6 +119,11 @@ public class TikaInputStream extends TaggedInputStream { private int consecutiveEOFs = 0; private byte[] skipBuffer; + /** + * If the stream should be shielded from closing + */ + private int closeShieldDepth = 0; + //suffix of the file if known. This is used to create temp files //with the right suffixes. This should include the initial . as in ".doc" private String suffix = null; @@ -266,6 +271,32 @@ public class TikaInputStream extends TaggedInputStream { return get(stream, new TemporaryResources(), null); } + /** + * Casts or wraps the given stream to a TikaInputStream instance. + * This method can be used to access the functionality of this class + * even when given just a normal input stream instance. + * <p> + * Use this method instead of the + * {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you + * <em>do</em> explicitly close the returned stream. The recommended + * access pattern is: + * <pre> + * try (TikaInputStream stream = TikaInputStream.get(...)) { + * // process stream + * } + * </pre> + * <p> + * The given stream instance will be closed along with any other resources + * associated with the returned TikaInputStream instance when the + * {@link #close()} method is called by the try-with-resources statement. + * + * @param stream normal input stream + * @return a TikaInputStream instance + */ + public static TikaInputStream get(InputStream stream, Metadata metadata) { + return get(stream, new TemporaryResources(), metadata); + } + /** * Returns the given stream casts to a TikaInputStream, or * <code>null</code> if the stream is not a TikaInputStream. @@ -827,6 +858,9 @@ public class TikaInputStream extends TaggedInputStream { @Override public void close() throws IOException { + if (closeShieldDepth > 0) { + return; + } path = null; mark = -1; @@ -853,6 +887,17 @@ public class TikaInputStream extends TaggedInputStream { } } + public void setCloseShield() { + this.closeShieldDepth++; + } + + public void removeCloseShield() { + this.closeShieldDepth--; + } + + public boolean isCloseShield() { + return closeShieldDepth > 0; + } @Override public String toString() { String str = "TikaInputStream of "; diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java index 3fd8d7d2c..25588c45b 100644 --- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java +++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java @@ -18,7 +18,6 @@ package org.apache.tika.renderer; import java.io.Closeable; import java.io.IOException; -import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; @@ -62,7 +61,7 @@ public class RenderResult implements Closeable { } } - public InputStream getInputStream() throws IOException { + public TikaInputStream getInputStream() throws IOException { if (result instanceof Path) { return TikaInputStream.get((Path)result, metadata); } else { diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java index 16458c9e9..84c9b7ab1 100644 --- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java +++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java @@ -19,7 +19,6 @@ package org.apache.tika.parser.mock; import static java.nio.charset.StandardCharsets.UTF_8; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.PrintStream; @@ -56,6 +55,7 @@ import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -267,7 +267,7 @@ public class MockParser implements Parser { if (!"".equals(contentType)) { m.set(Metadata.CONTENT_TYPE, contentType); } - try (InputStream is = new ByteArrayInputStream(embeddedText.getBytes(UTF_8))) { + try (TikaInputStream is = TikaInputStream.get(embeddedText.getBytes(UTF_8))) { extractor.parseEmbedded(is, new EmbeddedContentHandler(handler), m, true); } } diff --git a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java index 9f1425da8..68d136202 100644 --- a/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java +++ b/tika-example/src/main/java/org/apache/tika/example/ExtractEmbeddedFiles.java @@ -32,6 +32,7 @@ import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -73,7 +74,7 @@ public class ExtractEmbeddedFiles { } @Override - public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { + public void parseEmbedded(TikaInputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { //try to get the name of the embedded file from the metadata String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java index ac43b2985..d97ed1ba7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java @@ -26,7 +26,6 @@ import java.util.List; import java.util.Set; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.CloseShieldInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -35,6 +34,8 @@ import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.EndianUtils; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -99,8 +100,19 @@ public class AppleSingleFileParser implements Parser { // TODO: we should probably add a readlimiting wrapper around this // stream to ensure that not more than contentFieldInfo.length bytes // are read - ex.parseEmbedded(CloseShieldInputStream.wrap(stream), xhtml, embeddedMetadata, - true); + TikaInputStream tis = TikaInputStream.cast(stream); + TemporaryResources tmp = null; + if (tis == null) { + tmp = new TemporaryResources(); + tis = TikaInputStream.get(stream, tmp, embeddedMetadata); + } + try { + ex.parseEmbedded(tis, xhtml, embeddedMetadata, true); + } finally { + if (tmp != null) { + tmp.close(); + } + } } } xhtml.endDocument(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java index 2816efac1..ac65ff587 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/iwana/IWork13PackageParser.java @@ -36,13 +36,13 @@ import com.dd.plist.PropertyListParser; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.CloseShieldInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -128,9 +128,11 @@ public class IWork13PackageParser implements Parser { if (type == null) { type = IWork13DocumentType.detectIfPossible(entry); } - processZipEntry(entry, CloseShieldInputStream.wrap(zipStream), metadata, xhtml, - parseContext, - embeddedDocumentExtractor); + + try (TemporaryResources tmp = new TemporaryResources()) { + TikaInputStream tis = TikaInputStream.get(zipStream, tmp, new Metadata()); + processZipEntry(entry, tis, metadata, xhtml, parseContext, embeddedDocumentExtractor); + } entry = zipStream.getNextEntry(); } if (type == null) { @@ -153,8 +155,8 @@ public class IWork13PackageParser implements Parser { if (type == null) { type = IWork13DocumentType.detectIfPossible(entry); } - try (InputStream is = zipFile.getInputStream(entry)) { - processZipEntry(entry, is, metadata, xhtml, parseContext, embeddedDocumentExtractor); + try (TikaInputStream tis = TikaInputStream.get(zipFile.getInputStream(entry))) { + processZipEntry(entry, tis, metadata, xhtml, parseContext, embeddedDocumentExtractor); } catch (SecurityException e) { throw e; } catch (Exception e) { @@ -171,7 +173,7 @@ public class IWork13PackageParser implements Parser { } private void processZipEntry(ZipEntry entry, - InputStream inputStream, + TikaInputStream tis, Metadata metadata, XHTMLContentHandler xhtml, ParseContext parseContext, EmbeddedDocumentExtractor embeddedDocumentExtractor) @@ -181,18 +183,18 @@ public class IWork13PackageParser implements Parser { return; } if ("Metadata/Properties.plist".equals(streamName)) { - extractProperties(inputStream, metadata); + extractProperties(tis, metadata); } else if ("Metadata/BuildVersionHistory.plist".equals(streamName)) { - extractVersionHistory(inputStream, metadata); + extractVersionHistory(tis, metadata); } else if ("Metadata/DocumentIdentifier".equals(streamName)) { - extractDocumentIdentifier(inputStream, metadata); + extractDocumentIdentifier(tis, metadata); } else if ("preview.jpg".equals(streamName)) { //process thumbnail Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString()); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, streamName); - handleEmbedded(inputStream, embeddedMetadata, xhtml, embeddedDocumentExtractor); + handleEmbedded(tis, embeddedMetadata, xhtml, embeddedDocumentExtractor); } else if (streamName.equals("preview-micro.jpg") || streamName.equals("preview-web.jpg") || streamName.endsWith(".iwa")) { @@ -200,18 +202,18 @@ public class IWork13PackageParser implements Parser { } else { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, streamName); - handleEmbedded(inputStream, embeddedMetadata, xhtml, embeddedDocumentExtractor); + handleEmbedded(tis, embeddedMetadata, xhtml, embeddedDocumentExtractor); } } - private void handleEmbedded(InputStream inputStream, Metadata embeddedMetadata, + private void handleEmbedded(TikaInputStream tis, Metadata embeddedMetadata, XHTMLContentHandler xhtml, EmbeddedDocumentExtractor embeddedDocumentExtractor) throws IOException, SAXException { if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { - embeddedDocumentExtractor.parseEmbedded(inputStream, xhtml, embeddedMetadata, true); + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, embeddedMetadata, true); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java index c5a362464..2a0e4a0f9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java @@ -171,8 +171,8 @@ public class TSDParser implements Parser { try { cmsTimeStampedDataParser = new CMSTimeStampedDataParser(stream); - try (InputStream is = TikaInputStream.get(cmsTimeStampedDataParser.getContent())) { - edx.parseEmbedded(is, handler, metadata, true); + try (TikaInputStream tis = TikaInputStream.get(cmsTimeStampedDataParser.getContent())) { + edx.parseEmbedded(tis, handler, metadata, true); } } catch (SecurityException e) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java index ea4d0195f..b613f3939 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java @@ -17,7 +17,6 @@ package org.apache.tika.parser.html; import java.io.IOException; -import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; @@ -31,7 +30,6 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -39,6 +37,7 @@ import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.HTML; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -341,8 +340,8 @@ class HtmlHandler extends TextContentHandler { EmbeddedDocumentExtractor embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { - try (InputStream stream = UnsynchronizedByteArrayInputStream.builder().setByteArray(string.getBytes(StandardCharsets.UTF_8)).get()) { - embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true); + try (TikaInputStream tis = TikaInputStream.get(string.getBytes(StandardCharsets.UTF_8))) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); } @@ -368,8 +367,8 @@ class HtmlHandler extends TextContentHandler { EmbeddedDocumentExtractor embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { - try (InputStream stream = dataURIScheme.getInputStream()) { - embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true); + try (TikaInputStream tis = TikaInputStream.get(dataURIScheme.getInputStream())) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); } @@ -401,18 +400,17 @@ class HtmlHandler extends TextContentHandler { TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); dataUriMetadata.set(Metadata.CONTENT_TYPE, dataURIScheme.getMediaType().toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) { - try (InputStream dataURISchemeInputStream = dataURIScheme.getInputStream()) { + try (TikaInputStream tis = TikaInputStream.get(dataURIScheme.getInputStream())) { embeddedDocumentExtractor - .parseEmbedded(dataURISchemeInputStream, xhtml, dataUriMetadata, true); + .parseEmbedded(tis, xhtml, dataUriMetadata, true); } catch (IOException e) { //swallow } } } - try (InputStream stream = UnsynchronizedByteArrayInputStream.builder().setByteArray( - script.toString().getBytes(StandardCharsets.UTF_8)).get()) { - embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true); + try (TikaInputStream tis = TikaInputStream.get(script.toString().getBytes(StandardCharsets.UTF_8))) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); } catch (IOException e) { //shouldn't ever happen } finally { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java index 9df4d197d..3ed87ae0b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-jdbc-commons/src/main/java/org/apache/tika/parser/jdbc/JDBCTableReader.java @@ -32,7 +32,6 @@ import java.util.List; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -200,8 +199,9 @@ public class JDBCTableReader { //is there a more efficient way to go from a Reader to an InputStream? String s = clob.getSubString(0, readSize); if (embeddedDocumentUtil.shouldParseEmbedded(m)) { - embeddedDocumentUtil - .parseEmbedded(UnsynchronizedByteArrayInputStream.builder().setByteArray(s.getBytes(UTF_8)).get(), handler, m, true); + try (TikaInputStream tis = TikaInputStream.get(s.getBytes(UTF_8))) { + embeddedDocumentUtil.parseEmbedded(tis, handler, m, true); + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java index 7c5d266ca..9af23d004 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java @@ -26,7 +26,6 @@ import java.util.Map.Entry; import java.util.Stack; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream; import org.apache.james.mime4j.MimeException; import org.apache.james.mime4j.codec.DecodeMonitor; @@ -546,9 +545,10 @@ class MailContentHandler implements ContentHandler { inlineMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, MediaType.TEXT_PLAIN.toString()); } - parser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(part.bytes).get(), - new EmbeddedContentHandler(new BodyContentHandler(handler)), inlineMetadata, - parseContext); + try (TikaInputStream tis = TikaInputStream.get(part.bytes)) { + parser.parse(tis, + new EmbeddedContentHandler(new BodyContentHandler(handler)), inlineMetadata, parseContext); + } } catch (SAXException | TikaException e) { throw new MimeException(e); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java index 4c7bea74c..dddd9bd92 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mbox/MboxParser.java @@ -40,6 +40,7 @@ import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; @@ -130,11 +131,11 @@ public class MboxParser implements Parser { saveHeaderInMetadata(mailMetadata, item); } - InputStream messageStream = message.toInputStream(); + TikaInputStream msgStream = TikaInputStream.get(message.toInputStream()); message = null; if (extractor.shouldParseEmbedded(mailMetadata)) { - extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true); + extractor.parseEmbedded(msgStream, xhtml, mailMetadata, true); } if (tracking) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java index ae80cb62b..1ef4f1ee6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java @@ -69,11 +69,11 @@ public class EMFParser implements Parser { private static void handleEmbedded(byte[] data, EmbeddedDocumentExtractor embeddedDocumentExtractor, ContentHandler handler) throws TikaException, SAXException { - try (InputStream is = TikaInputStream.get(data)) { + try (TikaInputStream tis = TikaInputStream.get(data)) { Metadata embeddedMetadata = new Metadata(); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { embeddedDocumentExtractor - .parseEmbedded(is, new EmbeddedContentHandler(handler), embeddedMetadata, + .parseEmbedded(tis, new EmbeddedContentHandler(handler), embeddedMetadata, true); } } catch (IOException e) { @@ -204,9 +204,9 @@ public class EMFParser implements Parser { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(Metadata.CONTENT_TYPE, WMF_MEDIA_TYPE.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { - try (InputStream is = TikaInputStream.get(bytes)) { + try (TikaInputStream tis = TikaInputStream.get(bytes)) { embeddedDocumentExtractor - .parseEmbedded(is, new EmbeddedContentHandler(contentHandler), + .parseEmbedded(tis, new EmbeddedContentHandler(contentHandler), embeddedMetadata, true); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java index 409c34ca0..619ba601c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java @@ -41,7 +41,6 @@ import com.healthmarketscience.jackcess.Table; import com.healthmarketscience.jackcess.query.Query; import com.healthmarketscience.jackcess.util.OleBlob; import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.xml.sax.SAXException; @@ -220,9 +219,8 @@ class JackcessExtractor extends AbstractPOIFSExtractor { BodyContentHandler h = new BodyContentHandler(); Metadata m = new Metadata(); m.set(Metadata.CONTENT_TYPE, "text/html; charset=UTF-8"); - try { - htmlParser - .parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(v.getBytes(UTF_8)).get(), h, m, parseContext); + try (TikaInputStream tis = TikaInputStream.get(v.getBytes(UTF_8))) { + htmlParser.parse(tis, h, m, parseContext); handler.characters(h.toString()); } catch (SAXException e) { WriteLimitReachedException.throwIfWriteLimitReached(e); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java index 8fe685686..dade2ca5f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java @@ -32,7 +32,6 @@ import java.util.Set; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hpbf.extractor.PublisherTextExtractor; import org.apache.poi.poifs.crypt.Decryptor; @@ -118,7 +117,7 @@ public class OfficeParser extends AbstractOfficeParser { if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { embeddedDocumentExtractor.parseEmbedded( //pass in space character so that we don't trigger a zero-byte exception - UnsynchronizedByteArrayInputStream.builder().setByteArray(new byte[]{'\u0020'}).get(), xhtml, m, true); + TikaInputStream.get(new byte[]{'\u0020'}), xhtml, m, true); } return; } @@ -131,9 +130,9 @@ public class OfficeParser extends AbstractOfficeParser { m.set(TikaCoreProperties.RESOURCE_NAME_KEY, e.getKey()); } if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { - embeddedDocumentExtractor.parseEmbedded( - UnsynchronizedByteArrayInputStream.builder().setByteArray(e.getValue().getBytes(StandardCharsets.UTF_8)).get(), - xhtml, m, true); + try (TikaInputStream tis = TikaInputStream.get(e.getValue().getBytes(StandardCharsets.UTF_8))) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); + } } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java index 687724566..30e1ca14a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java @@ -468,9 +468,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { if (htmlParser == null) { htmlParser = new JSoupParser(); } - htmlParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(), - new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), - parseContext); + try (TikaInputStream tis = TikaInputStream.get(data)) { + htmlParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), parseContext); + } doneBody = true; } } @@ -488,8 +488,9 @@ public class OutlookExtractor extends AbstractPOIFSExtractor { if (rtfParser == null) { rtfParser = new RTFParser(); } - rtfParser.parseInline(UnsynchronizedByteArrayInputStream.builder().setByteArray(rtf.getData()).get(), - xhtml, new Metadata(), parseContext); + try (TikaInputStream tis = TikaInputStream.get(rtf.getData())) { + rtfParser.parseInline(tis, xhtml, new Metadata(), parseContext); + } doneBody = true; } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java index 0255a9161..06b2dd518 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmParser.java @@ -23,12 +23,12 @@ import java.util.Collections; import java.util.HashSet; import java.util.Set; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; @@ -94,12 +94,11 @@ public class ChmParser implements Parser { private void parsePage(byte[] byteObject, Parser htmlParser, ContentHandler xhtml, ParseContext context) throws TikaException, IOException, SAXException { // throws IOException - InputStream stream = null; Metadata metadata = new Metadata(); ContentHandler handler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));// -1 - stream = UnsynchronizedByteArrayInputStream.builder().setByteArray(byteObject).get(); - htmlParser.parse(stream, handler, metadata, context); - + try (TikaInputStream tis = TikaInputStream.get(byteObject)) { + htmlParser.parse(tis, handler, metadata, context); + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java index a12806472..d9cb6b9d2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java @@ -17,7 +17,6 @@ package org.apache.tika.parser.microsoft.libpst; import java.io.IOException; -import java.io.InputStream; import java.nio.file.FileVisitResult; import java.nio.file.FileVisitor; import java.nio.file.Path; @@ -78,9 +77,9 @@ public class EmailVisitor implements FileVisitor<Path> { .relativize(file.getParent()) .toString(); emailMetadata.set(PST.PST_FOLDER_PATH, pstPath); - try (InputStream is = TikaInputStream.get(file)) { + try (TikaInputStream tis = TikaInputStream.get(file)) { try { - embeddedDocumentExtractor.parseEmbedded(is, xhtml, emailMetadata, true); + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, emailMetadata, true); } catch (SAXException e) { throw new IOException(e); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java index 933f3e99e..2e3d2d914 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSExtractorDecorator.java @@ -41,6 +41,7 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor; @@ -69,7 +70,7 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor { } } - private static InputStream getZipStream(String zipPath, ZipPackage zipPackage) + private static TikaInputStream getZipStream(String zipPath, ZipPackage zipPackage) throws IOException, TikaException { String targPath = (zipPath.length() > 1 && zipPath.startsWith("/") ? zipPath.substring(1) : zipPath); @@ -86,7 +87,7 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor { if (zipEntry == null) { throw new TikaException("Couldn't find required zip entry: " + zipPath); } - return zipEntrySource.getInputStream(zipEntry); + return TikaInputStream.get(zipEntrySource.getInputStream(zipEntry)); } @Override @@ -130,9 +131,9 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor { private void handleEmbeddedImage(String zipPath, Metadata metadata, EmbeddedDocumentUtil embeddedDocumentUtil, XHTMLContentHandler xhtml) throws SAXException, IOException { - InputStream stream = null; + TikaInputStream tis = null; try { - stream = getZipStream(zipPath, pkg); + tis = getZipStream(zipPath, pkg); } catch (IOException | TikaException e) { //store this exception in the parent's metadata EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); @@ -140,9 +141,9 @@ public class XPSExtractorDecorator extends AbstractOOXMLExtractor { } try { - embeddedDocumentUtil.parseEmbedded(stream, xhtml, metadata, true); + embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true); } finally { - IOUtils.closeQuietly(stream); + IOUtils.closeQuietly(tis); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index 4b21e5141..13c23a690 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -27,7 +27,6 @@ import com.pff.PSTAttachment; import com.pff.PSTException; import com.pff.PSTMessage; import com.pff.PSTRecipient; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -115,8 +114,9 @@ public class PSTMailItemParser implements Parser { metadata, context); } else { byte[] data = htmlChunk.getBytes(StandardCharsets.UTF_8); - htmlParser.parse(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(), - new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), context); + try (TikaInputStream tis = TikaInputStream.get(data)) { + htmlParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), new Metadata(), context); + } } return; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java index 3bd4b92de..4688cda1a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java @@ -17,7 +17,6 @@ package org.apache.tika.parser.odf; import java.io.IOException; -import java.io.InputStream; import java.nio.charset.StandardCharsets; import org.apache.commons.lang3.StringUtils; @@ -108,9 +107,9 @@ class FlatOpenDocumentMacroHandler extends ContentHandlerDecorator { TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { - try (InputStream is = TikaInputStream.get(bytes)) { + try (TikaInputStream tis = TikaInputStream.get(bytes)) { embeddedDocumentExtractor - .parseEmbedded(is, contentHandler, embeddedMetadata, true); + .parseEmbedded(tis, contentHandler, embeddedMetadata, true); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java index 94b1d86fb..0fbe29a74 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java @@ -19,7 +19,6 @@ package org.apache.tika.parser.odf; import static org.apache.tika.sax.XHTMLContentHandler.XHTML; import java.io.IOException; -import java.io.InputStream; import java.util.BitSet; import java.util.HashMap; import java.util.Map; @@ -529,8 +528,8 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { } Metadata embeddedMetadata = new Metadata(); if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { - try (InputStream is = TikaInputStream.get(bytes)) { - embeddedDocumentExtractor.parseEmbedded(is, handler, embeddedMetadata, true); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + embeddedDocumentExtractor.parseEmbedded(tis, handler, embeddedMetadata, true); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java index c3e6bb7e8..de47f2394 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java @@ -263,9 +263,9 @@ class AbstractPDF2XHTML extends PDFTextStripper { if (supportedTypes.contains(XMP_MEDIA_TYPE)) { //try the main metadata if (pdfDocument.getDocumentCatalog().getMetadata() != null) { - try (InputStream is = pdfDocument.getDocumentCatalog().getMetadata() - .exportXMPMetadata()) { - extractXMPAsEmbeddedFile(is, XMP_DOCUMENT_CATALOG_LOCATION); + try (TikaInputStream tis = TikaInputStream.get( + pdfDocument.getDocumentCatalog().getMetadata().exportXMPMetadata())) { + extractXMPAsEmbeddedFile(tis, XMP_DOCUMENT_CATALOG_LOCATION); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); } @@ -274,8 +274,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { int pageNumber = 1; for (PDPage page : pdfDocument.getPages()) { if (page.getMetadata() != null) { - try (InputStream is = page.getMetadata().exportXMPMetadata()) { - extractXMPAsEmbeddedFile(is, XMP_PAGE_LOCATION_PREFIX + pageNumber); + try (TikaInputStream tis = TikaInputStream.get(page.getMetadata().exportXMPMetadata())) { + extractXMPAsEmbeddedFile(tis, XMP_PAGE_LOCATION_PREFIX + pageNumber); } catch (IOException e) { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); } @@ -301,17 +301,17 @@ class AbstractPDF2XHTML extends PDFTextStripper { EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata); } if (bytes != null) { - try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) { - parseMetadata(is, xfaMetadata); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + parseMetadata(tis, xfaMetadata); } } } } } - private void extractXMPAsEmbeddedFile(InputStream is, String location) + private void extractXMPAsEmbeddedFile(TikaInputStream tis, String location) throws IOException, SAXException { - if (is == null) { + if (tis == null) { return; } Metadata xmpMetadata = new Metadata(); @@ -320,19 +320,15 @@ class AbstractPDF2XHTML extends PDFTextStripper { TikaCoreProperties.EmbeddedResourceType.METADATA.toString()); xmpMetadata.set(PDF.XMP_LOCATION, location); if (embeddedDocumentExtractor.shouldParseEmbedded(xmpMetadata)) { - try { - parseMetadata(is, xmpMetadata); - } finally { - IOUtils.closeQuietly(is); - } + parseMetadata(tis, xmpMetadata); } } - private void parseMetadata(InputStream stream, Metadata embeddedMetadata) + private void parseMetadata(TikaInputStream tis, Metadata embeddedMetadata) throws IOException, SAXException { try { - embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml), + embeddedDocumentExtractor.parseEmbedded(tis, new EmbeddedContentHandler(xhtml), embeddedMetadata, true); } catch (IOException e) { handleCatchableIOE(e); @@ -557,10 +553,10 @@ class AbstractPDF2XHTML extends PDFTextStripper { try (TemporaryResources tmp = new TemporaryResources()) { try (RenderResult renderResult = renderCurrentPage(pdPage, context, tmp)) { Metadata renderMetadata = renderResult.getMetadata(); - try (InputStream is = renderResult.getInputStream()) { + try (TikaInputStream tis = renderResult.getInputStream()) { renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrImageMediaType.toString()); - ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), + ocrParser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), renderMetadata, context); } } @@ -966,8 +962,8 @@ class AbstractPDF2XHTML extends PDFTextStripper { String js = jsAction.getAction(); js = (js == null) ? "" : js; if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { - try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) { - embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, true); + try (TikaInputStream tis = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, m, true); } } addNonNullAttribute("class", "javascript", attributes); @@ -1105,7 +1101,7 @@ class AbstractPDF2XHTML extends PDFTextStripper { updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.VERSION.toString()); if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata)) { - try (InputStream tis = TikaInputStream.get(update)) { + try (TikaInputStream tis = TikaInputStream.get(update)) { context.set(IsIncrementalUpdate.class, IsIncrementalUpdate.IS_INCREMENTAL_UPDATE); embeddedDocumentExtractor.parseEmbedded(tis, xhtml, updateMetadata, false); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java index 70d17a8b3..c483c4090 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java @@ -17,7 +17,6 @@ package org.apache.tika.parser.pdf; import java.io.IOException; -import java.io.InputStream; import java.io.Writer; import java.util.HashMap; import java.util.HashSet; @@ -175,9 +174,9 @@ class PDF2XHTML extends AbstractPDF2XHTML { if (result.getStatus() == RenderResult.STATUS.SUCCESS) { if (embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) { - try (InputStream is = result.getInputStream()) { + try (TikaInputStream resultInputStream = result.getInputStream()) { //TODO: add markup here? - embeddedDocumentExtractor.parseEmbedded(is, xhtml, + embeddedDocumentExtractor.parseEmbedded(resultInputStream, xhtml, result.getMetadata(), true); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index ce2cb398f..0d92ee520 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -432,8 +432,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable { for (RenderResult result : renderResults.getResults()) { if (result.getStatus() == RenderResult.STATUS.SUCCESS) { if (embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) { - try (InputStream is = result.getInputStream()) { - embeddedDocumentExtractor.parseEmbedded(is, xhtml, result.getMetadata(), + try (TikaInputStream tis = result.getInputStream()) { + embeddedDocumentExtractor.parseEmbedded(tis, xhtml, result.getMetadata(), false); } catch (SecurityException e) { throw e; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java index 08d18b6c1..526f27731 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java @@ -115,14 +115,13 @@ public class PDFRenderingTest extends TikaTest { super(context); } - public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata, + @Override + public void parseEmbedded(TikaInputStream tis, ContentHandler handler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { - TikaInputStream tstream = TikaInputStream.get(stream); - byte[] bytes = Files.readAllBytes(tstream.getPath()); + + byte[] bytes = Files.readAllBytes(tis.getPath()); embedded.put(count++, bytes); - try (InputStream is = Files.newInputStream(tstream.getPath())) { - super.parseEmbedded(is, handler, metadata, outputHtml); - } + super.parseEmbedded(tis, handler, metadata, outputHtml); } public Map<Integer, byte[]> getEmbedded() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 11ecf902e..48651cc7b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -65,6 +65,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -228,7 +229,9 @@ public class CompressorParser implements Parser { EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (extractor.shouldParseEmbedded(entrydata)) { - extractor.parseEmbedded(cis, xhtml, entrydata, true); + try (TikaInputStream tis = TikaInputStream.get(cis)) { + extractor.parseEmbedded(tis, xhtml, entrydata, true); + } } } finally { cis.close(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java index 871f29388..11a6e52a4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java @@ -86,9 +86,9 @@ public class RarParser implements Parser { Metadata entrydata = PackageParser.handleEntryMetadata(header.getFileName(), header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml); - try (InputStream subFile = rar.getInputStream(header)) { + try (TikaInputStream rarTis = TikaInputStream.get(rar.getInputStream(header))) { if (extractor.shouldParseEmbedded(entrydata)) { - extractor.parseEmbedded(subFile, handler, entrydata, true); + extractor.parseEmbedded(rarTis, handler, entrydata, true); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java index 20ee89dd7..f4005b526 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UnrarParser.java @@ -132,8 +132,8 @@ public class UnrarParser implements Parser { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fName); metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, relPath); if (extractor.shouldParseEmbedded(metadata)) { - try (InputStream is = TikaInputStream.get(embeddedFile)) { - extractor.parseEmbedded(is, xhtml, metadata, true); + try (TikaInputStream tis = TikaInputStream.get(embeddedFile)) { + extractor.parseEmbedded(tis, xhtml, metadata, true); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java index 440291aee..3bfa09a8f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/http/HttpParser.java @@ -35,6 +35,7 @@ import org.xml.sax.SAXException; import org.apache.tika.exception.TikaException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; @@ -79,8 +80,8 @@ public class HttpParser implements Parser { if (contentLength > 0) { MessageBody messageBody = LengthedBody.create(channel, buffer, contentLength); Metadata payloadMetadata = new Metadata(); - try (InputStream messageStream = messageBody.stream()) { - parsePayload(messageStream, xhtml, payloadMetadata, context); + try (TikaInputStream tis = TikaInputStream.get(messageBody.stream())) { + parsePayload(tis, xhtml, payloadMetadata, context); } } } finally { @@ -88,11 +89,11 @@ public class HttpParser implements Parser { } } - private void parsePayload(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException { + private void parsePayload(TikaInputStream tis, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException { EmbeddedDocumentExtractor ex = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); if (ex.shouldParseEmbedded(metadata)) { - ex.parseEmbedded(stream, handler, metadata, true); + ex.parseEmbedded(tis, handler, metadata, true); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java index e78511f2b..fcf4042df 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java @@ -116,9 +116,9 @@ public class WACZParser implements Parser { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(zae.getSize())); - try (InputStream inputStream = getMaybeGzipInputStream(TikaInputStream.get(zais))) { + try (TikaInputStream tis = TikaInputStream.get(getMaybeGzipInputStream(TikaInputStream.get(zais)))) { if (ex.shouldParseEmbedded(metadata)) { - ex.parseEmbedded(inputStream, xhtml, metadata, true); + ex.parseEmbedded(tis, xhtml, metadata, true); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java index ad4894b54..9aa1e2f2f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java @@ -146,7 +146,7 @@ public class WARCParser implements Parser { if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) { //TODO check Content-Encoding on the warcResponse.http.headers and wrap the stream. //May need to sniff first few bytes to confirm accuracy, e.g. gzip compression ? - try (InputStream tis = TikaInputStream.get(payload.body().stream())) { + try (TikaInputStream tis = TikaInputStream.get(payload.body().stream())) { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata, true); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java index 4e7f0dad7..094db7cc9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-xml-module/src/main/java/org/apache/tika/parser/xml/FictionBookParser.java @@ -17,12 +17,10 @@ package org.apache.tika.parser.xml; import java.io.IOException; -import java.io.InputStream; import java.util.Collections; import java.util.Set; import org.apache.commons.codec.binary.Base64; -import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -30,6 +28,7 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -86,10 +85,9 @@ public class FictionBookParser extends XMLParser { @Override public void endElement(String uri, String localName, String qName) throws SAXException { if (binaryMode) { - try (InputStream stream = - UnsynchronizedByteArrayInputStream.builder().setByteArray(Base64.decodeBase64(binaryData.toString())).get()) { + try (TikaInputStream tis = TikaInputStream.get(Base64.decodeBase64(binaryData.toString()))) { partExtractor.parseEmbedded( - stream, handler, metadata, true); + tis, handler, metadata, true); } catch (IOException e) { throw new SAXException("IOException in parseEmbedded", e); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java index 9f9f71357..de112e5ac 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java @@ -29,6 +29,7 @@ import org.junit.jupiter.api.Test; import org.xml.sax.ContentHandler; import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; @@ -134,7 +135,7 @@ public class ZipParserTest extends AbstractPkgTest { return false; } - public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, + public void parseEmbedded(TikaInputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) { throw new UnsupportedOperationException("should never be called"); } diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java index 7fb362300..a2e3064d6 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java @@ -58,6 +58,7 @@ import org.apache.tika.extractor.DefaultEmbeddedStreamTranslator; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedStreamTranslator; import org.apache.tika.io.BoundedInputStream; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MimeTypeException; @@ -186,12 +187,13 @@ public class UnpackerResource { return true; } - public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean b) throws SAXException, IOException { + @Override + public void parseEmbedded(TikaInputStream tis, ContentHandler contentHandler, Metadata metadata, boolean b) throws SAXException, IOException { UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream .builder() .get(); - BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, inputStream); + BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, tis); IOUtils.copy(bis, bos); if (bis.hasHitBound()) { throw new IOException(new TikaMemoryLimitException( @@ -222,7 +224,7 @@ public class UnpackerResource { LOG.warn("Unexpected MimeTypeException", e); } } - try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get()) { + try (TikaInputStream is = TikaInputStream.get(data)) { if (embeddedStreamTranslator.shouldTranslate(is, metadata)) { InputStream translated = embeddedStreamTranslator.translate(UnsynchronizedByteArrayInputStream.builder().setByteArray(data).get(), metadata); UnsynchronizedByteArrayOutputStream bos2 = UnsynchronizedByteArrayOutputStream
