This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-2849 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1e08fb1e54653954a71c71a68218bcdf7a7a3afb Author: TALLISON <[email protected]> AuthorDate: Fri Apr 12 17:11:03 2019 -0400 TIKA-2849 -- move to streaming detection of zip files if TikaInputStream doesn't already have a file. --- .../org/apache/tika/io/BoundedInputStream.java | 118 ++++++++ .../tika/parser/digest/InputStreamDigester.java | 103 +------ .../src/test/java/org/apache/tika/TikaTest.java | 2 +- .../org/apache/tika/parser/epub/EpubParser.java | 12 +- .../tika/parser/iwork/IWorkPackageParser.java | 2 +- .../microsoft/ooxml/OOXMLExtractorFactory.java | 37 ++- .../parser/pkg/AbstractZipContainerDetector.java | 163 +++++++++++ .../parser/pkg/StreamingZipContainerDetector.java | 213 ++++++++++++++ .../tika/parser/pkg/ZipContainerDetector.java | 325 ++++++--------------- .../org/apache/tika/parser/utils/ZipSalvager.java | 75 ++--- .../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 1 + .../tika/parser/pkg/ZipContainerDetectorTest.java | 166 ++++++++++- .../org/apache/tika/parser/pkg/tika-config.xml | 31 ++ 13 files changed, 857 insertions(+), 391 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java new file mode 100644 index 0000000..dabedf5 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.io; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Very slight modification of Commons' BoundedInputStream + * so that we can figure out if this hit the bound or not. + */ +public class BoundedInputStream extends InputStream { + + + private final static int EOF = -1; + private final long max; + private final InputStream in; + private long pos; + + public BoundedInputStream(long max, InputStream in) { + this.max = max; + this.in = in; + } + + @Override + public int read() throws IOException { + if (max >= 0 && pos >= max) { + return EOF; + } + final int result = in.read(); + pos++; + return result; + } + + /** + * Invokes the delegate's <code>read(byte[])</code> method. + * + * @param b the buffer to read the bytes into + * @return the number of bytes read or -1 if the end of stream or + * the limit has been reached. + * @throws IOException if an I/O error occurs + */ + @Override + public int read(final byte[] b) throws IOException { + return this.read(b, 0, b.length); + } + + /** + * Invokes the delegate's <code>read(byte[], int, int)</code> method. + * + * @param b the buffer to read the bytes into + * @param off The start offset + * @param len The number of bytes to read + * @return the number of bytes read or -1 if the end of stream or + * the limit has been reached. + * @throws IOException if an I/O error occurs + */ + @Override + public int read(final byte[] b, final int off, final int len) throws IOException { + if (max >= 0 && pos >= max) { + return EOF; + } + final long maxRead = max >= 0 ? Math.min(len, max - pos) : len; + final int bytesRead = in.read(b, off, (int) maxRead); + + if (bytesRead == EOF) { + return EOF; + } + + pos += bytesRead; + return bytesRead; + } + + /** + * Invokes the delegate's <code>skip(long)</code> method. + * + * @param n the number of bytes to skip + * @return the actual number of bytes skipped + * @throws IOException if an I/O error occurs + */ + @Override + public long skip(final long n) throws IOException { + final long toSkip = max >= 0 ? Math.min(n, max - pos) : n; + final long skippedBytes = in.skip(toSkip); + pos += skippedBytes; + return skippedBytes; + } + + @Override + public void reset() throws IOException { + in.reset(); + pos = 0; + } + + @Override + public void mark(int readLimit) { + in.mark(readLimit); + } + + public boolean hasHitBound() { + return pos >= max; + } +} + diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java index a208fab..3d3ff17 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java +++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java @@ -26,6 +26,7 @@ import java.security.NoSuchAlgorithmException; import java.security.Provider; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.IOExceptionWithCause; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; @@ -126,7 +127,7 @@ public class InputStreamDigester implements DigestingParser.Digester { //try the usual mark/reset stuff. //however, if you actually hit the bound, //then stop and spool to file via TikaInputStream - SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is); + BoundedInputStream bis = new BoundedInputStream(markLimit, is); boolean finishedStream = false; bis.mark(markLimit + 1); finishedStream = digestStream(bis, metadata); @@ -153,7 +154,6 @@ public class InputStreamDigester implements DigestingParser.Digester { } } - private String getMetadataKey() { return TikaCoreProperties.TIKA_META_PREFIX + "digest" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + @@ -179,8 +179,8 @@ public class InputStreamDigester implements DigestingParser.Digester { updateDigest(messageDigest, is); digestBytes = messageDigest.digest(); - if (is instanceof SimpleBoundedInputStream) { - if (((SimpleBoundedInputStream) is).hasHitBound()) { + if (is instanceof BoundedInputStream) { + if (((BoundedInputStream) is).hasHitBound()) { return false; } } @@ -202,99 +202,4 @@ public class InputStreamDigester implements DigestingParser.Digester { return digest; } - - /** - * Very slight modification of Commons' BoundedInputStream - * so that we can figure out if this hit the bound or not. - */ - private static class SimpleBoundedInputStream extends InputStream { - private final static int EOF = -1; - private final long max; - private final InputStream in; - private long pos; - - private SimpleBoundedInputStream(long max, InputStream in) { - this.max = max; - this.in = in; - } - - @Override - public int read() throws IOException { - if (max >= 0 && pos >= max) { - return EOF; - } - final int result = in.read(); - pos++; - return result; - } - - /** - * Invokes the delegate's <code>read(byte[])</code> method. - * - * @param b the buffer to read the bytes into - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. - * @throws IOException if an I/O error occurs - */ - @Override - public int read(final byte[] b) throws IOException { - return this.read(b, 0, b.length); - } - - /** - * Invokes the delegate's <code>read(byte[], int, int)</code> method. - * - * @param b the buffer to read the bytes into - * @param off The start offset - * @param len The number of bytes to read - * @return the number of bytes read or -1 if the end of stream or - * the limit has been reached. - * @throws IOException if an I/O error occurs - */ - @Override - public int read(final byte[] b, final int off, final int len) throws IOException { - if (max >= 0 && pos >= max) { - return EOF; - } - final long maxRead = max >= 0 ? Math.min(len, max - pos) : len; - final int bytesRead = in.read(b, off, (int) maxRead); - - if (bytesRead == EOF) { - return EOF; - } - - pos += bytesRead; - return bytesRead; - } - - /** - * Invokes the delegate's <code>skip(long)</code> method. - * - * @param n the number of bytes to skip - * @return the actual number of bytes skipped - * @throws IOException if an I/O error occurs - */ - @Override - public long skip(final long n) throws IOException { - final long toSkip = max >= 0 ? Math.min(n, max - pos) : n; - final long skippedBytes = in.skip(toSkip); - pos += skippedBytes; - return skippedBytes; - } - - @Override - public void reset() throws IOException { - in.reset(); - pos = 0; - } - - @Override - public void mark(int readLimit) { - in.mark(readLimit); - } - - public boolean hasHitBound() { - return pos >= max; - } - } } diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 00d8600..0aaaf35 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -394,7 +394,7 @@ public abstract class TikaTest { IOUtils.copy(is, bos); } if (truncatedLength > bos.toByteArray().length) { - throw new EOFException("Can't truncate beyond file length"); + throw new EOFException("Can't truncate beyond file length: "+bos.toByteArray().length); } byte[] truncated = new byte[truncatedLength]; System.arraycopy(bos.toByteArray(), 0, truncated, 0, truncatedLength); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java index df5b221..49019b6 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -175,6 +175,12 @@ public class EpubParser extends AbstractParser { TemporaryResources temporaryResources = null; if (TikaInputStream.isTikaInputStream(stream)) { tis = TikaInputStream.cast(stream); + if (tis.getOpenContainer() instanceof ZipFile) { + bufferedParseZipFile( + (ZipFile)tis.getOpenContainer(), + bodyHandler, xhtml, metadata, context, true); + return; + } } else { temporaryResources = new TemporaryResources(); tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources); @@ -192,7 +198,11 @@ public class EpubParser extends AbstractParser { tis.close(); } } - bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true); + try { + bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true); + } finally { + zipFile.close(); + } } private void trySalvage(Path brokenZip, ContentHandler bodyHandler, diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java index 5d8f01a..2ffbf56 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java @@ -119,7 +119,7 @@ public class IWorkPackageParser extends AbstractParser { return detectType(zip); } - private static IWORKDocumentType detectType(InputStream stream) { + public static IWORKDocumentType detectType(InputStream stream) { QName qname = new XmlRootExtractor().extractRootElement(stream); if (qname != null) { String uri = qname.getNamespaceURI(); diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 017469b..c37d895 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -16,6 +16,7 @@ */ package org.apache.tika.parser.microsoft.ooxml; +import java.io.EOFException; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -55,6 +56,7 @@ import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtra import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; import org.apache.tika.parser.pkg.ZipContainerDetector; import org.apache.tika.parser.utils.ZipSalvager; +import org.apache.tika.utils.RereadableInputStream; import org.apache.xmlbeans.XmlException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -68,6 +70,7 @@ import org.xml.sax.SAXException; public class OOXMLExtractorFactory { private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class); + private static final int MAX_BUFFER_LENGTH = 1000000; public static void parse( InputStream stream, ContentHandler baseHandler, @@ -98,14 +101,36 @@ public class OOXMLExtractorFactory { } tis.setOpenContainer(pkg); } else { - InputStream shield = new CloseShieldInputStream(stream); - pkg = OPCPackage.open(shield); + //OPCPackage slurps rris into memory so we can close rris + //without apparent problems + try (RereadableInputStream rereadableInputStream = + new RereadableInputStream(stream, MAX_BUFFER_LENGTH, + true, false)) { + try { + pkg = OPCPackage.open(rereadableInputStream); + } catch (EOFException e) { + rereadableInputStream.rewind(); + tmpRepairedCopy = File.createTempFile("tika-ooxml-repair", ""); + ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy); + pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ); + } + } + } + + MediaType type = null; + String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE); + if (mediaTypeString != null) { + type = MediaType.parse(mediaTypeString); + } + if (type != null && OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { + // Not a supported type, delegate to Empty Parser + EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context); + return; } - // Get the type, and ensure it's one we handle - MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg); - if (type == null) { - type = ZipContainerDetector.detectXPSOPC(pkg); + if (type == null || ! OOXMLParser.SUPPORTED_TYPES.contains(type)) { + // Get the type, and ensure it's one we handle + type = ZipContainerDetector.detectOfficeOpenXML(pkg); } if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) { diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java new file mode 100644 index 0000000..beadf74 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; +import java.util.regex.Pattern; + +import org.apache.tika.mime.MediaType; + +abstract class AbstractZipContainerDetector { + + + static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml"); + static final MediaType DOCX = + MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); + static final MediaType DOCM = + MediaType.application("vnd.ms-word.document.macroEnabled.12"); + static final MediaType DOTX = + MediaType.application("vnd.ms-word.document.macroEnabled.12"); + static final MediaType PPTX = + MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"); + + static final MediaType PPSM = + MediaType.application("vnd.ms-powerpoint.slideshow.macroEnabled.12"); + static final MediaType PPSX = + MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow"); + static final MediaType PPTM = + MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12"); + static final MediaType POTM = + MediaType.application("vnd.ms-powerpoint.template.macroenabled.12"); + static final MediaType POTX = + MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"); + static final MediaType THMX = + MediaType.application("vnd.openxmlformats-officedocument"); + static final MediaType XLSB = + MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12"); + static final MediaType XLSX = + MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"); + static final MediaType XLSM = + MediaType.application("vnd.ms-excel.sheet.macroEnabled.12"); + static final MediaType XPS = + MediaType.application("vnd.ms-xpsdocument"); + + static final Set<String> OOXML_HINTS = fillSet( + "word/document.xml", + "_rels/.rels", + "[Content_Types].xml", + "ppt/presentation.xml", + "ppt/slides/slide1.xml", + "xl/workbook.xml", + "xl/sharedStrings.xml", + "xl/worksheets/sheet1.xml" + ); + + private static Set<String> fillSet(String ... args) { + Set<String> tmp = new HashSet<>(); + for (String arg : args) { + tmp.add(arg); + } + return Collections.unmodifiableSet(tmp); + } + + static MediaType detectJar(Set<String> entryNames) { + if (entryNames.contains("META-INF/MANIFEST.MF")) { + // It's a Jar file, or something based on Jar + + // Is it an Android APK? + if (entryNames.contains("AndroidManifest.xml")) { + return MediaType.application("vnd.android.package-archive"); + } + + // Check for WAR and EAR + if (entryNames.contains("WEB-INF/")) { + return MediaType.application("x-tika-java-web-archive"); + } + if (entryNames.contains("META-INF/application.xml")) { + return MediaType.application("x-tika-java-enterprise-archive"); + } + + // Looks like a regular Jar Archive + return MediaType.application("java-archive"); + } else { + // Some Android APKs miss the default Manifest + if (entryNames.contains("AndroidManifest.xml")) { + return MediaType.application("vnd.android.package-archive"); + } + + return null; + } + } + + static MediaType detectKmz(Set<String> entryFileNames) { + //look for a single kml at the main level + boolean kmlFound = false; + for (String entryFileName : entryFileNames) { + if (entryFileName.indexOf('/') != -1 + || entryFileName.indexOf('\\') != -1) { + continue; + } + if (entryFileName.endsWith(".kml") && !kmlFound) { + kmlFound = true; + } else { + return null; + } + } + if (kmlFound) { + return MediaType.application("vnd.google-earth.kmz"); + } + return null; + } + + /** + * To be considered as an IPA file, it needs to match all of these + */ + private static HashSet<Pattern> ipaEntryPatterns = new HashSet<Pattern>() { + private static final long serialVersionUID = 6545295886322115362L; + { + add(Pattern.compile("^Payload/$")); + add(Pattern.compile("^Payload/.*\\.app/$")); + add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/$")); + add(Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$")); + add(Pattern.compile("^Payload/.*\\.app/Info\\.plist$")); + add(Pattern.compile("^Payload/.*\\.app/PkgInfo$")); + }}; + @SuppressWarnings("unchecked") + static MediaType detectIpa(Set<String> entryNames) { + // Note - consider generalising this logic, if another format needs many regexp matching + Set<Pattern> tmpPatterns = (Set<Pattern>)ipaEntryPatterns.clone(); + + for (String entryName : entryNames) { + Iterator<Pattern> ip = tmpPatterns.iterator(); + while (ip.hasNext()) { + if (ip.next().matcher(entryName).matches()) { + ip.remove(); + } + } + if (tmpPatterns.isEmpty()) { + // We've found everything we need to find + return MediaType.application("x-itunes-ipa"); + } + + } + return null; + } + +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java new file mode 100644 index 0000000..4d54b61 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.pkg; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.InputStream; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; +import org.apache.commons.io.IOUtils; +import org.apache.poi.xslf.usermodel.XSLFRelation; +import org.apache.poi.xssf.usermodel.XSSFRelation; +import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.iwork.IWorkPackageParser; +import org.apache.tika.utils.XMLReaderUtils; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +class StreamingZipContainerDetector extends AbstractZipContainerDetector { + + static Map<String, MediaType> OOXML_CONTENT_TYPES = new ConcurrentHashMap<>(); + static { + OOXML_CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX); + OOXML_CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM); + OOXML_CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX); + + OOXML_CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX); + OOXML_CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM); + OOXML_CONTENT_TYPES.put(XSSFRelation.XLSB_BINARY_WORKBOOK.getContentType(), XLSB); + OOXML_CONTENT_TYPES.put(XSLFRelation.MAIN.getContentType(), PPTX); + OOXML_CONTENT_TYPES.put(XSLFRelation.MACRO.getContentType(), PPSM); + OOXML_CONTENT_TYPES.put(XSLFRelation.MACRO_TEMPLATE.getContentType(), POTM); + OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), PPTM); + OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPSX); + OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM); + OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX); + OOXML_CONTENT_TYPES.put(XSLFRelation.THEME_MANAGER.getContentType(), THMX); + OOXML_CONTENT_TYPES.put("application/vnd.ms-package.xps-fixeddocumentsequence+xml", XPS); + } + + static MediaType detect(InputStream is) { + + Set<String> fileNames = new HashSet<>(); + Set<String> directoryNames = new HashSet<>(); + try { + ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is); + ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry(); + while (zae != null) { + String name = zae.getName(); + if (zae.isDirectory()) { + directoryNames.add(name); + zae = zipArchiveInputStream.getNextZipEntry(); + continue; + } + fileNames.add(name); + //we could also parse _rel/.rels, but if + // there isn't a valid content_types, then POI + //will throw an exception...Better to backoff to PKG + //than correctly identify a truncated + if (name.equals("[Content_Types].xml")) { + MediaType mt = parseOOXMLContentTypes(zipArchiveInputStream); + if (mt != null) { + return mt; + } + return TIKA_OOXML; + } else if (IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) { + IWorkPackageParser.IWORKDocumentType type = IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream); + if (type != null) { + return type.getType(); + } + } else if (name.equals("mimetype")) { + //odt + return MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8)); + } + zae = zipArchiveInputStream.getNextZipEntry(); + } + } catch (SecurityException e) { + throw e; + } catch (Exception e) { + //swallow + } + Set<String> entryNames = new HashSet<>(fileNames); + entryNames.addAll(fileNames); + MediaType mt = detectKmz(fileNames); + if (mt != null) { + return mt; + } + mt = detectJar(entryNames); + if (mt != null) { + return mt; + } + mt = detectIpa(entryNames); + if (mt != null) { + return mt; + } + mt = detectIWorks(entryNames); + if (mt != null) { + return mt; + } + int hits = 0; + for (String s : OOXML_HINTS) { + if (entryNames.contains(s)) { + hits++; + } + } + if (hits > 2) { + return TIKA_OOXML; + } + return MediaType.APPLICATION_ZIP; + } + + private static MediaType detectIWorks(Set<String> entryNames) { + //general iworks + if (entryNames.contains(IWorkPackageParser.IWORK_COMMON_ENTRY)) { + return MediaType.application("vnd.apple.iwork"); + } + return null; + } + + + public static Set<String> parseOOXMLRels(InputStream is) { + RelsHandler relsHandler = new RelsHandler(); + try { + XMLReaderUtils.parseSAX(is, relsHandler, new ParseContext()); + } catch (SecurityException e) { + throw e; + } catch (Exception e) { + + } + return relsHandler.rels; + } + + private static class RelsHandler extends DefaultHandler { + Set<String> rels = new HashSet<>(); + private MediaType mediaType = null; + @Override + public void startElement(String uri, String localName, + String name, Attributes attrs) throws SAXException { + for (int i = 0; i < attrs.getLength(); i++) { + String attrName = attrs.getLocalName(i); + if (attrName.equals("Type")) { + String contentType = attrs.getValue(i); + rels.add(contentType); + if (OOXML_CONTENT_TYPES.containsKey(contentType)) { + mediaType = OOXML_CONTENT_TYPES.get(contentType); + } + } + } + } + } + + public static MediaType parseOOXMLContentTypes(InputStream is) { + ContentTypeHandler contentTypeHandler = new ContentTypeHandler(); + try { + XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext()); + } catch (SecurityException e) { + throw e; + } catch (Exception e) { + + } + return contentTypeHandler.mediaType; + } + + + + + private static class ContentTypeHandler extends DefaultHandler { + + private MediaType mediaType = null; + + @Override + public void startElement(String uri, String localName, + String name, Attributes attrs) throws SAXException { + for (int i = 0; i < attrs.getLength(); i++) { + String attrName = attrs.getLocalName(i); + if (attrName.equals("ContentType")) { + String contentType = attrs.getValue(i); + if (OOXML_CONTENT_TYPES.containsKey(contentType)) { + mediaType = OOXML_CONTENT_TYPES.get(contentType); + throw new StoppingEarlyException(); + } + + } + } + } + } + + private static class StoppingEarlyException extends SAXException { + + } +} diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java index 3f2303b..0f448d5 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java @@ -16,10 +16,22 @@ */ package org.apache.tika.parser.pkg; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Enumeration; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Pattern; + import org.apache.commons.compress.archivers.ArchiveException; import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; -import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; import org.apache.commons.compress.archivers.zip.ZipFile; import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorStreamFactory; @@ -31,39 +43,15 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; import org.apache.poi.openxml4j.util.ZipEntrySource; import org.apache.poi.openxml4j.util.ZipFileZipEntrySource; -import org.apache.poi.xslf.usermodel.XSLFRelation; -import org.apache.poi.xssf.usermodel.XSSFRelation; -import org.apache.poi.xwpf.usermodel.XWPFRelation; +import org.apache.tika.config.Field; import org.apache.tika.detect.Detector; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.BoundedInputStream; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.iwork.IWorkPackageParser; import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType; import org.apache.tika.parser.iwork.iwana.IWork13PackageParser; -import org.apache.tika.utils.XMLReaderUtils; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import java.io.ByteArrayInputStream; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.Enumeration; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.regex.Pattern; - -import static java.nio.charset.StandardCharsets.UTF_8; /** * A detector that works on Zip documents and other archive and compression @@ -95,45 +83,14 @@ public class ZipContainerDetector implements Detector { private static final String XPS_DOCUMENT = "http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"; - private static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml"); - private static final MediaType DOCX = - MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document"); - private static final MediaType DOCM = - MediaType.application("vnd.ms-word.document.macroEnabled.12"); - private static final MediaType DOTX = - MediaType.application("vnd.ms-word.document.macroEnabled.12"); - private static final MediaType PPTX = - MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation"); - private static final MediaType PPTM = - MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12"); - private static final MediaType POTX = - MediaType.application("vnd.openxmlformats-officedocument.presentationml.template"); - private static final MediaType XLSX = - MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet"); - private static final MediaType XLSM = - MediaType.application("vnd.ms-excel.sheet.macroEnabled.12"); - - private static final Set<String> OOXML_HINTS = fillSet( - "word/document.xml", - "_rels/.rels", - "[Content_Types].xml", - "ppt/presentation.xml", - "ppt/slides/slide1.xml", - "xl/workbook.xml", - "xl/sharedStrings.xml", - "xl/worksheets/sheet1.xml" - ); - - static Set<String> fillSet(String ... args) { - Set<String> tmp = new HashSet<>(); - for (String arg : args) { - tmp.add(arg); - } - return Collections.unmodifiableSet(tmp); - } + /** Serial version UID */ private static final long serialVersionUID = 2891763938430295453L; + //this has to be > 100,000 to handle some of our test iworks files + @Field + int markLimit = 500_000; + public MediaType detect(InputStream input, Metadata metadata) throws IOException { // Check if we have access to the document @@ -141,32 +98,41 @@ public class ZipContainerDetector implements Detector { return MediaType.OCTET_STREAM; } - TemporaryResources tmp = new TemporaryResources(); + byte[] prefix = new byte[1024]; // enough for all known archive formats + input.mark(1024); + int length = -1; try { - TikaInputStream tis = TikaInputStream.get(input, tmp); + length = IOUtils.read(input, prefix); + } finally { + input.reset(); + } - byte[] prefix = new byte[1024]; // enough for all known formats - int length = tis.peek(prefix); + MediaType type = detectArchiveFormat(prefix, length); - MediaType type = detectArchiveFormat(prefix, length); + if (type == TIFF) { + return TIFF; + } else if (PackageParser.isZipArchive(type)) { - if (type == TIFF) { - return TIFF; - } else if (PackageParser.isZipArchive(type) - && TikaInputStream.isTikaInputStream(input)) { - return detectZipFormat(tis); - } else if (!type.equals(MediaType.OCTET_STREAM)) { - return type; - } else { - return detectCompressorFormat(prefix, length); + if (TikaInputStream.isTikaInputStream(input)) { + TikaInputStream tis = TikaInputStream.cast(input); + if (tis.hasFile()) { + return detectZipFormatOnFile(tis); + } } - } finally { + + input.mark(markLimit); try { - tmp.dispose(); - } catch (TikaException e) { - // ignore + return StreamingZipContainerDetector.detect( + new BoundedInputStream(markLimit, input)); + } finally { + input.reset(); } + } else if (!type.equals(MediaType.OCTET_STREAM)) { + return type; + } else { + return detectCompressorFormat(prefix, length); } + } private static MediaType detectCompressorFormat(byte[] prefix, int length) { @@ -211,17 +177,18 @@ public class ZipContainerDetector implements Detector { } } - private static MediaType detectZipFormat(TikaInputStream tis) { + /** + * This will call TikaInputStream's getFile(). If there are no exceptions, + * it will place the ZipFile in TikaInputStream's openContainer and leave it + * open. + * @param tis + * @return + */ + private static MediaType detectZipFormatOnFile(TikaInputStream tis) { try { - //try opc first because opening a package - //will not necessarily throw an exception for - //truncated files. - MediaType type = detectOPCBased(tis); - if (type != null) { - return type; - } ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()? + MediaType type = null; try { type = detectOpenDocument(zip); @@ -244,14 +211,17 @@ public class ZipContainerDetector implements Detector { return type; } } finally { - // TODO: shouldn't we record the open - // container so it can be later - // reused...? - // tis.setOpenContainer(zip); - try { - zip.close(); - } catch (IOException e) { - // ignore + tis.setOpenContainer(zip); + } + //finally, test for opc based + //if it is not an opc based file, poi throws an exception + //and we close the zip + //if it is opc based, we put the pkg in TikaInputStream's open container + if (zip.getEntry("_rels/.rels") != null + || zip.getEntry("[Content_Types].xml") != null) { + type = detectOPCBased(zip, tis); + if (type != null) { + return type; } } } catch (IOException e) { @@ -281,57 +251,32 @@ public class ZipContainerDetector implements Detector { } } - private static MediaType detectOPCBased(TikaInputStream stream) { + //If this is not an OPCBased file, POI throws an exception and we close the zipFile. + private static MediaType detectOPCBased(ZipFile zipFile, TikaInputStream stream) { + //as of 4.x, POI throws an exception for non-POI OPC file types + //unless we change POI, we can't rely on POI for non-POI files + ZipEntrySource zipEntrySource = new ZipFileZipEntrySource(zipFile); - ZipEntrySource zipEntrySource = null; - try { - zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile())); - } catch (IOException e) { - return tryStreamingDetection(stream); - } - - //if (zip.getEntry("_rels/.rels") != null - // || zip.getEntry("[Content_Types].xml") != null) { // Use POI to open and investigate it for us //Unfortunately, POI can throw a RuntimeException...so we //have to catch that. OPCPackage pkg = null; - try { - pkg = OPCPackage.open(zipEntrySource); - } catch (SecurityException e) { - closeQuietly(zipEntrySource); - //TIKA-2571 - throw e; - } catch (InvalidFormatException|RuntimeException e) { - closeQuietly(zipEntrySource); - return null; - } - MediaType type = null; try { - - // Is at an OOXML format? + pkg = OPCPackage.open(zipEntrySource); type = detectOfficeOpenXML(pkg); - if (type == null) { - // Is it XPS format? - type = detectXPSOPC(pkg); - } - if (type == null) { - // Is it an AutoCAD format? - type = detectAutoCADOPC(pkg); - } - } catch (SecurityException e) { closeQuietly(zipEntrySource); + IOUtils.closeQuietly(zipFile); //TIKA-2571 throw e; - } catch (RuntimeException e) { + } catch (InvalidFormatException|RuntimeException e) { closeQuietly(zipEntrySource); + IOUtils.closeQuietly(zipFile); return null; } //only set the open container if we made it here stream.setOpenContainer(pkg); - // We don't know what it is, sorry return type; } @@ -360,7 +305,19 @@ public class ZipContainerDetector implements Detector { if (core.size() == 0) { core = pkg.getRelationshipsByType(VISIO_DOCUMENT); } - + if (core.size() == 0) { + core = pkg.getRelationshipsByType(XPS_DOCUMENT); + if (core.size() == 1) { + return MediaType.application("vnd.ms-xpsdocument"); + } + } + + if (core.size() == 0) { + core = pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence"); + if (core.size() == 1) { + return MediaType.parse("model/vnd.dwfx+xps"); + } + } // If we didn't find a single core document of any type, skip detection if (core.size() != 1) { // Invalid OOXML Package received @@ -389,19 +346,7 @@ public class ZipContainerDetector implements Detector { // Build the MediaType object and return return MediaType.parse(docType); } - /** - * Detects Open XML Paper Specification (XPS) - */ - public static MediaType detectXPSOPC(OPCPackage pkg) { - PackageRelationshipCollection xps = - pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation"); - if (xps.size() == 1) { - return MediaType.application("vnd.ms-xpsdocument"); - } else { - // Non-XPS Package received - return null; - } - } + /** * Detects AutoCAD formats that live in OPC packaging */ @@ -534,95 +479,5 @@ public class ZipContainerDetector implements Detector { return null; } - private static MediaType tryStreamingDetection(TikaInputStream stream) { - Set<String> entryNames = new HashSet<>(); - try (InputStream is = new FileInputStream(stream.getFile())) { - ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is); - ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry(); - while (zae != null) { - if (zae.isDirectory()) { - zae = zipArchiveInputStream.getNextZipEntry(); - continue; - } - entryNames.add(zae.getName()); - //we could also parse _rel/.rels, but if - // there isn't a valid content_types, then POI - //will throw an exception...Better to backoff to PKG - //than correctly identify a truncated - if (zae.getName().equals("[Content_Types].xml")) { - MediaType mt = parseContentTypes(zipArchiveInputStream); - if (mt != null) { - return mt; - } - return TIKA_OOXML; - } - zae = zipArchiveInputStream.getNextZipEntry(); - } - } catch (SecurityException e) { - throw e; - } catch (Exception e) { - //swallow - } - int hits = 0; - for (String s : OOXML_HINTS) { - if (entryNames.contains(s)) { - hits++; - } - } - if (hits > 2) { - return TIKA_OOXML; - } - return MediaType.APPLICATION_ZIP; - } - - private static MediaType parseContentTypes(InputStream is) { - ContentTypeHandler contentTypeHandler = new ContentTypeHandler(); - try { - XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext()); - } catch (SecurityException e) { - throw e; - } catch (Exception e) { - - } - return contentTypeHandler.mediaType; - } - - - private static class ContentTypeHandler extends DefaultHandler { - static Map<String, MediaType> CONTENT_TYPES = new ConcurrentHashMap<>(); - static { - CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX); - CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM); - CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX); - - CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX); - CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM); - CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPTX); - CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM); - CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX); - } - - private MediaType mediaType = null; - - @Override - public void startElement(String uri, String localName, - String name, Attributes attrs) throws SAXException { - for (int i = 0; i < attrs.getLength(); i++) { - String attrName = attrs.getLocalName(i); - if (attrName.equals("ContentType")) { - String contentType = attrs.getValue(i); - if (CONTENT_TYPES.containsKey(contentType)) { - mediaType = CONTENT_TYPES.get(contentType); - throw new StoppingEarlyException(); - } - - } - } - } - } - - private static class StoppingEarlyException extends SAXException { - - } } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java index 20ebf1b..f7cf08a 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java @@ -18,9 +18,9 @@ package org.apache.tika.parser.utils; import java.io.EOFException; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.file.Files; import java.util.zip.ZipException; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; @@ -38,52 +38,57 @@ public class ZipSalvager { * This streams the broken zip and rebuilds a new zip that * is at least a valid zip file. The contents of the final stream * may be truncated, but the result should be a valid zip file. - * + * <p> * This does nothing fancy to fix the underlying broken zip. * * @param brokenZip * @param salvagedZip */ - public static void salvageCopy(File brokenZip, File salvagedZip) { + public static void salvageCopy(InputStream brokenZip, File salvagedZip) { try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) { - try (InputStream is = new FileInputStream(brokenZip)) { - ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is); - ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry(); - while (zae != null) { - try { - if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) { - //create a new ZAE and copy over only the name so that - //if there is bad info (e.g. CRC) in brokenZip's zae, that - //won't be propagated or cause an exception - outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName())); - //this will copy an incomplete stream...so there - //could be truncation of the xml/contents, but the zip file - //should be intact. - boolean successfullyCopied = false; - try { - IOUtils.copy(zipArchiveInputStream, outputStream); - successfullyCopied = true; - } catch (IOException e) { - //this can hit a "truncated ZipFile" IOException - } - outputStream.flush(); - outputStream.closeArchiveEntry(); - if (!successfullyCopied) { - break; - } + ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip); + ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry(); + while (zae != null) { + try { + if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) { + //create a new ZAE and copy over only the name so that + //if there is bad info (e.g. CRC) in brokenZip's zae, that + //won't be propagated or cause an exception + outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName())); + //this will copy an incomplete stream...so there + //could be truncation of the xml/contents, but the zip file + //should be intact. + boolean successfullyCopied = false; + try { + IOUtils.copy(zipArchiveInputStream, outputStream); + successfullyCopied = true; + } catch (IOException e) { + //this can hit a "truncated ZipFile" IOException + } + outputStream.flush(); + outputStream.closeArchiveEntry(); + if (!successfullyCopied) { + break; } - zae = zipArchiveInputStream.getNextZipEntry(); - } catch (ZipException|EOFException e) { - break; } - + zae = zipArchiveInputStream.getNextZipEntry(); + } catch (ZipException | EOFException e) { + break; } - outputStream.flush(); - outputStream.finish(); - outputStream.close(); + } + outputStream.flush(); + outputStream.finish(); + + } catch (IOException e) { LOG.warn("problem fixing zip", e); } } + + public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException { + try (InputStream is = Files.newInputStream(brokenZip.toPath())) { + salvageCopy(is, salvagedZip); + } + } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java index 1cf1874..1247cc1 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java @@ -38,6 +38,7 @@ import java.nio.file.Paths; import java.util.List; import java.util.Random; +import static junit.framework.TestCase.assertTrue; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java index 2865442..bcba573 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java @@ -18,38 +18,178 @@ package org.apache.tika.parser.pkg; -import org.apache.commons.compress.compressors.CompressorStreamFactory; -import org.apache.tika.TikaTest; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; -import org.junit.BeforeClass; -import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; import java.io.InputStream; +import java.nio.file.Paths; +import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import java.util.Set; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; +import org.apache.tika.TikaTest; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.detect.Detector; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MediaTypeRegistry; +import org.apache.tika.parser.odf.ODFParserTest; +import org.junit.Ignore; +import org.junit.Test; public class ZipContainerDetectorTest extends TikaTest { + private static MediaType ODT_TEXT = MediaType.application("vnd.oasis.opendocument.text"); + private static MediaType TIFF = MediaType.image("tiff"); + ZipContainerDetector zipContainerDetector = new ZipContainerDetector(); @Test public void testTiffWorkaround() throws Exception { //TIKA-2591 - ZipContainerDetector zipContainerDetector = new ZipContainerDetector(); Metadata metadata = new Metadata(); try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) { MediaType mt = zipContainerDetector.detect(is, metadata); - assertEquals(MediaType.image("tiff"), mt); + assertEquals(TIFF, mt); } metadata = new Metadata(); try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) { MediaType mt = zipContainerDetector.detect(is, metadata); - assertEquals(MediaType.image("tiff"), mt); + assertEquals(TIFF, mt); + } + } + + @Test + public void testODT() throws Exception { + try (InputStream input = ODFParserTest.class.getResourceAsStream( + "/test-documents/testODFwithOOo3.odt")) { + Metadata metadata = new Metadata(); + MediaType mt = zipContainerDetector.detect(input, metadata); + assertEquals(ODT_TEXT, mt); + } + } + + @Test + public void testIWorks() throws Exception { + //have to have marklimit in ZipContainerDetector > 100000 for this to work + try (InputStream input = ODFParserTest.class.getResourceAsStream( + "/test-documents/testPages.pages")) { + Metadata metadata = new Metadata(); + MediaType mt = zipContainerDetector.detect(input, metadata); + assertEquals("application/vnd.apple.pages", mt.toString()); + } + + InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pkg/tika-config.xml"); + assertNotNull(is); + TikaConfig tikaConfig = new TikaConfig(is); + try (InputStream input = ODFParserTest.class.getResourceAsStream( + "/test-documents/testPages.pages")) { + Metadata metadata = new Metadata(); + MediaType mt = tikaConfig.getDetector().detect(input, metadata); + assertEquals("application/zip", mt.toString()); + } + } + + @Test + public void testXPS() throws Exception { + for (String file : new String[]{"testXPS_various.xps", "testPPT.xps"}) { + long start = System.currentTimeMillis(); + try (InputStream input = ODFParserTest.class.getResourceAsStream( + "/test-documents/" + file)) { + MediaType mediaType = StreamingZipContainerDetector.detect(input); + assertEquals(AbstractZipContainerDetector.XPS, mediaType); + } + try (TikaInputStream input = TikaInputStream.get(Paths.get(ODFParserTest.class.getResource( + "/test-documents/" + file).toURI()))) { + MediaType mediaType = zipContainerDetector.detect(input, new Metadata()); + assertEquals(AbstractZipContainerDetector.XPS, mediaType); + } } + } + @Test + @Ignore("to be used for offline timing tests") + public void timeDetection() throws Exception { + TikaConfig config = TikaConfig.getDefaultConfig(); + Detector detector = config.getDetector(); + MediaTypeRegistry registry = config.getMediaTypeRegistry(); + List<File> zips = getTestZipBasedFiles(detector, registry); + + Set<MediaType> mediaTypeSet = new HashSet<>(); + long stream = 0; + long file = 0; + for (int i = 0; i < 20; i++) { + for (File z : zips) { + long start = System.currentTimeMillis(); + try (InputStream is = new BufferedInputStream(new FileInputStream(z))) { + MediaType mt = detector.detect(is, new Metadata()); + mediaTypeSet.add(mt); + } + stream += System.currentTimeMillis()-start; + } + + for (File z : zips) { + long start = System.currentTimeMillis(); + try (InputStream is = TikaInputStream.get(z)) { + MediaType mt = detector.detect(is, new Metadata()); + mediaTypeSet.add(mt); + } + file += System.currentTimeMillis()-start; + } + } + System.out.println("stream: "+stream + " file: "+file); + } + + @Test + @Ignore("to be used for offline timing tests") + public void timeParsing() throws Exception { + TikaConfig config = TikaConfig.getDefaultConfig(); + Detector detector = config.getDetector(); + MediaTypeRegistry registry = config.getMediaTypeRegistry(); + + List<File> zips = getTestZipBasedFiles(detector, registry); + System.out.println("zips size: "+zips.size()); + Set<MediaType> mediaTypeSet = new HashSet<>(); + long stream = 0; + long file = 0; + for (int i = 0; i < 20; i++) { + for (File z : zips) { + long start = System.currentTimeMillis(); + try (InputStream is = new BufferedInputStream(new FileInputStream(z))) { + getRecursiveMetadata(is, true); + } + stream += System.currentTimeMillis()-start; + } + + for (File z : zips) { + long start = System.currentTimeMillis(); + try (InputStream is = TikaInputStream.get(z)) { + getRecursiveMetadata(is, true); + } + file += System.currentTimeMillis()-start; + } + } + System.out.println("stream: "+stream + " file: "+file); + } + + //TODO -- we need to find a dwg+xps file for testing + + private List<File> getTestZipBasedFiles(Detector detector, MediaTypeRegistry registry) throws Exception { + List<File> zips = new ArrayList<>(); + for (File f : Paths.get( + this.getClass().getResource("/test-documents").toURI()).toFile().listFiles()) { + try (InputStream is = TikaInputStream.get(f)) { + MediaType mt = detector.detect(is, new Metadata()); + if (registry.isSpecializationOf(mt, MediaType.APPLICATION_ZIP)) { + zips.add(f); + } + } catch (Exception e) { + + } + } + return zips; } } \ No newline at end of file diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml new file mode 100644 index 0000000..97d7c7b --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml @@ -0,0 +1,31 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers/> + <detectors> + <detector class="org.apache.tika.detect.DefaultDetector"> + <detector-exclude class="org.apache.tika.parser.pkg.ZipContainerDetector"/> + </detector> + <detector class="org.apache.tika.parser.pkg.ZipContainerDetector"> + <params> + <param name="markLimit" type="int">100000</param> + </params> + </detector> + </detectors> + <translator class="org.apache.tika.language.translate.DefaultTranslator"/> +</properties> \ No newline at end of file
