This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-2849
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-2849 by this push:
new bcb587d TIKA-2849 -- improve documentation, add spoolToFile parameter
bcb587d is described below
commit bcb587d226c4300a821d6b0da6b19b6b41872883
Author: TALLISON <[email protected]>
AuthorDate: Fri Apr 19 11:13:51 2019 -0400
TIKA-2849 -- improve documentation, add spoolToFile parameter
---
CHANGES.txt | 5 ++++
.../microsoft/ooxml/OOXMLExtractorFactory.java | 7 ++++--
.../parser/pkg/StreamingZipContainerDetector.java | 29 ++++++++++++++--------
.../tika/parser/pkg/ZipContainerDetector.java | 29 +++++++++++++++++++++-
...Detector.java => ZipContainerDetectorBase.java} | 3 +--
.../tika/parser/pkg/ZipContainerDetectorTest.java | 4 +--
6 files changed, 60 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index c73a3df..61d4c7c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,11 @@ Release 2.0.0 - ???
Release 1.21 - ????
+ * The ZipContainerDetector's default behavior was changed to run
+ streaming detection up to its markLimit. Users can get the
+ legacy behavior (spool-to-file/rely-on-underlying-file-in-TikaInputStream)
+ by setting "spoolToFile" to true and using a TikaInputStream (TIKA-2849).
+
* Upgrade PDFBox to 2.0.14 (TIKA-2834).
* Add CSV detection and replace TXTParser with TextAndCSVParser;
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index c37d895..4ac436c 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -95,7 +95,7 @@ public class OOXMLExtractorFactory {
try {
pkg = OPCPackage.open(tis.getFile().getPath(),
PackageAccess.READ);
} catch (InvalidOperationException e) {
- tmpRepairedCopy = File.createTempFile("tika-ooxml-repair",
"");
+ tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair-", "");
ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
@@ -110,8 +110,11 @@ public class OOXMLExtractorFactory {
pkg = OPCPackage.open(rereadableInputStream);
} catch (EOFException e) {
rereadableInputStream.rewind();
- tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair", "");
+ tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair-", "");
ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy);
+ //if there isn't enough left to be opened as a package
+ //throw an exception -- we may want to fall back to
streaming
+ //parsing
pkg = OPCPackage.open(tmpRepairedCopy,
PackageAccess.READ);
}
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index 4d54b61..61db730 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -30,6 +30,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.IWorkPackageParser;
@@ -38,7 +39,7 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-class StreamingZipContainerDetector extends AbstractZipContainerDetector {
+class StreamingZipContainerDetector extends ZipContainerDetectorBase {
static Map<String, MediaType> OOXML_CONTENT_TYPES = new
ConcurrentHashMap<>();
static {
@@ -60,12 +61,21 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
OOXML_CONTENT_TYPES.put("application/vnd.ms-package.xps-fixeddocumentsequence+xml",
XPS);
}
+ /**
+ *
+ * @param is inputstream to read from. Callers must mark/reset the stream
+ * before/after this call to detect. This call does not close
the stream!
+ * Depending on the file type, this call to detect may read the
entire stream.
+ * Make sure to use a {@link
org.apache.tika.io.BoundedInputStream} or similar
+ * if you want to protect against reading the entire stream.
+ * @return the detected media type (e.g. an OOXML/ODF/iWorks type), or
+ application/zip as a fallback
+ */
static MediaType detect(InputStream is) {
Set<String> fileNames = new HashSet<>();
Set<String> directoryNames = new HashSet<>();
- try {
- ZipArchiveInputStream zipArchiveInputStream = new
ZipArchiveInputStream(is);
+ try (ZipArchiveInputStream zipArchiveInputStream =
+ new ZipArchiveInputStream(new
CloseShieldInputStream(is))) {
ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
while (zae != null) {
String name = zae.getName();
@@ -91,7 +101,8 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
return type.getType();
}
} else if (name.equals("mimetype")) {
- //odt
+ //odt -- TODO -- bound the read and check that the results
are
+ //valid
return
MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8));
}
zae = zipArchiveInputStream.getNextZipEntry();
@@ -101,6 +112,7 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
} catch (Exception e) {
//swallow
}
+ //entryNames is the union of directory names and file names
Set<String> entryNames = new HashSet<>(fileNames);
entryNames.addAll(fileNames);
MediaType mt = detectKmz(fileNames);
@@ -122,12 +134,11 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
int hits = 0;
for (String s : OOXML_HINTS) {
if (entryNames.contains(s)) {
- hits++;
+ if (++hits > 2) {
+ return TIKA_OOXML;
+ }
}
}
- if (hits > 2) {
- return TIKA_OOXML;
- }
return MediaType.APPLICATION_ZIP;
}
@@ -184,8 +195,6 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
}
-
-
private static class ContentTypeHandler extends DefaultHandler {
private MediaType mediaType = null;
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 0f448d5..d9770f7 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -87,10 +87,14 @@ public class ZipContainerDetector implements Detector {
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
- //this has to be > 100,000 to handle some of our test iworks files
+ //this has to be > 100,000 to handle some of the iworks files
+ //in our unit tests
@Field
int markLimit = 500_000;
+ @Field
+ boolean spoolToFile = false;
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
@@ -115,6 +119,9 @@ public class ZipContainerDetector implements Detector {
if (TikaInputStream.isTikaInputStream(input)) {
TikaInputStream tis = TikaInputStream.cast(input);
+ if (spoolToFile) {
+ tis.getFile();
+ }
if (tis.hasFile()) {
return detectZipFormatOnFile(tis);
}
@@ -132,7 +139,27 @@ public class ZipContainerDetector implements Detector {
} else {
return detectCompressorFormat(prefix, length);
}
+ }
+
+ /**
+ *
+ * @param markLimit mark limit for streaming detection
+ */
+ public void setMarkLimit(int markLimit) {
+ this.markLimit = markLimit;
+ }
+ /**
+ * Before version 1.21, if a user passed in a {@link TikaInputStream},
+ * the entire stream would be spooled to a file for Zip detection.
+ * With Tika 1.21, the default is now <code>false</code>, which means
+ * that the {@link ZipContainerDetector} will try streaming detection
+ * up to the {@link ZipContainerDetector#markLimit} on all streams.
+ * To revert to the legacy (pre 1.21) behavior, set this to
<code>true</code>.
+ * @param spoolToFile whether to spool the entire stream to a file before detection
+ */
+ public void setSpoolToFile(boolean spoolToFile) {
+ this.spoolToFile = spoolToFile;
}
private static MediaType detectCompressorFormat(byte[] prefix, int length)
{
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
similarity index 99%
rename from
tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
rename to
tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
index beadf74..1e3aa58 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
@@ -24,7 +24,7 @@ import java.util.regex.Pattern;
import org.apache.tika.mime.MediaType;
-abstract class AbstractZipContainerDetector {
+abstract class ZipContainerDetectorBase {
static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
@@ -36,7 +36,6 @@ abstract class AbstractZipContainerDetector {
MediaType.application("vnd.ms-word.document.macroEnabled.12");
static final MediaType PPTX =
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
-
static final MediaType PPSM =
MediaType.application("vnd.ms-powerpoint.slideshow.macroEnabled.12");
static final MediaType PPSX =
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index d18e638..f4f5059 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -100,12 +100,12 @@ public class ZipContainerDetectorTest extends TikaTest {
try (InputStream input = ODFParserTest.class.getResourceAsStream(
"/test-documents/" + file)) {
MediaType mediaType =
StreamingZipContainerDetector.detect(input);
- assertEquals(AbstractZipContainerDetector.XPS, mediaType);
+ assertEquals(ZipContainerDetectorBase.XPS, mediaType);
}
try (TikaInputStream input =
TikaInputStream.get(Paths.get(ODFParserTest.class.getResource(
"/test-documents/" + file).toURI()))) {
MediaType mediaType = zipContainerDetector.detect(input, new
Metadata());
- assertEquals(AbstractZipContainerDetector.XPS, mediaType);
+ assertEquals(ZipContainerDetectorBase.XPS, mediaType);
}
}
}