This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-2849
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-2849 by this push:
new bcb587d TIKA-2849 -- improve documentation, add spoolToFile parameter
bcb587d is described below
commit bcb587d226c4300a821d6b0da6b19b6b41872883
Author: TALLISON <[email protected]>
AuthorDate: Fri Apr 19 11:13:51 2019 -0400
TIKA-2849 -- improve documentation, add spoolToFile parameter
---
CHANGES.txt | 5 ++++
.../microsoft/ooxml/OOXMLExtractorFactory.java | 7 ++++--
.../parser/pkg/StreamingZipContainerDetector.java | 29 ++++++++++++++--------
.../tika/parser/pkg/ZipContainerDetector.java | 29 +++++++++++++++++++++-
...Detector.java => ZipContainerDetectorBase.java} | 3 +--
.../tika/parser/pkg/ZipContainerDetectorTest.java | 4 +--
6 files changed, 60 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index c73a3df..61d4c7c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,11 @@ Release 2.0.0 - ???
Release 1.21 - ????
+ * The ZipContainerDetector's default behavior was changed to run
+ streaming detection up to its markLimit. Users can get the
+ legacy behavior (spool-to-file/rely-on-underlying-file-in-TikaInputStream)
+ by setting "spoolToFile" to true and using a TikaInputStream (TIKA-2849).
+
* Upgrade PDFBox to 2.0.14 (TIKA-2834).
* Add CSV detection and replace TXTParser with TextAndCSVParser;
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index c37d895..4ac436c 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -95,7 +95,7 @@ public class OOXMLExtractorFactory {
try {
pkg = OPCPackage.open(tis.getFile().getPath(),
PackageAccess.READ);
} catch (InvalidOperationException e) {
- tmpRepairedCopy = File.createTempFile("tika-ooxml-repair",
"");
+ tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair-", "");
ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
@@ -110,8 +110,11 @@ public class OOXMLExtractorFactory {
pkg = OPCPackage.open(rereadableInputStream);
} catch (EOFException e) {
rereadableInputStream.rewind();
- tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair", "");
+ tmpRepairedCopy =
File.createTempFile("tika-ooxml-repair-", "");
ZipSalvager.salvageCopy(rereadableInputStream,
tmpRepairedCopy);
+ //if there isn't enough left to be opened as a package
+ //throw an exception -- we may want to fall back to
streaming
+ //parsing
pkg = OPCPackage.open(tmpRepairedCopy,
PackageAccess.READ);
}
}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index 4d54b61..61db730 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -30,6 +30,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.poi.xslf.usermodel.XSLFRelation;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.IWorkPackageParser;
@@ -38,7 +39,7 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-class StreamingZipContainerDetector extends AbstractZipContainerDetector {
+class StreamingZipContainerDetector extends ZipContainerDetectorBase {
static Map<String, MediaType> OOXML_CONTENT_TYPES = new
ConcurrentHashMap<>();
static {
@@ -60,12 +61,21 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
OOXML_CONTENT_TYPES.put("application/vnd.ms-package.xps-fixeddocumentsequence+xml",
XPS);
}
+ /**
+ *
+ * @param is inputstream to read from. Callers must mark/reset the stream
+ * before/after this call to detect. This call does not close
the stream!
+ * Depending on the file type, this call to detect may read the
entire stream.
+ * Make sure to use a {@link
org.apache.tika.io.BoundedInputStream} or similar
+ * if you want to protect against reading the entire stream.
+ * @return the detected media type (e.g. an OOXML/ODF/iWorks type), or
+ application/zip as a fallback
+ */
static MediaType detect(InputStream is) {
Set<String> fileNames = new HashSet<>();
Set<String> directoryNames = new HashSet<>();
- try {
- ZipArchiveInputStream zipArchiveInputStream = new
ZipArchiveInputStream(is);
+ try (ZipArchiveInputStream zipArchiveInputStream =
+ new ZipArchiveInputStream(new
CloseShieldInputStream(is))) {
ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
while (zae != null) {
String name = zae.getName();
@@ -91,7 +101,8 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
return type.getType();
}
} else if (name.equals("mimetype")) {
- //odt
+ //odt -- TODO -- bound the read and check that the results
are
+ //valid
return
MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8));
}
zae = zipArchiveInputStream.getNextZipEntry();
@@ -101,6 +112,7 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
} catch (Exception e) {
//swallow
}
+ //entryNames is the union of directory names and file names
Set<String> entryNames = new HashSet<>(fileNames);
entryNames.addAll(fileNames);
MediaType mt = detectKmz(fileNames);
@@ -122,12 +134,11 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
int hits = 0;
for (String s : OOXML_HINTS) {
if (entryNames.contains(s)) {
- hits++;
+ if (++hits > 2) {
+ return TIKA_OOXML;
+ }
}
}
- if (hits > 2) {
- return TIKA_OOXML;
- }
return MediaType.APPLICATION_ZIP;
}
@@ -184,8 +195,6 @@ class StreamingZipContainerDetector extends
AbstractZipContainerDetector {
}
-
-
private static class ContentTypeHandler extends DefaultHandler {
private MediaType mediaType = null;
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 0f448d5..d9770f7 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -87,10 +87,14 @@ public class ZipContainerDetector implements Detector {
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
- //this has to be > 100,000 to handle some of our test iworks files
+ //this has to be > 100,000 to handle some of the iworks files
+ //in our unit tests
@Field
int markLimit = 500_000;
+ @Field
+ boolean spoolToFile = false;
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
@@ -115,6 +119,9 @@ public class ZipContainerDetector implements Detector {
if (TikaInputStream.isTikaInputStream(input)) {
TikaInputStream tis = TikaInputStream.cast(input);
+ if (spoolToFile) {
+ tis.getFile();
+ }
if (tis.hasFile()) {
return detectZipFormatOnFile(tis);
}
@@ -132,7 +139,27 @@ public class ZipContainerDetector implements Detector {
} else {
return detectCompressorFormat(prefix, length);
}
+ }
+
+ /**
+ *
+ * @param markLimit mark limit for streaming detection
+ */
+ public void setMarkLimit(int markLimit) {
+ this.markLimit = markLimit;
+ }
+ /**
+ * Before version 1.21, if a user passed in a {@link TikaInputStream},
+ * the entire stream would be spooled to a file for Zip detection.
+ * With Tika 1.21, the default is now <code>false</code>, which means
+ * that the {@link ZipContainerDetector} will try streaming detection
+ * up to the {@link ZipContainerDetector#markLimit} on all streams.
+ * To revert to the legacy (pre 1.21) behavior, set this to
<code>true</code>.
+ * @param spoolToFile whether to spool the entire stream to a file before detection
+ */
+ public void setSpoolToFile(boolean spoolToFile) {
+ this.spoolToFile = spoolToFile;
}
private static MediaType detectCompressorFormat(byte[] prefix, int length)
{
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
similarity index 99%
rename from
tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
rename to
tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
index beadf74..1e3aa58 100644
---
a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/AbstractZipContainerDetector.java
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
@@ -24,7 +24,7 @@ import java.util.regex.Pattern;
import org.apache.tika.mime.MediaType;
-abstract class AbstractZipContainerDetector {
+abstract class ZipContainerDetectorBase {
static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
@@ -36,7 +36,6 @@ abstract class AbstractZipContainerDetector {
MediaType.application("vnd.ms-word.document.macroEnabled.12");
static final MediaType PPTX =
MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
-
static final MediaType PPSM =
MediaType.application("vnd.ms-powerpoint.slideshow.macroEnabled.12");
static final MediaType PPSX =
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index d18e638..f4f5059 100644
---
a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -100,12 +100,12 @@ public class ZipContainerDetectorTest extends TikaTest {
try (InputStream input = ODFParserTest.class.getResourceAsStream(
"/test-documents/" + file)) {
MediaType mediaType =
StreamingZipContainerDetector.detect(input);
- assertEquals(AbstractZipContainerDetector.XPS, mediaType);
+ assertEquals(ZipContainerDetectorBase.XPS, mediaType);
}
try (TikaInputStream input =
TikaInputStream.get(Paths.get(ODFParserTest.class.getResource(
"/test-documents/" + file).toURI()))) {
MediaType mediaType = zipContainerDetector.detect(input, new
Metadata());
- assertEquals(AbstractZipContainerDetector.XPS, mediaType);
+ assertEquals(ZipContainerDetectorBase.XPS, mediaType);
}
}
}