[ https://issues.apache.org/jira/browse/TIKA-4459?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18010647#comment-18010647 ]
Manish S N edited comment on TIKA-4459 at 7/29/25 12:49 PM:
------------------------------------------------------------

I did a performance test comparing spooling (for every file type) against no explicit spooling. The results:

{code:java}
17:05:39.547 [main] INFO org.manish.PerfTest -- #$# spooling: true, fileCount: 23, meanTime: 5.96, stdDeviation: 3.78, minTime: 2.0, maxTime: 13.0
17:05:39.547 [main] INFO org.manish.PerfTest -- #$# spooling: false, fileCount: 23, meanTime: 5.13, stdDeviation: 4.81, minTime: 1.0, maxTime: 17.0
{code}

I only walked the files in my documents folder, and my machine has an SSD, so the results might be influenced by that. Judging by the spread, though, the spooling variant performed more consistently: its standard deviation and maximum are lower, even if its mean time is slightly higher. The performance test code is below so you can try it on your own folder of test files:

{code:java}
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Stream;

import org.apache.commons.compress.PasswordRequiredException;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.tika.Tika;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PerfTest {

    // This class tests the performance difference between spooling and not
    // spooling the input stream.
    public static final Path TEST_FOLDER = Path.of("/path/redacted/for/privacy");

    static Logger log = LoggerFactory.getLogger(PerfTest.class);
    static DescriptiveStatistics spoolingStats = new DescriptiveStatistics();
    static DescriptiveStatistics nonSpoolingStats = new DescriptiveStatistics();
    static Tika parser = new Tika();

    public static void main(String[] args) {
        testSpoolingPerf();
    }

    private static String parse(InputStream fileStream, Metadata meta) throws TikaException, IOException {
        return parser.parseToString(fileStream, meta);
    }

    private static void testSpoolingPerf() {
        // Walk the files in the folder and call testPath on each regular file.
        try (Stream<Path> paths = Files.walk(TEST_FOLDER)) {
            paths.filter(Files::isRegularFile).forEach(PerfTest::testPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
        logStats(spoolingStats, true);
        logStats(nonSpoolingStats, false);
    }

    private static void logStats(DescriptiveStatistics stats, boolean spooling) {
        // Log overall statistics for the time taken.
        log.info("#$# spooling: {}, fileCount: {}, meanTime: {}, stdDeviation: {}, minTime: {}, maxTime: {}",
                spooling, stats.getN(),
                String.format("%.2f", stats.getMean()),
                String.format("%.2f", stats.getStandardDeviation()),
                stats.getMin(), stats.getMax());
    }

    private static void testPath(Path file) {
        parsePath(file, true, null);  // for warmup
        parsePath(file, false, null); // for warmup
        log.info("Warmup completed");
        parsePath(file, true, spoolingStats);     // parse with spooling
        parsePath(file, false, nonSpoolingStats); // parse without spooling
    }

    private static void parsePath(Path filePath, boolean spooling, DescriptiveStatistics stats) {
        long startTime = System.currentTimeMillis();
        long size = -1;
        long parsedSize = -1;
        try (InputStream inputStream = TikaInputStream.get(Files.newInputStream(filePath))) {
            log.debug("\n parsing file: {}", filePath.getFileName());
            Metadata meta = new Metadata();
            meta.set("resourceName", filePath.getFileName().toString());
            meta.set("size", String.valueOf(Files.size(filePath)));
            TikaInputStream tis = TikaInputStream.get(inputStream);
            if (spooling) {
                tis.getPath(); // spools the input stream to a temp file
            }
            String content = parse(tis, meta);
            size = Files.size(filePath);
            parsedSize = content.length();
            log.debug("File: {} , Type: {} , Size: {} , ParsedSize: {}",
                    filePath.getFileName(), meta.get("Content-Type"), size, parsedSize);
            log.debug("Content:\n{}", content);
        } catch (EncryptedDocumentException e) {
            log.info("File is encrypted: {}", filePath.getFileName());
        } catch (UnsupportedFormatException e) {
            log.info("Unsupported format for file: {}", filePath.getFileName());
        } catch (TikaException e) {
            // Some parsers wrap the password failure, so walk the cause chain.
            Throwable cause = e.getCause();
            while (cause != null) {
                if (cause instanceof PasswordRequiredException) {
                    log.info("File is encrypted: {}", filePath.getFileName());
                    return;
                }
                cause = cause.getCause();
            }
            log.error("Error processing file: {}", filePath.getFileName(), e);
        } catch (IOException e) {
            log.error("Error processing file: {}", filePath.getFileName(), e);
        }
        long timeTaken = System.currentTimeMillis() - startTime;
        if (stats != null) {
            stats.addValue(timeTaken);
        }
        log.info("#@# spool: {} , file: {} , size: {} , parsedSize: {} , time taken: {}",
                spooling, filePath.getFileName(), size, parsedSize, timeTaken);
    }
}
{code}

P.S.:
- You can run the code directly after changing the TEST_FOLDER constant to your test files folder's path
- I used the SLF4J logger, which expects an SLF4J implementation on the classpath; you can also replace all log.info|error|debug calls with System.out.println if you prefer
- I used org.apache.commons.math3.stat.descriptive.DescriptiveStatistics for ease of calculating the mean, max, min, count, and standard deviation
- The #@# and #$# markers are just there to make the results easy to search for and extract
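For context on why spooling changes the code path at all: with a real file on disk, the zip can be read via random access (java.util.zip.ZipFile), while a raw stream forces sequential reading through java.util.zip.ZipInputStream, which is the path that fails in the stack trace quoted below. A minimal sketch of that difference, not part of the original test; the class name is made up and the file name is hypothetical (point it at an encrypted ODF such as the attached protected.odt):

{code:java}
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

public class ZipAccessSketch {
    public static void main(String[] args) throws Exception {
        Path odf = Path.of("protected.odt"); // hypothetical: any encrypted ODF file

        // Random access: the central directory is read first, so entry sizes
        // are known up front and STORED entries with EXT descriptors can be listed.
        try (ZipFile zip = new ZipFile(odf.toFile())) {
            zip.stream().forEach(e -> System.out.println("ZipFile sees: " + e.getName()));
        }

        // Streaming access: local headers are parsed one by one; for an entry
        // that is STORED but followed by an EXT descriptor, getNextEntry()
        // throws "only DEFLATED entries can have EXT descriptor".
        try (InputStream in = Files.newInputStream(odf);
             ZipInputStream zis = new ZipInputStream(in)) {
            ZipEntry entry;
            while ((entry = zis.getNextEntry()) != null) {
                System.out.println("ZipInputStream sees: " + entry.getName());
            }
        }
    }
}
{code}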
> protected ODF encryption detection fail
> ---------------------------------------
>
>                 Key: TIKA-4459
>                 URL: https://issues.apache.org/jira/browse/TIKA-4459
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>    Affects Versions: 3.2.1
>         Environment: Ubuntu 24.04.2 LTS x86_64
>            Reporter: Manish S N
>            Priority: Minor
>              Labels: encryption, odf, open-document-format, protected, regression, zip
>             Fix For: 4.0.0, 3.2.2
>
>         Attachments: protected.odt, testProtected.odp
>
>
> When passing an InputStream of a protected ODF-format file to Tika, we get a
> ZipException instead of an EncryptedDocumentException.
> This works well and correctly throws EncryptedDocumentException if you create
> the TikaInputStream with a Path or call TikaInputStream.getPath(), as that
> writes the stream to a temporary file.
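> A minimal sketch of the two behaviors described above (not from the original
> report; the class name is made up and the path is hypothetical):
> {code:java}
> import java.nio.file.Files;
> import java.nio.file.Path;
> import org.apache.tika.Tika;
> import org.apache.tika.io.TikaInputStream;
> import org.apache.tika.metadata.Metadata;
>
> public class ProtectedOdfRepro {
>     public static void main(String[] args) throws Exception {
>         Path file = Path.of("protected.odt"); // hypothetical path to the attached file
>         Tika tika = new Tika();
>
>         // Path-backed stream: the parser can use random access, and the
>         // expected EncryptedDocumentException is thrown.
>         try (TikaInputStream tis = TikaInputStream.get(file)) {
>             tika.parseToString(tis, new Metadata());
>         } catch (Exception e) {
>             System.out.println("Path-backed:  " + e); // EncryptedDocumentException
>         }
>
>         // Raw InputStream: the parser falls back to streaming the zip, and a
>         // TikaException caused by java.util.zip.ZipException surfaces instead.
>         try (TikaInputStream tis = TikaInputStream.get(Files.newInputStream(file))) {
>             tika.parseToString(tis, new Metadata());
>         } catch (Exception e) {
>             System.out.println("Stream-backed: " + e); // TikaException <- ZipException
>         }
>     }
> }
> {code}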
> But when working with InputStreams we get the following zip exception:
>
> org.apache.tika.exception.TikaException: TIKA-198: Illegal IOException from org.apache.tika.parser.odf.OpenDocumentParser@bae47a0
> 	at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:304)
> 	at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298)
> 	at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:204)
> 	at org.apache.tika.Tika.parseToString(Tika.java:525)
> 	at org.apache.tika.Tika.parseToString(Tika.java:495)
> 	at org.manish.AttachmentParser.parse(AttachmentParser.java:21)
> 	at org.manish.AttachmentParser.lambda$testParse$1(AttachmentParser.java:72)
> 	at java.base/java.util.stream.ForEachOps$ForEachOp$OfRef.accept(ForEachOps.java:183)
> 	at java.base/java.util.stream.ReferencePipeline$2$1.accept(ReferencePipeline.java:177)
> 	at java.base/java.util.stream.ReferencePipeline$3$1.accept(ReferencePipeline.java:195)
> 	at java.base/java.util.Iterator.forEachRemaining(Iterator.java:133)
> 	at java.base/java.util.Spliterators$IteratorSpliterator.forEachRemaining(Spliterators.java:1801)
> 	at java.base/java.util.stream.AbstractPipeline.copyInto(AbstractPipeline.java:484)
> 	at java.base/java.util.stream.AbstractPipeline.wrapAndCopyInto(AbstractPipeline.java:474)
> 	at java.base/java.util.stream.ForEachOps$ForEachOp.evaluateSequential(ForEachOps.java:150)
> 	at java.base/java.util.stream.ForEachOps$ForEachOp$OfRef.evaluateSequential(ForEachOps.java:173)
> 	at java.base/java.util.stream.AbstractPipeline.evaluate(AbstractPipeline.java:234)
> 	at java.base/java.util.stream.ReferencePipeline.forEach(ReferencePipeline.java:497)
> 	at org.manish.AttachmentParser.testParse(AttachmentParser.java:64)
> 	at org.manish.AttachmentParser.main(AttachmentParser.java:57)
> Caused by: java.util.zip.ZipException: only DEFLATED entries can have EXT descriptor
> 	at java.base/java.util.zip.ZipInputStream.readLOC(ZipInputStream.java:313)
> 	at java.base/java.util.zip.ZipInputStream.getNextEntry(ZipInputStream.java:125)
> 	at org.apache.tika.parser.odf.OpenDocumentParser.handleZipStream(OpenDocumentParser.java:218)
> 	at org.apache.tika.parser.odf.OpenDocumentParser.parse(OpenDocumentParser.java:169)
> 	at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:298)
> 	... 19 more
>
> (We use tika to detect encrypted docs)

--
This message was sent by Atlassian Jira
(v8.20.10#820010)