This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4388
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4388 by this push:
new 8a6385238 TIKA-4388 -- multithreaded test for mimetypes.detect then parse
8a6385238 is described below
commit 8a63852386f6a70021c77e36521daeb5106c13c0
Author: tallison <[email protected]>
AuthorDate: Wed Feb 26 10:41:19 2025 -0500
TIKA-4388 -- multithreaded test for mimetypes.detect then parse
---
.../java/org/apache/tika/config/TikaConfig.java | 2 +-
.../java/org/apache/tika/utils/XMLReaderUtils.java | 2 +-
.../tika/mime/TestMimeTypesMultithreaded.java | 29 +++++++++++++++++++---
3 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 63c72bfef..03ddeedd9 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -93,7 +93,7 @@ public class TikaConfig {
public static String MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME =
"maxJsonStringFieldLength";
//use this to look for unneeded instantiations of TikaConfig
- protected static final AtomicInteger TIMES_INSTANTIATED = new
AtomicInteger();
+ public static final AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
private static final Logger LOG =
LoggerFactory.getLogger(TikaConfig.class);
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index efc9c019b..58239051d 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -652,7 +652,7 @@ public class XMLReaderUtils implements Serializable {
.readLock()
.lock();
try {
- parser = SAX_PARSERS.poll();
+ parser = SAX_PARSERS.poll();//100, TimeUnit.MILLISECONDS);
} finally {
SAX_POOL_LOCK
.readLock()
diff --git a/tika-core/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
similarity index 75%
rename from tika-core/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
index 4add116d2..a75f707d7 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
@@ -1,6 +1,7 @@
package org.apache.tika.mime;
import java.io.File;
+import java.io.InputStream;
import java.io.StringWriter;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
@@ -9,10 +10,12 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.Test;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -57,7 +60,7 @@ public class TestMimeTypesMultithreaded {
}
long elapsed = System.currentTimeMillis() - start;
System.out.println("total files=" + processed + ", elapsed=" + elapsed
+ "ms");
-
+ System.out.println("TikaConfig instantiated: " +
TikaConfig.TIMES_INSTANTIATED);
}
private ArrayBlockingQueue<File> loadQueue(File dir) {
@@ -70,7 +73,9 @@ public class TestMimeTypesMultithreaded {
return queue;
}
- private class MyWorker implements Callable<Integer> {
+ private static class MyWorker implements Callable<Integer> {
+ static final AtomicInteger COUNTER = new AtomicInteger();
+ static final long STARTED = System.currentTimeMillis();
private final ArrayBlockingQueue<File> files;
private final MimeTypes mimeTypes;// =
TikaConfig.getDefaultConfig().getMimeRepository();
private final Parser parser;// = new AutoDetectParser();
@@ -83,20 +88,36 @@ public class TestMimeTypesMultithreaded {
@Override
public Integer call() throws Exception {
int counter = 0;
+ Detector detector = TikaConfig.getDefaultConfig()
+ .getDetector();
while (true) {
File f = files.poll(1, TimeUnit.SECONDS);
if (f == STOP_NOW) {
files.offer(STOP_NOW);
return counter;
}
- MimeType mimeType = mimeTypes.getMimeType(f);
- if ("text/html".equals(mimeType.toString())) {
+
+ //pick your detection method
+ String mimeString = "";
+// mimeString = mimeTypes.getMimeType(f).toString();
+
+ try (InputStream tis = TikaInputStream.get(f)) {
+ mimeString = mimeTypes.detect(tis, new
Metadata()).toString();
+ }
+
+ if ("text/html".equals(mimeString)) {
StringWriter stringWriter = new StringWriter();
try (TikaInputStream tis = TikaInputStream.get(f)) {
parser.parse(tis, new
ToTextContentHandler(stringWriter), new Metadata(), new ParseContext());
} catch (ZeroByteFileException e) {
//swallow
}
+ int cnt = COUNTER.incrementAndGet();
+ if (cnt % 100 == 0) {
+ long elapsed = System.currentTimeMillis() - STARTED;
+ double perMillis = (double)cnt/(double)elapsed;
+ System.out.println("processed " + cnt + " files in " +
elapsed + "ms, " + perMillis + " per ms");
+ }
}
//System.out.println(mimeType);
counter++;