This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4388
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4388 by this push:
     new 8a6385238 TIKA-4388 -- multithreaded test for mimetypes.detect then parse
8a6385238 is described below

commit 8a63852386f6a70021c77e36521daeb5106c13c0
Author: tallison <[email protected]>
AuthorDate: Wed Feb 26 10:41:19 2025 -0500

    TIKA-4388 -- multithreaded test for mimetypes.detect then parse
---
 .../java/org/apache/tika/config/TikaConfig.java    |  2 +-
 .../java/org/apache/tika/utils/XMLReaderUtils.java |  2 +-
 .../tika/mime/TestMimeTypesMultithreaded.java      | 29 +++++++++++++++++++---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 63c72bfef..03ddeedd9 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -93,7 +93,7 @@ public class TikaConfig {
     public static String MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME = 
"maxJsonStringFieldLength";
 
     //use this to look for unneeded instantiations of TikaConfig
-    protected static final AtomicInteger TIMES_INSTANTIATED = new 
AtomicInteger();
+    public static final AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
 
     private static final Logger LOG = 
LoggerFactory.getLogger(TikaConfig.class);
 
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index efc9c019b..58239051d 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -652,7 +652,7 @@ public class XMLReaderUtils implements Serializable {
                 .readLock()
                 .lock();
         try {
-            parser = SAX_PARSERS.poll();
+            parser = SAX_PARSERS.poll();//100, TimeUnit.MILLISECONDS);
         } finally {
             SAX_POOL_LOCK
                     .readLock()
diff --git a/tika-core/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
similarity index 75%
rename from tika-core/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
index 4add116d2..a75f707d7 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/mime/TestMimeTypesMultithreaded.java
@@ -1,6 +1,7 @@
 package org.apache.tika.mime;
 
 import java.io.File;
+import java.io.InputStream;
 import java.io.StringWriter;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.Callable;
@@ -9,10 +10,12 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -57,7 +60,7 @@ public class TestMimeTypesMultithreaded {
         }
         long elapsed = System.currentTimeMillis() - start;
         System.out.println("total files=" + processed + ", elapsed=" + elapsed 
+ "ms");
-
+        System.out.println("TikaConfig instantiated: " + 
TikaConfig.TIMES_INSTANTIATED);
     }
 
     private ArrayBlockingQueue<File> loadQueue(File dir) {
@@ -70,7 +73,9 @@ public class TestMimeTypesMultithreaded {
         return queue;
     }
 
-    private class MyWorker implements Callable<Integer> {
+    private static class MyWorker implements Callable<Integer> {
+        static final AtomicInteger COUNTER = new AtomicInteger();
+        static final long STARTED = System.currentTimeMillis();
         private final ArrayBlockingQueue<File> files;
         private final MimeTypes mimeTypes;// = 
TikaConfig.getDefaultConfig().getMimeRepository();
         private final Parser parser;// = new AutoDetectParser();
@@ -83,20 +88,36 @@ public class TestMimeTypesMultithreaded {
         @Override
         public Integer call() throws Exception {
             int counter = 0;
+            Detector detector = TikaConfig.getDefaultConfig()
+                                          .getDetector();
             while (true) {
                 File f = files.poll(1, TimeUnit.SECONDS);
                 if (f == STOP_NOW) {
                     files.offer(STOP_NOW);
                     return counter;
                 }
-                MimeType mimeType = mimeTypes.getMimeType(f);
-                if ("text/html".equals(mimeType.toString())) {
+
+                //pick your detection method
+                String mimeString = "";
+//                mimeString = mimeTypes.getMimeType(f).toString();
+
+                try (InputStream tis = TikaInputStream.get(f)) {
+                    mimeString = mimeTypes.detect(tis, new 
Metadata()).toString();
+                }
+
+                if ("text/html".equals(mimeString)) {
                     StringWriter stringWriter = new StringWriter();
                     try (TikaInputStream tis = TikaInputStream.get(f)) {
                         parser.parse(tis, new 
ToTextContentHandler(stringWriter), new Metadata(), new ParseContext());
                     } catch (ZeroByteFileException e) {
                         //swallow
                     }
+                    int cnt = COUNTER.incrementAndGet();
+                    if (cnt % 100 == 0) {
+                        long elapsed = System.currentTimeMillis() - STARTED;
+                        double perMillis = (double)cnt/(double)elapsed;
+                        System.out.println("processed " + cnt + " files in " + 
elapsed + "ms, " + perMillis + " per ms");
+                    }
                 }
                 //System.out.println(mimeType);
                 counter++;

Reply via email to