This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new c0f1a8b3b improve robustness of XMLReaderUtils
c0f1a8b3b is described below

commit c0f1a8b3b9d079b66b24e2e5d93a23eaf5d34ddf
Author: tallison <[email protected]>
AuthorDate: Fri Oct 25 11:43:24 2024 -0400

    improve robustness of XMLReaderUtils
    
    (cherry picked from commit 6305da41756e59dcf19e92acf70657624581cfe3)
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 147 ++++++++++-----------
 1 file changed, 70 insertions(+), 77 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 9db9d291a..7a1b92924 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -115,8 +115,10 @@ public class XMLReaderUtils implements Serializable {
     };
     private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = 
"jdk.xml.entityExpansionLimit";
     //TODO: figure out if the rw lock is any better than a simple lock
-    private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new 
ReentrantReadWriteLock();
-    private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new 
ReentrantReadWriteLock();
+    //these lock the pool arrayblocking queues so that there isn't a race 
condition
+    //of trying to acquire a parser while the pool is being resized
+    private static final ReentrantReadWriteLock SAX_POOL_LOCK = new 
ReentrantReadWriteLock();
+    private static final ReentrantReadWriteLock DOM_POOL_LOCK = new 
ReentrantReadWriteLock();
     private static final AtomicInteger POOL_GENERATION = new AtomicInteger();
     private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER =
             (publicId, systemId) -> new InputSource(new StringReader(""));
@@ -389,7 +391,11 @@ public class XMLReaderUtils implements Serializable {
         PoolDOMBuilder poolBuilder = null;
         if (builder == null) {
             poolBuilder = acquireDOMBuilder();
-            builder = poolBuilder.getDocumentBuilder();
+            if (poolBuilder != null) {
+                builder = poolBuilder.getDocumentBuilder();
+            } else {
+                builder = getDocumentBuilder();
+            }
         }
 
         try {
@@ -507,7 +513,11 @@ public class XMLReaderUtils implements Serializable {
         PoolSAXParser poolSAXParser = null;
         if (saxParser == null) {
             poolSAXParser = acquireSAXParser();
-            saxParser = poolSAXParser.getSAXParser();
+            if (poolSAXParser != null) {
+                saxParser = poolSAXParser.getSAXParser();
+            } else {
+                saxParser = getSAXParser();
+            }
         }
         try {
             saxParser.parse(is, new OfflineContentHandler(contentHandler));
@@ -539,7 +549,11 @@ public class XMLReaderUtils implements Serializable {
         PoolSAXParser poolSAXParser = null;
         if (saxParser == null) {
             poolSAXParser = acquireSAXParser();
-            saxParser = poolSAXParser.getSAXParser();
+            if (poolSAXParser != null) {
+                saxParser = poolSAXParser.getSAXParser();
+            } else {
+                saxParser = getSAXParser();
+            }
         }
         try {
             saxParser.parse(new InputSource(reader), new 
OfflineContentHandler(contentHandler));
@@ -551,48 +565,32 @@ public class XMLReaderUtils implements Serializable {
     }
 
     /**
-     * Acquire a SAXParser from the pool.  Make sure to
+     * Acquire a DOMBuilder from the pool.  Make sure to
      * {@link #releaseDOMBuilder(PoolDOMBuilder)} in
      * a <code>finally</code> block every time you call this.
      *
-     * @return a DocumentBuilder
+     * @return a DocumentBuilder or null if no DOMBuilders are available
      * @throws TikaException
      */
     private static PoolDOMBuilder acquireDOMBuilder() throws TikaException {
-        int waiting = 0;
-        long lastWarn = -1;
-        while (true) {
-            PoolDOMBuilder builder = null;
-            DOM_READ_WRITE_LOCK.readLock().lock();
-            try {
-                builder = DOM_BUILDERS.poll(100, TimeUnit.MILLISECONDS);
-            } catch (InterruptedException e) {
-                throw new TikaException("interrupted while waiting for 
DOMBuilder", e);
-            } finally {
-                DOM_READ_WRITE_LOCK.readLock().unlock();
-            }
-            if (builder != null) {
-                return builder;
-            }
-            if (lastWarn < 0 || System.currentTimeMillis() - lastWarn > 1000) {
-                //avoid spamming logs
-                LOG.warn("Contention waiting for a DOMParser. " +
-                        "Consider increasing the XMLReaderUtils.POOL_SIZE");
-                lastWarn = System.currentTimeMillis();
-            }
-            waiting++;
 
-            if (waiting > 3000) {
-                //freshen the pool.  Something went very wrong...
-                setPoolSize(POOL_SIZE);
-                //better to get an exception than have permahang by a bug in 
one of our parsers
-                throw new TikaException("Waited more than 5 minutes for a 
DocumentBuilder; " +
-                        "This could indicate that a parser has not correctly 
released its " +
-                        "DocumentBuilder. " +
-                        "Please report this to the Tika team: 
[email protected]");
+        PoolDOMBuilder builder = null;
+        DOM_POOL_LOCK
+                .readLock()
+                .lock();
+        try {
+            builder = DOM_BUILDERS.poll();
+        } finally {
+            DOM_POOL_LOCK
+                    .readLock()
+                    .unlock();
+        }
+        if (builder == null) {
+            LOG.warn("Contention waiting for a DOMBuilder. " +
+                    "Consider increasing the XMLReaderUtils.POOL_SIZE");
 
-            }
         }
+        return builder;
     }
 
     /**
@@ -609,7 +607,8 @@ public class XMLReaderUtils implements Serializable {
         } catch (UnsupportedOperationException e) {
             //ignore
         }
-        DOM_READ_WRITE_LOCK.readLock().lock();
+        DOM_POOL_LOCK
+                .readLock().lock();
         try {
             //if there are extra parsers (e.g. after a reset of the pool to a 
smaller size),
             // this parser will not be added and will then be gc'd
@@ -621,7 +620,8 @@ public class XMLReaderUtils implements Serializable {
                                 "'acquire' than to 'release'");
             }
         } finally {
-            DOM_READ_WRITE_LOCK.readLock().unlock();
+            DOM_POOL_LOCK
+                    .readLock().unlock();
         }
     }
 
@@ -630,42 +630,29 @@ public class XMLReaderUtils implements Serializable {
      * {@link #releaseParser(PoolSAXParser)} in
      * a <code>finally</code> block every time you call this.
      *
-     * @return a SAXParser
+     * @return a SAXParser or null if a parser is not available
      * @throws TikaException
      */
     private static PoolSAXParser acquireSAXParser() throws TikaException {
-        int waiting = 0;
-        long lastWarn = -1;
-        while (true) {
-            PoolSAXParser parser = null;
-            SAX_READ_WRITE_LOCK.readLock().lock();
-            try {
-                parser = SAX_PARSERS.poll(100, TimeUnit.MILLISECONDS);
-            } catch (InterruptedException e) {
-                throw new TikaException("interrupted while waiting for 
SAXParser", e);
-            } finally {
-                SAX_READ_WRITE_LOCK.readLock().unlock();
-            }
-            if (parser != null) {
-                return parser;
-            }
-            if (lastWarn < 0 || System.currentTimeMillis() - lastWarn > 1000) {
-                //avoid spamming logs
-                LOG.warn("Contention waiting for a SAXParser. " +
-                        "Consider increasing the XMLReaderUtils.POOL_SIZE");
-                lastWarn = System.currentTimeMillis();
-            }
-            waiting++;
-            if (waiting > 3000) {
-                //freshen the pool.  Something went very wrong...
-                setPoolSize(POOL_SIZE);
-                //better to get an exception than have permahang by a bug in 
one of our parsers
-                throw new TikaException("Waited more than 5 minutes for a 
SAXParser; " +
-                        "This could indicate that a parser has not correctly 
released its " +
-                        "SAXParser. Please report this to the Tika team: 
[email protected]");
+        PoolSAXParser parser = null;
 
-            }
+        //this locks around the pool so that there's
+        //no race condition with it being resized
+        SAX_POOL_LOCK
+                .readLock()
+                .lock();
+        try {
+            parser = SAX_PARSERS.poll();
+        } finally {
+            SAX_POOL_LOCK
+                    .readLock()
+                    .unlock();
+        }
+        if (parser == null) {
+            LOG.warn("Contention waiting for a SAXParser. " +
+                    "Consider increasing the XMLReaderUtils.POOL_SIZE");
         }
+        return parser;
     }
 
     /**
@@ -684,7 +671,8 @@ public class XMLReaderUtils implements Serializable {
         if (parser.getGeneration() != POOL_GENERATION.get()) {
             return;
         }
-        SAX_READ_WRITE_LOCK.readLock().lock();
+        SAX_POOL_LOCK
+                .readLock().lock();
         try {
             //if there are extra parsers (e.g. after a reset of the pool to a 
smaller size),
             // this parser will not be added and will then be gc'd
@@ -696,7 +684,8 @@ public class XMLReaderUtils implements Serializable {
                                 "than to 'release'");
             }
         } finally {
-            SAX_READ_WRITE_LOCK.readLock().unlock();
+            SAX_POOL_LOCK
+                    .readLock().unlock();
         }
     }
 
@@ -822,7 +811,8 @@ public class XMLReaderUtils implements Serializable {
         //the read locking in case one thread resizes the pool when the
         //parsers have already started.  We could have an NPE on SAX_PARSERS
         //if we didn't lock.
-        SAX_READ_WRITE_LOCK.writeLock().lock();
+        SAX_POOL_LOCK
+                .writeLock().lock();
         try {
             //free up any resources before emptying SAX_PARSERS
             for (PoolSAXParser parser : SAX_PARSERS) {
@@ -840,10 +830,12 @@ public class XMLReaderUtils implements Serializable {
                 }
             }
         } finally {
-            SAX_READ_WRITE_LOCK.writeLock().unlock();
+            SAX_POOL_LOCK
+                    .writeLock().unlock();
         }
 
-        DOM_READ_WRITE_LOCK.writeLock().lock();
+        DOM_POOL_LOCK
+                .writeLock().lock();
         try {
             DOM_BUILDERS.clear();
             DOM_BUILDERS = new ArrayBlockingQueue<>(poolSize);
@@ -851,7 +843,8 @@ public class XMLReaderUtils implements Serializable {
                 DOM_BUILDERS.offer(new PoolDOMBuilder(POOL_GENERATION.get(), 
getDocumentBuilder()));
             }
         } finally {
-            DOM_READ_WRITE_LOCK.writeLock().unlock();
+            DOM_POOL_LOCK
+                    .writeLock().unlock();
         }
         POOL_SIZE = poolSize;
     }

Reply via email to