ignite-5125 Do not limit number of hanging objects dumps, instead increase time 
between dumps.


Project: http://git-wip-us.apache.org/repos/asf/ignite/repo
Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/808ad9fe
Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/808ad9fe
Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/808ad9fe

Branch: refs/heads/ignite-5414
Commit: 808ad9fe4b339dbe86ae96c1915c5aa5ea0521c7
Parents: ec5c3a2
Author: Evgeny Stanilovskiy <estanilovs...@gridgain.com>
Authored: Tue Jun 6 17:58:44 2017 +0300
Committer: sboikov <sboi...@gridgain.com>
Committed: Tue Jun 6 17:58:44 2017 +0300

----------------------------------------------------------------------
 .../apache/ignite/IgniteSystemProperties.java   |   3 +
 .../GridCachePartitionExchangeManager.java      | 129 ++++++++++---------
 .../dht/GridDhtPartitionTopologyImpl.java       |   6 +-
 .../GridDhtPartitionsExchangeFuture.java        |  56 +++++---
 4 files changed, 117 insertions(+), 77 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java
----------------------------------------------------------------------
diff --git 
a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java 
b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java
index ea4b690..84f3732 100644
--- a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java
+++ b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java
@@ -451,6 +451,9 @@ public final class IgniteSystemProperties {
     /** Cache operations that take more time than value of this property will 
be output to log. Set to {@code 0} to disable. */
     public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT = 
"IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT";
 
+    /** Upper time limit between long running/hanging operations debug dumps. 
*/
+    public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT = 
"IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT";
+
     /** JDBC driver cursor remove delay. */
     public static final String IGNITE_JDBC_DRIVER_CURSOR_REMOVE_DELAY = 
"IGNITE_JDBC_DRIVER_CURSOR_RMV_DELAY";
 

http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java
----------------------------------------------------------------------
diff --git 
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java
 
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java
index 2d1aa05..2be4d0f 100644
--- 
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java
+++ 
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java
@@ -110,6 +110,7 @@ import static 
org.apache.ignite.events.EventType.EVT_NODE_LEFT;
 import static org.apache.ignite.internal.GridTopic.TOPIC_CACHE;
 import static 
org.apache.ignite.internal.events.DiscoveryCustomEvent.EVT_DISCOVERY_CUSTOM_EVT;
 import static 
org.apache.ignite.internal.managers.communication.GridIoPolicy.SYSTEM_POOL;
+import static 
org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.*;
 import static 
org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPreloader.DFLT_PRELOAD_RESEND_TIMEOUT;
 
 /**
@@ -175,7 +176,10 @@ public class GridCachePartitionExchangeManager<K, V> 
extends GridCacheSharedMana
     private volatile IgniteCheckedException stopErr;
 
     /** */
-    private int longRunningOpsDumpCnt;
+    private long nextLongRunningOpsDumpTime;
+
+    /** */
+    private int longRunningOpsDumpStep;
 
     /** */
     private DateFormat dateFormat = new SimpleDateFormat("HH:mm:ss.SSS");
@@ -1405,84 +1409,83 @@ public class GridCachePartitionExchangeManager<K, V> 
extends GridCacheSharedMana
 
     /**
      * @param timeout Operation timeout.
+     * @return {@code True} if found long running operations.
      */
-    public void dumpLongRunningOperations(long timeout) {
-        try {
-            GridDhtPartitionsExchangeFuture lastFut = lastInitializedFut;
+    private boolean dumpLongRunningOperations0(long timeout) {
+        long curTime = U.currentTimeMillis();
 
-            // If exchange is in progress it will dump all hanging operations 
if any.
-            if (lastFut != null && !lastFut.isDone())
-                return;
-
-            long curTime = U.currentTimeMillis();
+        boolean found = false;
 
-            boolean found = false;
+        IgniteTxManager tm = cctx.tm();
 
-            IgniteTxManager tm = cctx.tm();
+        GridCacheMvccManager mvcc = cctx.mvcc();
 
-            if (tm != null) {
-                for (IgniteInternalTx tx : tm.activeTransactions()) {
-                    if (curTime - tx.startTime() > timeout) {
-                        found = true;
+        if (tm != null) {
+            for (IgniteInternalTx tx : tm.activeTransactions()) {
+                if (curTime - tx.startTime() > timeout) {
+                    found = true;
 
-                        if (longRunningOpsDumpCnt < 
GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
-                            U.warn(log, "Found long running transaction 
[startTime=" + formatTime(tx.startTime()) +
-                                ", curTime=" + formatTime(curTime) + ", tx=" + 
tx + ']');
-                        }
-                        else
-                            break;
-                    }
+                    U.warn(log, "Found long running transaction [startTime=" + 
formatTime(tx.startTime()) +
+                        ", curTime=" + formatTime(curTime) + ", tx=" + tx + 
']');
                 }
             }
+        }
 
-            GridCacheMvccManager mvcc = cctx.mvcc();
-
-            if (mvcc != null) {
-                for (GridCacheFuture<?> fut : mvcc.activeFutures()) {
-                    if (curTime - fut.startTime() > timeout) {
-                        found = true;
+        if (mvcc != null) {
+            for (GridCacheFuture<?> fut : mvcc.activeFutures()) {
+                if (curTime - fut.startTime() > timeout) {
+                    found = true;
 
-                        if (longRunningOpsDumpCnt < 
GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
-                            U.warn(log, "Found long running cache future 
[startTime=" + formatTime(fut.startTime()) +
-                                ", curTime=" + formatTime(curTime) + ", fut=" 
+ fut + ']');
-                        }
-                        else
-                            break;
-                    }
+                    U.warn(log, "Found long running cache future [startTime=" 
+ formatTime(fut.startTime()) +
+                        ", curTime=" + formatTime(curTime) + ", fut=" + fut + 
']');
                 }
+            }
 
-                for (GridCacheFuture<?> fut : mvcc.atomicFutures()) {
-                    if (curTime - fut.startTime() > timeout) {
-                        found = true;
+            for (GridCacheFuture<?> fut : mvcc.atomicFutures()) {
+                if (curTime - fut.startTime() > timeout) {
+                    found = true;
 
-                        if (longRunningOpsDumpCnt < 
GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
-                            U.warn(log, "Found long running cache future 
[startTime=" + formatTime(fut.startTime()) +
-                                ", curTime=" + formatTime(curTime) + ", fut=" 
+ fut + ']');
-                        }
-                        else
-                            break;
-                    }
+                    U.warn(log, "Found long running cache future [startTime=" 
+ formatTime(fut.startTime()) +
+                        ", curTime=" + formatTime(curTime) + ", fut=" + fut + 
']');
                 }
             }
+        }
 
-            if (found) {
-                if (longRunningOpsDumpCnt < 
GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
-                    longRunningOpsDumpCnt++;
+        return found;
+    }
 
-                    if 
(IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, 
false)) {
-                        U.warn(log, "Found long running cache operations, dump 
threads.");
+    /**
+     * @param timeout Operation timeout.
+     */
+    public void dumpLongRunningOperations(long timeout) {
+        try {
+            GridDhtPartitionsExchangeFuture lastFut = lastInitializedFut;
 
-                        U.dumpThreads(log);
-                    }
+            // If exchange is in progress it will dump all hanging operations 
if any.
+            if (lastFut != null && !lastFut.isDone())
+                return;
 
-                    U.warn(log, "Found long running cache operations, dump IO 
statistics.");
+            if (U.currentTimeMillis() < nextLongRunningOpsDumpTime)
+                return;
 
-                    // Dump IO manager statistics.
-                    cctx.gridIO().dumpStats();
+            if (dumpLongRunningOperations0(timeout)) {
+                nextLongRunningOpsDumpTime = U.currentTimeMillis() + 
nextDumpTimeout(longRunningOpsDumpStep++, timeout);
+
+                if 
(IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, 
false)) {
+                    U.warn(log, "Found long running cache operations, dump 
threads.");
+
+                    U.dumpThreads(log);
                 }
+
+                U.warn(log, "Found long running cache operations, dump IO 
statistics.");
+
+                // Dump IO manager statistics.
+                cctx.gridIO().dumpStats();
+            }
+            else {
+                nextLongRunningOpsDumpTime = 0;
+                longRunningOpsDumpStep = 0;
             }
-            else
-                longRunningOpsDumpCnt = 0;
         }
         catch (Exception e) {
             U.error(log, "Failed to dump debug information: " + e, e);
@@ -1774,11 +1777,15 @@ public class GridCachePartitionExchangeManager<K, V> 
extends GridCacheSharedMana
 
                             exchFut.init();
 
-                            int dumpedObjects = 0;
+                            int dumpCnt = 0;
+
+                            final long futTimeout = 2 * 
cctx.gridConfig().getNetworkTimeout();
+
+                            long nextDumpTime = 0;
 
                             while (true) {
                                 try {
-                                    exchFut.get(2 * 
cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS);
+                                    exchFut.get(futTimeout, 
TimeUnit.MILLISECONDS);
 
                                     break;
                                 }
@@ -1788,7 +1795,7 @@ public class GridCachePartitionExchangeManager<K, V> 
extends GridCacheSharedMana
                                         ", node=" + cctx.localNodeId() + "]. " 
+
                                         "Dumping pending objects that might be 
the cause: ");
 
-                                    if (dumpedObjects < 
GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
+                                    if (nextDumpTime <= U.currentTimeMillis()) 
{
                                         try {
                                             
dumpDebugInfo(exchFut.topologyVersion());
                                         }
@@ -1799,7 +1806,7 @@ public class GridCachePartitionExchangeManager<K, V> 
extends GridCacheSharedMana
                                         if 
(IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, 
false))
                                             U.dumpThreads(log);
 
-                                        dumpedObjects++;
+                                        nextDumpTime = U.currentTimeMillis() + 
nextDumpTimeout(dumpCnt++, futTimeout);
                                     }
                                 }
                                 catch (Exception e) {

http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java
----------------------------------------------------------------------
diff --git 
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java
 
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java
index 248d44e..89554f3 100644
--- 
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java
+++ 
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java
@@ -80,6 +80,10 @@ public class GridDhtPartitionTopologyImpl implements 
GridDhtPartitionTopology {
     private static final boolean FULL_MAP_DEBUG = false;
 
     /** */
+    private static final int DUMP_PENDING_OBJECTS_THRESHOLD =
+        
IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD,
 10);
+
+    /** */
     private static final Long ZERO = 0L;
 
     /** */
@@ -234,7 +238,7 @@ public class GridDhtPartitionTopologyImpl implements 
GridDhtPartitionTopology {
                             break;
                         }
                         catch (IgniteFutureTimeoutCheckedException ignored) {
-                            if (dumpCnt++ < 
GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) {
+                            if (dumpCnt++ < DUMP_PENDING_OBJECTS_THRESHOLD) {
                                 U.warn(log, "Failed to wait for partition 
eviction [" +
                                     "topVer=" + topVer +
                                     ", group=" + grp.cacheOrGroupName() +

http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java
----------------------------------------------------------------------
diff --git 
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java
 
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java
index 0c5cdd9..26406c6 100644
--- 
a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java
+++ 
b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java
@@ -35,7 +35,6 @@ import java.util.concurrent.atomic.AtomicReference;
 import java.util.concurrent.locks.ReadWriteLock;
 import org.apache.ignite.IgniteCheckedException;
 import org.apache.ignite.IgniteLogger;
-import org.apache.ignite.IgniteSystemProperties;
 import org.apache.ignite.cache.PartitionLossPolicy;
 import org.apache.ignite.cluster.ClusterNode;
 import org.apache.ignite.events.CacheEvent;
@@ -90,7 +89,10 @@ import org.apache.ignite.lang.IgniteRunnable;
 import org.jetbrains.annotations.Nullable;
 import org.jsr166.ConcurrentHashMap8;
 
+import static 
org.apache.ignite.IgniteSystemProperties.IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT;
 import static 
org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT;
+import static org.apache.ignite.IgniteSystemProperties.getBoolean;
+import static org.apache.ignite.IgniteSystemProperties.getLong;
 import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_ALL;
 import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_SAFE;
 import static org.apache.ignite.cache.PartitionLossPolicy.READ_WRITE_ALL;
@@ -107,10 +109,6 @@ import static 
org.apache.ignite.internal.managers.communication.GridIoPolicy.SYS
 @SuppressWarnings({"TypeMayBeWeakened", "unchecked"})
 public class GridDhtPartitionsExchangeFuture extends 
GridFutureAdapter<AffinityTopologyVersion>
     implements Comparable<GridDhtPartitionsExchangeFuture>, 
GridDhtTopologyFuture, CachePartitionExchangeWorkerTask {
-    /** */
-    public static final int DUMP_PENDING_OBJECTS_THRESHOLD =
-        
IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD,
 10);
-
     /** Dummy flag. */
     private final boolean dummy;
 
@@ -880,20 +878,24 @@ public class GridDhtPartitionsExchangeFuture extends 
GridFutureAdapter<AffinityT
         if (log.isDebugEnabled())
             log.debug("Before waiting for partition release future: " + this);
 
-        int dumpedObjects = 0;
+        int dumpCnt = 0;
+
+        long nextDumpTime = 0;
+
+        long futTimeout = 2 * cctx.gridConfig().getNetworkTimeout();
 
         while (true) {
             try {
-                partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), 
TimeUnit.MILLISECONDS);
+                partReleaseFut.get(futTimeout, TimeUnit.MILLISECONDS);
 
                 break;
             }
             catch (IgniteFutureTimeoutCheckedException ignored) {
                 // Print pending transactions and locks that might have led to 
hang.
-                if (dumpedObjects < DUMP_PENDING_OBJECTS_THRESHOLD) {
+                if (nextDumpTime <= U.currentTimeMillis()) {
                     dumpPendingObjects();
 
-                    dumpedObjects++;
+                    nextDumpTime = U.currentTimeMillis() + 
nextDumpTimeout(dumpCnt++, futTimeout);
                 }
             }
         }
@@ -903,16 +905,17 @@ public class GridDhtPartitionsExchangeFuture extends 
GridFutureAdapter<AffinityT
 
         IgniteInternalFuture<?> locksFut = 
cctx.mvcc().finishLocks(exchId.topologyVersion());
 
-        dumpedObjects = 0;
+        nextDumpTime = 0;
+        dumpCnt = 0;
 
         while (true) {
             try {
-                locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), 
TimeUnit.MILLISECONDS);
+                locksFut.get(futTimeout, TimeUnit.MILLISECONDS);
 
                 break;
             }
             catch (IgniteFutureTimeoutCheckedException ignored) {
-                if (dumpedObjects < DUMP_PENDING_OBJECTS_THRESHOLD) {
+                if (nextDumpTime <= U.currentTimeMillis()) {
                     U.warn(log, "Failed to wait for locks release future. " +
                         "Dumping pending objects that might be the cause: " + 
cctx.localNodeId());
 
@@ -930,9 +933,9 @@ public class GridDhtPartitionsExchangeFuture extends 
GridFutureAdapter<AffinityT
                     for (Map.Entry<IgniteTxKey, 
Collection<GridCacheMvccCandidate>> e : locks.entrySet())
                         U.warn(log, "Awaited locked entry [key=" + e.getKey() 
+ ", mvcc=" + e.getValue() + ']');
 
-                    dumpedObjects++;
+                    nextDumpTime = U.currentTimeMillis() + 
nextDumpTimeout(dumpCnt++, futTimeout);
 
-                    if 
(IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, 
false))
+                    if (getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, 
false))
                         U.dumpThreads(log);
                 }
             }
@@ -1026,7 +1029,7 @@ public class GridDhtPartitionsExchangeFuture extends 
GridFutureAdapter<AffinityT
             U.error(log, "Failed to dump debug information: " + e, e);
         }
 
-        if 
(IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, 
false))
+        if (getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false))
             U.dumpThreads(log);
     }
 
@@ -2195,4 +2198,27 @@ public class GridDhtPartitionsExchangeFuture extends 
GridFutureAdapter<AffinityT
             return S.toString(CounterWithNodes.class, this);
         }
     }
+
+    /**
+     * @param step Exponent coefficient.
+     * @param timeout Base timeout.
+     * @return Time to wait before next debug dump.
+     */
+    public static long nextDumpTimeout(int step, long timeout) {
+        long limit = getLong(IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT, 30 * 
60_000);
+
+        if (limit <= 0)
+            limit = 30 * 60_000;
+
+        assert step >= 0 : step;
+
+        long dumpFactor = Math.round(Math.pow(2, step));
+
+        long nextTimeout = timeout * dumpFactor;
+
+        if (nextTimeout <= 0)
+            return limit;
+
+        return nextTimeout <= limit ? nextTimeout : limit;
+    }
 }

Reply via email to