ignite-5125 Do not limit number of hanging objects dumps, instead increase time between dumps.
Project: http://git-wip-us.apache.org/repos/asf/ignite/repo Commit: http://git-wip-us.apache.org/repos/asf/ignite/commit/808ad9fe Tree: http://git-wip-us.apache.org/repos/asf/ignite/tree/808ad9fe Diff: http://git-wip-us.apache.org/repos/asf/ignite/diff/808ad9fe Branch: refs/heads/ignite-5414 Commit: 808ad9fe4b339dbe86ae96c1915c5aa5ea0521c7 Parents: ec5c3a2 Author: Evgeny Stanilovskiy <estanilovs...@gridgain.com> Authored: Tue Jun 6 17:58:44 2017 +0300 Committer: sboikov <sboi...@gridgain.com> Committed: Tue Jun 6 17:58:44 2017 +0300 ---------------------------------------------------------------------- .../apache/ignite/IgniteSystemProperties.java | 3 + .../GridCachePartitionExchangeManager.java | 129 ++++++++++--------- .../dht/GridDhtPartitionTopologyImpl.java | 6 +- .../GridDhtPartitionsExchangeFuture.java | 56 +++++--- 4 files changed, 117 insertions(+), 77 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java index ea4b690..84f3732 100644 --- a/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java +++ b/modules/core/src/main/java/org/apache/ignite/IgniteSystemProperties.java @@ -451,6 +451,9 @@ public final class IgniteSystemProperties { /** Cache operations that take more time than value of this property will be output to log. Set to {@code 0} to disable. */ public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT = "IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT"; + /** Upper time limit between long running/hanging operations debug dumps. */ + public static final String IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT = "IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT"; + /** JDBC driver cursor remove delay. */ public static final String IGNITE_JDBC_DRIVER_CURSOR_REMOVE_DELAY = "IGNITE_JDBC_DRIVER_CURSOR_RMV_DELAY"; http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java index 2d1aa05..2be4d0f 100644 --- a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java +++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/GridCachePartitionExchangeManager.java @@ -110,6 +110,7 @@ import static org.apache.ignite.events.EventType.EVT_NODE_LEFT; import static org.apache.ignite.internal.GridTopic.TOPIC_CACHE; import static org.apache.ignite.internal.events.DiscoveryCustomEvent.EVT_DISCOVERY_CUSTOM_EVT; import static org.apache.ignite.internal.managers.communication.GridIoPolicy.SYSTEM_POOL; +import static org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.*; import static org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPreloader.DFLT_PRELOAD_RESEND_TIMEOUT; /** @@ -175,7 +176,10 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana private volatile IgniteCheckedException stopErr; /** */ - private int longRunningOpsDumpCnt; + private long nextLongRunningOpsDumpTime; + + /** */ + private int longRunningOpsDumpStep; /** */ private DateFormat dateFormat = new SimpleDateFormat("HH:mm:ss.SSS"); @@ -1405,84 +1409,83 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana /** * @param timeout Operation timeout. + * @return {@code True} if found long running operations. */ - public void dumpLongRunningOperations(long timeout) { - try { - GridDhtPartitionsExchangeFuture lastFut = lastInitializedFut; + private boolean dumpLongRunningOperations0(long timeout) { + long curTime = U.currentTimeMillis(); - // If exchange is in progress it will dump all hanging operations if any. - if (lastFut != null && !lastFut.isDone()) - return; - - long curTime = U.currentTimeMillis(); + boolean found = false; - boolean found = false; + IgniteTxManager tm = cctx.tm(); - IgniteTxManager tm = cctx.tm(); + GridCacheMvccManager mvcc = cctx.mvcc(); - if (tm != null) { - for (IgniteInternalTx tx : tm.activeTransactions()) { - if (curTime - tx.startTime() > timeout) { - found = true; + if (tm != null) { + for (IgniteInternalTx tx : tm.activeTransactions()) { + if (curTime - tx.startTime() > timeout) { + found = true; - if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { - U.warn(log, "Found long running transaction [startTime=" + formatTime(tx.startTime()) + - ", curTime=" + formatTime(curTime) + ", tx=" + tx + ']'); - } - else - break; - } + U.warn(log, "Found long running transaction [startTime=" + formatTime(tx.startTime()) + + ", curTime=" + formatTime(curTime) + ", tx=" + tx + ']'); } } + } - GridCacheMvccManager mvcc = cctx.mvcc(); - - if (mvcc != null) { - for (GridCacheFuture<?> fut : mvcc.activeFutures()) { - if (curTime - fut.startTime() > timeout) { - found = true; + if (mvcc != null) { + for (GridCacheFuture<?> fut : mvcc.activeFutures()) { + if (curTime - fut.startTime() > timeout) { + found = true; - if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { - U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) + - ", curTime=" + formatTime(curTime) + ", fut=" + fut + ']'); - } - else - break; - } + U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) + + ", curTime=" + formatTime(curTime) + ", fut=" + fut + ']'); } + } - for (GridCacheFuture<?> fut : mvcc.atomicFutures()) { - if (curTime - fut.startTime() > timeout) { - found = true; + for (GridCacheFuture<?> fut : mvcc.atomicFutures()) { + if (curTime - fut.startTime() > timeout) { + found = true; - if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { - U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) + - ", curTime=" + formatTime(curTime) + ", fut=" + fut + ']'); - } - else - break; - } + U.warn(log, "Found long running cache future [startTime=" + formatTime(fut.startTime()) + + ", curTime=" + formatTime(curTime) + ", fut=" + fut + ']'); } } + } - if (found) { - if (longRunningOpsDumpCnt < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { - longRunningOpsDumpCnt++; + return found; + } - if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) { - U.warn(log, "Found long running cache operations, dump threads."); + /** + * @param timeout Operation timeout. + */ + public void dumpLongRunningOperations(long timeout) { + try { + GridDhtPartitionsExchangeFuture lastFut = lastInitializedFut; - U.dumpThreads(log); - } + // If exchange is in progress it will dump all hanging operations if any. + if (lastFut != null && !lastFut.isDone()) + return; - U.warn(log, "Found long running cache operations, dump IO statistics."); + if (U.currentTimeMillis() < nextLongRunningOpsDumpTime) + return; - // Dump IO manager statistics. - cctx.gridIO().dumpStats(); + if (dumpLongRunningOperations0(timeout)) { + nextLongRunningOpsDumpTime = U.currentTimeMillis() + nextDumpTimeout(longRunningOpsDumpStep++, timeout); + + if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) { + U.warn(log, "Found long running cache operations, dump threads."); + + U.dumpThreads(log); } + + U.warn(log, "Found long running cache operations, dump IO statistics."); + + // Dump IO manager statistics. + cctx.gridIO().dumpStats(); + } + else { + nextLongRunningOpsDumpTime = 0; + longRunningOpsDumpStep = 0; } - else - longRunningOpsDumpCnt = 0; } catch (Exception e) { U.error(log, "Failed to dump debug information: " + e, e); @@ -1774,11 +1777,15 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana exchFut.init(); - int dumpedObjects = 0; + int dumpCnt = 0; + + final long futTimeout = 2 * cctx.gridConfig().getNetworkTimeout(); + + long nextDumpTime = 0; while (true) { try { - exchFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); + exchFut.get(futTimeout, TimeUnit.MILLISECONDS); break; } @@ -1788,7 +1795,7 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana ", node=" + cctx.localNodeId() + "]. " + "Dumping pending objects that might be the cause: "); - if (dumpedObjects < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { + if (nextDumpTime <= U.currentTimeMillis()) { try { dumpDebugInfo(exchFut.topologyVersion()); } @@ -1799,7 +1806,7 @@ public class GridCachePartitionExchangeManager<K, V> extends GridCacheSharedMana if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) U.dumpThreads(log); - dumpedObjects++; + nextDumpTime = U.currentTimeMillis() + nextDumpTimeout(dumpCnt++, futTimeout); } } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java index 248d44e..89554f3 100644 --- a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java +++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/GridDhtPartitionTopologyImpl.java @@ -80,6 +80,10 @@ public class GridDhtPartitionTopologyImpl implements GridDhtPartitionTopology { private static final boolean FULL_MAP_DEBUG = false; /** */ + private static final int DUMP_PENDING_OBJECTS_THRESHOLD = + IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD, 10); + + /** */ private static final Long ZERO = 0L; /** */ @@ -234,7 +238,7 @@ public class GridDhtPartitionTopologyImpl implements GridDhtPartitionTopology { break; } catch (IgniteFutureTimeoutCheckedException ignored) { - if (dumpCnt++ < GridDhtPartitionsExchangeFuture.DUMP_PENDING_OBJECTS_THRESHOLD) { + if (dumpCnt++ < DUMP_PENDING_OBJECTS_THRESHOLD) { U.warn(log, "Failed to wait for partition eviction [" + "topVer=" + topVer + ", group=" + grp.cacheOrGroupName() + http://git-wip-us.apache.org/repos/asf/ignite/blob/808ad9fe/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java ---------------------------------------------------------------------- diff --git a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java index 0c5cdd9..26406c6 100644 --- a/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java +++ b/modules/core/src/main/java/org/apache/ignite/internal/processors/cache/distributed/dht/preloader/GridDhtPartitionsExchangeFuture.java @@ -35,7 +35,6 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReadWriteLock; import org.apache.ignite.IgniteCheckedException; import org.apache.ignite.IgniteLogger; -import org.apache.ignite.IgniteSystemProperties; import org.apache.ignite.cache.PartitionLossPolicy; import org.apache.ignite.cluster.ClusterNode; import org.apache.ignite.events.CacheEvent; @@ -90,7 +89,10 @@ import org.apache.ignite.lang.IgniteRunnable; import org.jetbrains.annotations.Nullable; import org.jsr166.ConcurrentHashMap8; +import static org.apache.ignite.IgniteSystemProperties.IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT; import static org.apache.ignite.IgniteSystemProperties.IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT; +import static org.apache.ignite.IgniteSystemProperties.getBoolean; +import static org.apache.ignite.IgniteSystemProperties.getLong; import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_ALL; import static org.apache.ignite.cache.PartitionLossPolicy.READ_ONLY_SAFE; import static org.apache.ignite.cache.PartitionLossPolicy.READ_WRITE_ALL; @@ -107,10 +109,6 @@ import static org.apache.ignite.internal.managers.communication.GridIoPolicy.SYS @SuppressWarnings({"TypeMayBeWeakened", "unchecked"}) public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityTopologyVersion> implements Comparable<GridDhtPartitionsExchangeFuture>, GridDhtTopologyFuture, CachePartitionExchangeWorkerTask { - /** */ - public static final int DUMP_PENDING_OBJECTS_THRESHOLD = - IgniteSystemProperties.getInteger(IgniteSystemProperties.IGNITE_DUMP_PENDING_OBJECTS_THRESHOLD, 10); - /** Dummy flag. */ private final boolean dummy; @@ -880,20 +878,24 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT if (log.isDebugEnabled()) log.debug("Before waiting for partition release future: " + this); - int dumpedObjects = 0; + int dumpCnt = 0; + + long nextDumpTime = 0; + + long futTimeout = 2 * cctx.gridConfig().getNetworkTimeout(); while (true) { try { - partReleaseFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); + partReleaseFut.get(futTimeout, TimeUnit.MILLISECONDS); break; } catch (IgniteFutureTimeoutCheckedException ignored) { // Print pending transactions and locks that might have led to hang. - if (dumpedObjects < DUMP_PENDING_OBJECTS_THRESHOLD) { + if (nextDumpTime <= U.currentTimeMillis()) { dumpPendingObjects(); - dumpedObjects++; + nextDumpTime = U.currentTimeMillis() + nextDumpTimeout(dumpCnt++, futTimeout); } } } @@ -903,16 +905,17 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT IgniteInternalFuture<?> locksFut = cctx.mvcc().finishLocks(exchId.topologyVersion()); - dumpedObjects = 0; + nextDumpTime = 0; + dumpCnt = 0; while (true) { try { - locksFut.get(2 * cctx.gridConfig().getNetworkTimeout(), TimeUnit.MILLISECONDS); + locksFut.get(futTimeout, TimeUnit.MILLISECONDS); break; } catch (IgniteFutureTimeoutCheckedException ignored) { - if (dumpedObjects < DUMP_PENDING_OBJECTS_THRESHOLD) { + if (nextDumpTime <= U.currentTimeMillis()) { U.warn(log, "Failed to wait for locks release future. " + "Dumping pending objects that might be the cause: " + cctx.localNodeId()); @@ -930,9 +933,9 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT for (Map.Entry<IgniteTxKey, Collection<GridCacheMvccCandidate>> e : locks.entrySet()) U.warn(log, "Awaited locked entry [key=" + e.getKey() + ", mvcc=" + e.getValue() + ']'); - dumpedObjects++; + nextDumpTime = U.currentTimeMillis() + nextDumpTimeout(dumpCnt++, futTimeout); - if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) + if (getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) U.dumpThreads(log); } } @@ -1026,7 +1029,7 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT U.error(log, "Failed to dump debug information: " + e, e); } - if (IgniteSystemProperties.getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) + if (getBoolean(IGNITE_THREAD_DUMP_ON_EXCHANGE_TIMEOUT, false)) U.dumpThreads(log); } @@ -2195,4 +2198,27 @@ public class GridDhtPartitionsExchangeFuture extends GridFutureAdapter<AffinityT return S.toString(CounterWithNodes.class, this); } } + + /** + * @param step Exponent coefficient. + * @param timeout Base timeout. + * @return Time to wait before next debug dump. + */ + public static long nextDumpTimeout(int step, long timeout) { + long limit = getLong(IGNITE_LONG_OPERATIONS_DUMP_TIMEOUT_LIMIT, 30 * 60_000); + + if (limit <= 0) + limit = 30 * 60_000; + + assert step >= 0 : step; + + long dumpFactor = Math.round(Math.pow(2, step)); + + long nextTimeout = timeout * dumpFactor; + + if (nextTimeout <= 0) + return limit; + + return nextTimeout <= limit ? nextTimeout : limit; + } }