Hello, I am looking for some general recommendations concerning configuration in a VM environment. We ran into an issue where our network people were doing some ESX maintenance, and they confirmed that one of our hosts "moved" at the time we detected a segmentation failure. I am including our current configuration along with the errors we captured. We were thinking of changing the segmentation policy, but we are not sure which timeouts, if any, we should change.
Configuration Dump: Starting ignite with following configuration "IgniteConfiguration [gridName=null, pubPoolSize=16, callbackPoolSize=16, sysPoolSize=16, mgmtPoolSize=4, igfsPoolSize=2, utilityCachePoolSize=16, utilityCacheKeepAliveTime=60000, marshCachePoolSize=16, marshCacheKeepAliveTime=60000, p2pPoolSize=2, igniteHome=null, igniteWorkDir=null, mbeanSrv=null, nodeId=null, marsh=null, marshLocJobs=false, daemon=false, p2pEnabled=false, netTimeout=5000, sndRetryDelay=1000, sndRetryCnt=3, clockSyncSamples=8, clockSyncFreq=120000, metricsHistSize=10000, metricsUpdateFreq=2000, metricsExpTime=9223372036854775807, discoSpi=TcpDiscoverySpi [addrRslvr=null, sockTimeout=0, ackTimeout=0, marsh=JdkMarshaller [], reconCnt=10, maxAckTimeout=600000, forceSrvMode=false, clientReconnectDisabled=false], segPlc=STOP, segResolveAttempts=2, waitForSegOnStart=true, allResolversPassReq=true, segChkFreq=10000, commSpi=TcpCommunicationSpi [connectGate=null, srvLsnr=org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi$2@211ff379, locAddr=null, locHost=null, locPort=47100, locPortRange=100, shmemPort=48100, directBuf=true, directSndBuf=false, idleConnTimeout=30000, connTimeout=5000, maxConnTimeout=600000, reconCnt=10, sockSndBuf=32768, sockRcvBuf=32768, msgQueueLimit=1024, slowClientQueueLimit=0, nioSrvr=null, shmemSrv=null, tcpNoDelay=true, ackSndThreshold=16, unackedMsgsBufSize=0, sockWriteTimeout=4000, lsnr=null, boundTcpPort=-1, boundTcpShmemPort=-1, selectorsCnt=2, addrRslvr=null, rcvdMsgsCnt=0, sentMsgsCnt=0, rcvdBytesCnt=0, sentBytesCnt=0, ctxInitLatch=java.util.concurrent.CountDownLatch@1eff331d[Count = 1], stopping=false, metricsLsnr=org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi$3@2111dd2c], evtSpi=null, colSpi=null, deploySpi=null, swapSpaceSpi=null, indexingSpi=null, addrRslvr=null, clientMode=null, rebalanceThreadPoolSize=1, txCfg=org.apache.ignite.configuration.TransactionConfiguration@7f4b764c, cacheSanityCheckEnabled=true, discoStartupDelay=60000, 
deployMode=SHARED, p2pMissedCacheSize=100, locHost=null, timeSrvPortBase=31100, timeSrvPortRange=100, failureDetectionTimeout=10000, metricsLogFreq=60000, hadoopCfg=null, connectorCfg=org.apache.ignite.configuration.ConnectorConfiguration@184bc563, odbcCfg=null, warmupClos=null, atomicCfg=AtomicConfiguration [seqReserveSize=1000, cacheMode=PARTITIONED, backups=0], classLdr=null, sslCtxFactory=null, platformCfg=null, binaryCfg=null, lateAffAssignment=true] Log File: 16:03:57.566 [tcp-disco-msg-worker-#2%null%] WARN org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi - Timed out waiting for message delivery receipt (most probably, the reason is in long GC pauses on remote node; consider tuning GC and increasing 'ackTimeout' configuration property). Will retry to send message with increased timeout. Current timeout: 10000. 16:03:57.568 [tcp-disco-msg-worker-#2%null%] WARN org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi - Failed to send message to next node [msg=TcpDiscoveryStatusCheckMessage [creatorNode=TcpDiscoveryNode [id=c3da99ae-456d-4b52-85e6-c24587fbf08e, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=183, intOrder=102, lastExchangeTime=1487887425492, loc=true, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false], failedNodeId=null, status=0, super=TcpDiscoveryAbstractMessage [sndNodeId=null, id=f9c0ee85a51-c3da99ae-456d-4b52-85e6-c24587fbf08e, verifierNodeId=null, topVer=0, pendingIdx=0, failedNodes=null, isClient=false]], next=TcpDiscoveryNode [id=95e261af-ce55-4843-8d9d-8bf046cc4118, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=117, intOrder=69, lastExchangeTime=1487538461673, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false], errMsg=Failed to send message to next node 
[msg=TcpDiscoveryStatusCheckMessage [creatorNode=TcpDiscoveryNode [id=c3da99ae-456d-4b52-85e6-c24587fbf08e, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=183, intOrder=102, lastExchangeTime=1487887425492, loc=true, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false], failedNodeId=null, status=0, super=TcpDiscoveryAbstractMessage [sndNodeId=null, id=f9c0ee85a51-c3da99ae-456d-4b52-85e6-c24587fbf08e, verifierNodeId=null, topVer=0, pendingIdx=0, failedNodes=null, isClient=false]], next=ClusterNode [id=95e261af-ce55-4843-8d9d-8bf046cc4118, order=117, addr=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], daemon=false]]] 16:03:57.594 [tcp-disco-msg-worker-#2%null%] WARN org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi - Local node has detected failed nodes and started cluster-wide procedure. To speed up failure detection please see 'Failure Detection' section under javadoc for 'TcpDiscoverySpi' 16:03:57.601 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=95e261af-ce55-4843-8d9d-8bf046cc4118, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=117, intOrder=69, lastExchangeTime=1487538461673, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false] 16:03:57.603 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=206, servers=1, clients=19, CPUs=10, heap=86.0GB] 16:04:05.670 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=5b65d779-8317-496c-b8f2-cdc4de33705c, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, 
/0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=50, intOrder=30, lastExchangeTime=1487538461443, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:05.672 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=207, servers=1, clients=18, CPUs=10, heap=85.0GB] 16:04:06.106 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=a9104ef7-da59-4575-acdc-dd98fece82c5, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=141, intOrder=81, lastExchangeTime=1487538461633, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:06.112 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=208, servers=1, clients=17, CPUs=10, heap=81.0GB] 16:04:06.233 [grid-nio-worker-1-#11%null%] WARN org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Communication SPI Session write timed out (consider increasing 'socketWriteTimeout' configuration property) [remoteAddr=/172.22.190.163:42506, writeTimeout=4000] 16:04:06.538 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=9c972234-7c36-40e9-88f2-bc9da61a02ec, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=145, intOrder=83, lastExchangeTime=1487538461633, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:06.540 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=209, servers=1, clients=16, CPUs=10, heap=76.0GB] 16:04:06.832 [disco-event-worker-#18%null%] WARN 
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=1addd9a8-7022-4286-a321-d4a5aef5985b, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=156, intOrder=84, lastExchangeTime=1487538461673, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:06.848 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=210, servers=1, clients=15, CPUs=10, heap=76.0GB] 16:04:06.998 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=6e92ece8-f7bc-482b-bfa6-a25276bed9cd, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=173, intOrder=94, lastExchangeTime=1487538461673, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:07.003 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=211, servers=1, clients=14, CPUs=10, heap=71.0GB] 16:04:07.104 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=b3d132d6-b315-4be3-b9ac-9ef427cef0b7, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=196, intOrder=108, lastExchangeTime=1487818281767, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:07.110 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=212, servers=1, clients=13, CPUs=10, heap=69.0GB] 16:04:07.204 [disco-event-worker-#18%null%] WARN 
org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=1d1cae7c-fa06-49ed-b50a-f39146c8b06b, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=202, intOrder=111, lastExchangeTime=1487818535575, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:07.209 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=213, servers=1, clients=12, CPUs=10, heap=65.0GB] 16:04:07.261 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=79379bea-a740-4385-a2da-73d29d4d658c, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=205, intOrder=113, lastExchangeTime=1487819123943, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:07.264 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=214, servers=1, clients=11, CPUs=10, heap=62.0GB] 16:04:11.795 [exchange-worker-#22%null%] WARN org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Failed to send partitions full message [node=TcpDiscoveryNode [id=9a79ea6a-487e-4296-81a5-483ed93ceaeb, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=198, intOrder=109, lastExchangeTime=1487818313317, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true], err=class o.a.i.IgniteCheckedException: Failed to send message (node may have left the grid or TCP connection cannot be established due to firewall issues) [node=TcpDiscoveryNode [id=9a79ea6a-487e-4296-81a5-483ed93ceaeb, 
addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.163], sockAddrs=[rh6-pr-ho-busacq-02/172.22.190.163:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=198, intOrder=109, lastExchangeTime=1487818313317, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true], topic=TOPIC_CACHE, msg=GridDhtPartitionsFullMessage [parts={-2100569601=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=886, size=1], 689859866=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=1293, size=1], -1728077271=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=1300, size=1], 1597441201=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=1299, size=1], 524260103=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=1301, size=1], -667441411=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=1302, size=1], 1325947219=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=806, size=1], -486088806=GridDhtPartitionFullMap [nodeId=c3da99ae-456d-4b52-85e6-c24587fbf08e, nodeOrder=183, updateSeq=1299, size=1]}, partCntrs=null, topVer=AffinityTopologyVersion [topVer=-1, minorTopVer=0], compress=true, partCnt=8, super=GridDhtPartitionsAbstractMessage [exchId=null, lastVer=null, flags=1, super=GridCacheMessage [msgId=10020813, depInfo=null, err=null, skipPrepare=false, cacheId=0, cacheId=0]]], policy=2]] 16:04:11.861 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=206, minorTopVer=0], evt=NODE_FAILED, node=95e261af-ce55-4843-8d9d-8bf046cc4118] 16:04:11.894 [exchange-worker-#22%null%] INFO 
org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=207, minorTopVer=0], evt=NODE_FAILED, node=5b65d779-8317-496c-b8f2-cdc4de33705c] 16:04:11.922 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=208, minorTopVer=0], evt=NODE_FAILED, node=a9104ef7-da59-4575-acdc-dd98fece82c5] 16:04:11.940 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=209, minorTopVer=0], evt=NODE_FAILED, node=9c972234-7c36-40e9-88f2-bc9da61a02ec] 16:04:11.953 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=210, minorTopVer=0], evt=NODE_FAILED, node=1addd9a8-7022-4286-a321-d4a5aef5985b] 16:04:11.970 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=211, minorTopVer=0], evt=NODE_FAILED, node=6e92ece8-f7bc-482b-bfa6-a25276bed9cd] 16:04:11.996 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=212, minorTopVer=0], evt=NODE_FAILED, node=b3d132d6-b315-4be3-b9ac-9ef427cef0b7] 16:04:12.017 [exchange-worker-#22%null%] INFO org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=213, minorTopVer=0], evt=NODE_FAILED, node=1d1cae7c-fa06-49ed-b50a-f39146c8b06b] 16:04:12.076 [exchange-worker-#22%null%] INFO 
org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Skipping rebalancing (nothing scheduled) [top=AffinityTopologyVersion [topVer=214, minorTopVer=0], evt=NODE_FAILED, node=79379bea-a740-4385-a2da-73d29d4d658c] 16:04:15.323 [grid-nio-worker-1-#11%null%] WARN org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Failed to process selector key (will close): GridSelectorNioSessionImpl [selectorIdx=1, queueSize=0, writeBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], readBuf=java.nio.DirectByteBuffer[pos=0 lim=32768 cap=32768], recovery=null, super=GridNioSessionImpl [locAddr=/172.22.190.162:47108, rmtAddr=/172.22.190.163:38916, createTime=1487887452150, closeTime=0, bytesSent=2182, bytesRcvd=1910, sndSchedTime=1487887455302, lastSndTime=1487887455322, lastRcvTime=1487887455322, readsPaused=false, filterChain=FilterChain[filters=[GridNioCodecFilter [parser=o.a.i.i.util.nio.GridDirectParser@6357fa17, directMode=true], GridConnectionBytesVerifyFilter, SSL filter], accepted=true]] 16:04:15.324 [grid-nio-worker-1-#11%null%] WARN org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Closing NIO session because of unhandled exception [cls=class o.a.i.i.util.nio.GridNioException, msg=Failed to create message writer.] 16:04:19.502 [tcp-disco-msg-worker-#2%null%] WARN org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi - Node is out of topology (probably, due to short-time network problems). 
16:04:19.502 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Local node SEGMENTED: TcpDiscoveryNode [id=c3da99ae-456d-4b52-85e6-c24587fbf08e, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=183, intOrder=102, lastExchangeTime=1487887459497, loc=true, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false] 16:04:19.577 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Stopping local node according to configured segmentation policy. 16:04:19.583 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=69633095-feec-48fb-b96d-23a1a7b9b620, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=166, intOrder=91, lastExchangeTime=1487538461663, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:19.589 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=215, servers=1, clients=10, CPUs=10, heap=61.0GB] 16:04:19.603 [Thread-6376] INFO org.apache.ignite.internal.processors.rest.protocols.tcp.GridTcpRestProtocol - Command protocol successfully stopped: TCP binary 16:04:19.608 [exchange-worker-#22%null%] ERROR org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture - Failed to reinitialize local partitions (preloading will be stopped): GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=215, minorTopVer=0], nodeId=69633095, evt=NODE_FAILED] java.lang.IllegalStateException: Failed to process swap event (grid is stopping). 
at org.apache.ignite.internal.processors.query.GridQueryProcessor.onUnswap(GridQueryProcessor.java:1235) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.query.GridCacheQueryManager.onUnswap(GridCacheQueryManager.java:394) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.evictFailed(GridCacheMapEntry.java:4315) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.evictInternal(GridCacheMapEntry.java:4212) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheEvictionManager.evict0(GridCacheEvictionManager.java:709) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheEvictionManager.touch(GridCacheEvictionManager.java:798) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheMvccManager.removeExplicitNodeLocks(GridCacheMvccManager.java:330) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onLeft(GridDhtPartitionsExchangeFuture.java:831) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onClientNodeEvent(GridDhtPartitionsExchangeFuture.java:614) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:466) [ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body(GridCachePartitionExchangeManager.java:1656) [ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:110) [ignite-core-1.8.0.jar:1.8.0] at java.lang.Thread.run(Thread.java:745) [?:1.8.0_66] 16:04:19.609 [exchange-worker-#22%null%] ERROR 
org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager - Failed to wait for completion of partition map exchange (preloading will not start): GridDhtPartitionsExchangeFuture [dummy=false, forcePreload=false, reassign=false, discoEvt=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=69633095-feec-48fb-b96d-23a1a7b9b620, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=166, intOrder=91, lastExchangeTime=1487538461663, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true], topVer=215, nodeId8=c3da99ae, msg=Node failed: TcpDiscoveryNode [id=69633095-feec-48fb-b96d-23a1a7b9b620, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=166, intOrder=91, lastExchangeTime=1487538461663, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true], type=NODE_FAILED, tstamp=1487887459589], crd=TcpDiscoveryNode [id=c3da99ae-456d-4b52-85e6-c24587fbf08e, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=183, intOrder=102, lastExchangeTime=1487887459497, loc=true, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false], exchId=GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=215, minorTopVer=0], nodeId=69633095, evt=NODE_FAILED], added=true, initFut=GridFutureAdapter [resFlag=2, res=false, startTime=1487887459589, endTime=1487887459599, ignoreInterrupts=false, state=DONE], init=false, topSnapshot=null, lastVer=null, partReleaseFut=null, affChangeMsg=null, skipPreload=false, clientOnlyExchange=false, initTs=1487887459589, centralizedAff=false, evtLatch=0, remaining=[], srvNodes=[TcpDiscoveryNode [id=c3da99ae-456d-4b52-85e6-c24587fbf08e, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], 
sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=183, intOrder=102, lastExchangeTime=1487887459497, loc=true, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=false]], super=GridFutureAdapter [resFlag=1, res=java.lang.IllegalStateException: Failed to process swap event (grid is stopping)., startTime=1487887459589, endTime=1487887459599, ignoreInterrupts=false, state=DONE]] org.apache.ignite.IgniteCheckedException: Failed to process swap event (grid is stopping). at org.apache.ignite.internal.util.IgniteUtils.cast(IgniteUtils.java:7185) [ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.util.future.GridFutureAdapter.get0(GridFutureAdapter.java:197) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.util.future.GridFutureAdapter.get(GridFutureAdapter.java:138) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body(GridCachePartitionExchangeManager.java:1662) [ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:110) [ignite-core-1.8.0.jar:1.8.0] at java.lang.Thread.run(Thread.java:745) [?:1.8.0_66] Caused by: java.lang.IllegalStateException: Failed to process swap event (grid is stopping). 
at org.apache.ignite.internal.processors.query.GridQueryProcessor.onUnswap(GridQueryProcessor.java:1235) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.query.GridCacheQueryManager.onUnswap(GridCacheQueryManager.java:394) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.evictFailed(GridCacheMapEntry.java:4315) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.evictInternal(GridCacheMapEntry.java:4212) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheEvictionManager.evict0(GridCacheEvictionManager.java:709) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheEvictionManager.touch(GridCacheEvictionManager.java:798) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCacheMvccManager.removeExplicitNodeLocks(GridCacheMvccManager.java:330) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onLeft(GridDhtPartitionsExchangeFuture.java:831) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.onClientNodeEvent(GridDhtPartitionsExchangeFuture.java:614) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:466) ~[ignite-core-1.8.0.jar:1.8.0] at org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body(GridCachePartitionExchangeManager.java:1656) ~[ignite-core-1.8.0.jar:1.8.0] ... 
2 more 16:04:19.668 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=85d9bab7-916b-4a16-9044-f18ff7722e56, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=170, intOrder=92, lastExchangeTime=1487538461673, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:19.677 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=216, servers=1, clients=9, CPUs=10, heap=61.0GB] 16:04:19.739 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=2e5d0db3-2b5b-4257-8d81-d31895ead461, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.162], sockAddrs=[rh6-pr-ho-busacq-01/172.22.190.162:0, /0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0], discPort=0, order=180, intOrder=100, lastExchangeTime=1487538461683, loc=false, ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:19.750 [grid-nio-worker-1-#11%null%] WARN org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi - Communication SPI Session write timed out (consider increasing 'socketWriteTimeout' configuration property) [remoteAddr=/172.22.190.80:39238, writeTimeout=4000] 16:04:19.760 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=217, servers=1, clients=8, CPUs=10, heap=57.0GB] 16:04:19.760 [disco-event-worker-#18%null%] WARN org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Node FAILED: TcpDiscoveryNode [id=79c207fe-504e-4bd3-8f1b-590a02219873, addrs=[0:0:0:0:0:0:0:1%lo, 127.0.0.1, 172.22.190.80], sockAddrs=[/0:0:0:0:0:0:0:1%lo:0, /127.0.0.1:0, rh6-pr-ho-img-01/172.22.190.80:0], discPort=0, order=185, intOrder=103, lastExchangeTime=1487818028251, loc=false, 
ver=1.8.0#20161205-sha1:9ca40dbe, isClient=true] 16:04:19.777 [disco-event-worker-#18%null%] INFO org.apache.ignite.internal.managers.discovery.GridDiscoveryManager - Topology snapshot [ver=218, servers=1, clients=7, CPUs=8, heap=45.0GB] 16:04:20.169 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: DocumentDataCache 16:04:20.172 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: RegistrationCache 16:04:20.174 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: ConversionStatusCache 16:04:20.435 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: ImageCache 16:04:20.436 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: ignite-marshaller-sys-cache 16:04:20.436 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: ignite-sys-cache 16:04:20.436 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: ignite-atomics-sys-cache 16:04:20.436 [Thread-6376] INFO org.apache.ignite.internal.processors.cache.GridCacheProcessor - Stopped cache: LoanContainerCache 16:04:20.468 [Thread-6376] INFO org.apache.ignite.internal.IgniteKernal - >>> +---------------------------------------------------------------------------------+ >>> Ignite ver. 
1.8.0#20161205-sha1:9ca40dbeb7d559fcb299bdb6f5c90cdf8ce7e533 >>> stopped OK >>> +---------------------------------------------------------------------------------+ >>> Grid uptime: 36:56:35:190 Xml Config: <import resource="classpath:ignite-cacheDefs.xml"/> <bean id="ignite.cfg" class="org.apache.ignite.configuration.IgniteConfiguration"> <property name="gridLogger"> <bean class="org.apache.ignite.logger.slf4j.Slf4jLogger" /> </property> <property name="communicationSpi"> <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi"> <property name="socketWriteTimeout" value="4000"/> </bean> </property> <property name="peerClassLoadingEnabled" value="false"/> <property name="userAttributes"> <map> <entry key="AppName" value="Bus Acq Cache Manager"/> </map> </property> <property name="cacheConfiguration"> <list> <bean parent="imageCache"> <property name="offHeapMaxMemory" value="#{10 * 1024L * 1024L * 1024L}" /> <property name="statisticsEnabled" value="true"/> </bean> <bean parent="conversionStatusCache"> </bean> <bean parent="registrationCache"> </bean> <bean parent="documentDataCache"> </bean> </list> </property> <property name="discoverySpi"> <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi"> <property name="ipFinder"> <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.multicast.TcpDiscoveryMulticastIpFinder"> <property name="multicastGroup" value="xxx.xx.xx.x" /> </bean> </property> </bean> </property> <property name="metricsLogFrequency" value="1800000" /> </bean> Cache Definitions: <bean id="imageCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="ImageCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="OFFHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="indexedTypes"> <util:list id="myList" value-type="java.lang.Class"> <value>java.util.UUID</value> 
<value>com.mgic.documentviewer.imaging.cache.beans.TiffPage</value> </util:list> </property> <property name="sqlOnheapRowCacheSize" value="1" /> <property name="atomicityMode" value="TRANSACTIONAL" /> </bean> <bean id="registrationCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="RegistrationCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="ONHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="atomicityMode" value="TRANSACTIONAL" /> </bean> <bean id="documentDataCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="DocumentDataCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="ONHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="atomicityMode" value="TRANSACTIONAL" /> </bean> <bean id="conversionStatusCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="ConversionStatusCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="ONHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="atomicityMode" value="TRANSACTIONAL" /> </bean> <bean id="loanContainerCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="LoanContainerCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="ONHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="atomicityMode" value="TRANSACTIONAL" /> </bean> <bean id="testLockCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="TestLockCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="ONHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="atomicityMode" 
value="TRANSACTIONAL" /> <property name="writeSynchronizationMode" value="FULL_SYNC"/> </bean> <bean id="testTransactionCache" abstract="true" class="org.apache.ignite.configuration.CacheConfiguration"> <property name="name" value="TestTransactionCache"/> <property name="cacheMode" value="REPLICATED"/> <property name="memoryMode" value="ONHEAP_TIERED"/> <property name="swapEnabled" value="false"/> <property name="atomicityMode" value="TRANSACTIONAL" /> <property name="writeSynchronizationMode" value="FULL_SYNC"/> </bean> -- View this message in context: http://apache-ignite-users.70518.x6.nabble.com/Configuration-Recommendations-after-ESX-maintenance-failure-tp10878.html Sent from the Apache Ignite Users mailing list archive at Nabble.com.