Hi,
I am using Apache Ignite 1.9.0, and made the topology with 6 nodes on VM, it
is stable.
but moved to docker latest, when run some time ago, the exception was
occurring by one node.
one node drop from cluster, and huge amount of TIME_WAIT connections.
I want to know what i have to do , what i have to check. 

Thanks in advance. 
Regards,
Wilson


--------------------------
That is iginte-config.xml.
--------------------------
<?xml version="1.0" encoding="UTF-8"?>




<beans xmlns="http://www.springframework.org/schema/beans";
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance";
        xsi:schemaLocation="
        http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans.xsd";>
        <bean id="ignite.cfg"
class="org.apache.ignite.configuration.IgniteConfiguration">

                
                <property name="userAttributes">
                        <map>
                                <entry key="ROLE_MASTER" value="worker" />
                                <entry key="ROLE_TRANSACTION" value="worker" />
                        </map>
                </property>

                
                <property name="publicThreadPoolSize" value="128" />
                
                <property name="systemThreadPoolSize" value="64" />
                
                <property name="communicationSpi">
                        <bean
class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
                                
                                <property name="localPort" value="48588" />
                                <property name="socketWriteTimeout" 
value="30000" />
                                <property name="messageQueueLimit" value="1024" 
/>
                        </bean>
                </property>

                <property name="discoverySpi">
                        <bean 
class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
                                <property name="localPort" value="48505" />
                                <property name="localPortRange" value="2" />
                                
                                <property name="networkTimeout" value="60000" />
                                <property name="ackTimeout" value="30000" />
                                <property name="socketTimeout" value="10000" />
                                <property name="reconnectCount" value="100" />

                                <property name="ipFinder">
                                        <bean
                                        
class="org.apache.ignite.spi.discovery.tcp.ipfinder.vm.TcpDiscoveryVmIpFinder">

                                                <property name="addresses">
                                                        <list>
                                                                
<value>10.1.x.242:48505..48506</value>
                                                                
<value>10.1.x.243:48505..48506</value>
                                                                
<value>10.1.x.244:48505..48506</value>
                                                        </list>
                                                </property>
                                        </bean>
                                </property>
                        </bean>
                </property>
        </bean>
</beans>


--------------------------
That is exception log.
--------------------------
[21:26:07,108][INFO][grid-timeout-worker-#55%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=2f8e54aa, name=null, uptime=14:54:04:809]
    ^-- H/N/C [hosts=3, nodes=9, CPUs=144]
    ^-- CPU [cur=0.03%, avg=0.24%, GC=0%]
    ^-- Heap [used=2359MB, free=90.4%, comm=16384MB]
    ^-- Non heap [used=79MB, free=-1%, comm=82MB]
    ^-- Public thread pool [active=0, idle=48, qSize=0]
    ^-- System thread pool [active=0, idle=48, qSize=0]
    ^-- Outbound messages queue [size=0]
[21:26:39,181][WARNING][sys-stripe-43-#44%null%][TcpCommunicationSpi]
Connect timed out (consider increasing 'failureDetectionTimeout'
configuration property) [addr=/172.20.0.1:48588,
failureDetectionTimeout=10000]
[21:26:40,757][WARNING][grid-timeout-worker-#55%null%][G] >>> Possible
starvation in striped pool: sys-stripe-43-#44%null%
[]
deadlock: false
completed: 12520
Thread [name="sys-stripe-43-#44%null%", id=69, state=RUNNABLE, blockCnt=779,
waitCnt=12399]
        at sun.nio.ch.Net.poll(Native Method)
        at sun.nio.ch.SocketChannelImpl.poll(SocketChannelImpl.java:954)
        at sun.nio.ch.SocketAdaptor.connect(SocketAdaptor.java:110)
        at
o.a.i.spi.communication.tcp.TcpCommunicationSpi.createTcpClient(TcpCommunicationSpi.java:2822)
        at
o.a.i.spi.communication.tcp.TcpCommunicationSpi.createNioClient(TcpCommunicationSpi.java:2587)
        at
o.a.i.spi.communication.tcp.TcpCommunicationSpi.reserveClient(TcpCommunicationSpi.java:2479)
        at
o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage0(TcpCommunicationSpi.java:2340)
        at
o.a.i.spi.communication.tcp.TcpCommunicationSpi.sendMessage(TcpCommunicationSpi.java:2304)
        at
o.a.i.i.managers.communication.GridIoManager.send(GridIoManager.java:1293)
        at
o.a.i.i.managers.communication.GridIoManager.send(GridIoManager.java:1362)
        at
o.a.i.i.processors.cache.GridCacheIoManager.send(GridCacheIoManager.java:910)
        at
o.a.i.i.processors.cache.GridCacheIoManager.send(GridCacheIoManager.java:1059)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicAbstractUpdateFuture.map(GridDhtAtomicAbstractUpdateFuture.java:358)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.updateAllAsyncInternal0(GridDhtAtomicCache.java:1970)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.updateAllAsyncInternal(GridDhtAtomicCache.java:1721)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.processNearAtomicUpdateRequest(GridDhtAtomicCache.java:3181)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.access$1600(GridDhtAtomicCache.java:126)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$6.apply(GridDhtAtomicCache.java:338)
        at
o.a.i.i.processors.cache.distributed.dht.atomic.GridDhtAtomicCache$6.apply(GridDhtAtomicCache.java:333)
        at
o.a.i.i.processors.cache.GridCacheIoManager.processMessage(GridCacheIoManager.java:827)
        at
o.a.i.i.processors.cache.GridCacheIoManager.onMessage0(GridCacheIoManager.java:369)
        at
o.a.i.i.processors.cache.GridCacheIoManager.handleMessage(GridCacheIoManager.java:293)
        at
o.a.i.i.processors.cache.GridCacheIoManager.access$000(GridCacheIoManager.java:95)
        at
o.a.i.i.processors.cache.GridCacheIoManager$1.onMessage(GridCacheIoManager.java:238)
        at
o.a.i.i.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1222)
        at
o.a.i.i.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:850)
        at
o.a.i.i.managers.communication.GridIoManager.access$2100(GridIoManager.java:108)
        at
o.a.i.i.managers.communication.GridIoManager$7.run(GridIoManager.java:790)
        at o.a.i.i.util.StripedExecutor$Stripe.run(StripedExecutor.java:428)
        at java.lang.Thread.run(Thread.java:745)

[21:26:59,319][SEVERE][tcp-disco-sock-reader-#4%null%][TcpDiscoverySpi]
Caught exception on message read
[sock=Socket[addr=/10.1.41.242,port=49985,localport=48506],
locNodeId=2f8e54aa-5546-4b83-8abf-f4651bf76276,
rmtNodeId=e0da5175-9f43-427a-b8be-7e588ed78492]
class org.apache.ignite.IgniteCheckedException: Failed to deserialize object
with given class loader: sun.misc.Launcher$AppClassLoader@764c12b6
        at
org.apache.ignite.marshaller.jdk.JdkMarshaller.unmarshal0(JdkMarshaller.java:127)
        at
org.apache.ignite.marshaller.AbstractNodeNameAwareMarshaller.unmarshal(AbstractNodeNameAwareMarshaller.java:94)
        at
org.apache.ignite.internal.util.IgniteUtils.unmarshal(IgniteUtils.java:9765)
        at
org.apache.ignite.spi.discovery.tcp.ServerImpl$SocketReader.body(ServerImpl.java:5790)
        at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
Caused by: java.io.EOFException
        at
java.io.ObjectInputStream$PeekInputStream.readFully(ObjectInputStream.java:2353)
        at
java.io.ObjectInputStream$BlockDataInputStream.readShort(ObjectInputStream.java:2822)
        at 
java.io.ObjectInputStream.readStreamHeader(ObjectInputStream.java:804)
        at java.io.ObjectInputStream.<init>(ObjectInputStream.java:301)
        at
org.apache.ignite.marshaller.jdk.JdkMarshallerObjectInputStream.<init>(JdkMarshallerObjectInputStream.java:39)
        at
org.apache.ignite.marshaller.jdk.JdkMarshaller.unmarshal0(JdkMarshaller.java:117)
        ... 4 more
[21:26:59,600][WARNING][sys-stripe-43-#44%null%][TcpCommunicationSpi]
*TcpCommunicationSpi failed to establish connection to node, node will be
dropped from cluster* [rmtNode=TcpDiscoveryNode
[id=6871b848-3b2e-495a-a0a7-81ce466f3139, addrs=[0:0:0:0:0:0:0:1%lo,
10.1.41.244, 127.0.0.1, 172.17.0.1, 172.20.0.1],
sockAddrs=[/0:0:0:0:0:0:0:1%lo:48505, /127.0.0.1:48505, /172.17.0.1:48505,
/10.1.41.244:48505, /172.20.0.1:48505], discPort=48505, order=1, intOrder=1,
lastExchangeTime=1561617110668, loc=false, ver=1.9.0#20170302-sha1:a8169d0a,
isClient=false], err=class o.a.i.IgniteCheckedException: Failed to connect
to node (is node still alive?). Make sure that each ComputeTask and cache
Transaction has a timeout set in order to prevent parties from waiting
forever in case of network issues
[nodeId=6871b848-3b2e-495a-a0a7-81ce466f3139, addrs=[/172.20.0.1:48588,
s7spkhdp04.buyabs.corp/10.1.41.244:48588, /172.17.0.1:48588,
/0:0:0:0:0:0:0:1%lo:48588, /127.0.0.1:48588]], connectErrs=[class
o.a.i.IgniteCheckedException: Failed to connect to address:
/172.20.0.1:48588, class o.a.i.IgniteCheckedException: Failed to connect to
address: s7spkhdp04.buyabs.corp/10.1.41.244:48588, class
o.a.i.IgniteCheckedException: Failed to connect to address:
/172.17.0.1:48588, class o.a.i.IgniteCheckedException: Failed to connect to
address: /0:0:0:0:0:0:0:1%lo:48588, class o.a.i.IgniteCheckedException:
Failed to connect to address: /127.0.0.1:48588]]
[21:27:07,119][INFO][grid-timeout-worker-#55%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=2f8e54aa, name=null, uptime=14:55:04:816]
    ^-- H/N/C [hosts=3, nodes=9, CPUs=144]
    ^-- CPU [cur=1.8%, avg=0.24%, GC=0%]
    ^-- Heap [used=3847MB, free=84.34%, comm=16384MB]
    ^-- Non heap [used=81MB, free=-1%, comm=83MB]
    ^-- Public thread pool [active=0, idle=39, qSize=0]
    ^-- System thread pool [active=0, idle=10, qSize=0]
    ^-- Outbound messages queue [size=0]
[21:27:10,763][WARNING][grid-timeout-worker-#55%null%][G] >>> Possible
starvation in striped pool: sys-stripe-35-#36%null%

---------------------------------------------
huge amount of TIME_WAIT connections
---------------------------------------------
 ~]$ netstat -an|awk '/tcp/ {print $6}'|sort|uniq -c
    114 CLOSE_WAIT
    220 ESTABLISHED
     49 LISTEN
  45018 TIME_WAIT

 ~]$ netstat -a
tcp6       0      0 localhost:48588         localhost:47334        
TIME_WAIT
tcp6       0      0 hostname:48588        hostname:44356 TIME_WAIT
tcp6       0      0 localhost:48505         localhost:49153        
TIME_WAIT
tcp6       0      0 hostname:48588        hostname:39694 TIME_WAIT
tcp6       0      0 172.19.0.1:48588      hostname:36024 TIME_WAIT
tcp6       0      0 localhost:48588         localhost:53930        
TIME_WAIT
.....





--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Reply via email to