Hi,

    Recently, I did some experiments to test the map-reduce job's
performance with ignite hadoop accelerator. It really make sense to
accelerate a map-reduce job. In my previous experiment, I store a 10GB file
into IGFS. I design a I/O intensive job whose duty is only to read the 10GB
file. I find that using IGFS can accelerate my job obviously.


    However, when I change to test on the map-reduce job which is used in
our production environment. The job is stuck when running. The job will read
more than 30GB data and produce more than 150GB intermediate data.


My basic info:

I have three node cluster.
host1:10.8.12.16:  40 cores 125GB memory
host2:10.8.12.17:  40 cores 125GB memory
host3:10.8.12.18:  40 cores 125GB memory


The full configuration file on host1:

<?xml version="1.0" encoding="UTF-8"?>




<beans xmlns="http://www.springframework.org/schema/beans";
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance";
xmlns:util="http://www.springframework.org/schema/util";
       xsi:schemaLocation="http://www.springframework.org/schema/beans
       http://www.springframework.org/schema/beans/spring-beans.xsd
       http://www.springframework.org/schema/util
       http://www.springframework.org/schema/util/spring-util.xsd";>

    
    <description>
        Spring file for Ignite node configuration with IGFS and Apache
Hadoop map-reduce support enabled.
        Ignite node will start with this configuration by default.
    </description>

    
    <bean id="propertyConfigurer"
class="org.springframework.beans.factory.config.PropertyPlaceholderConfigurer">
        <property name="systemPropertiesModeName"
value="SYSTEM_PROPERTIES_MODE_FALLBACK"/>
        <property name="searchSystemEnvironment" value="true"/>
    </bean>

    
    <bean id="igfsCfgBase"
class="org.apache.ignite.configuration.FileSystemConfiguration"
abstract="true">
        
                
        <property name="blockSize" value="#{1 * 1024L *1024L}"/>
                
        <property name="perNodeBatchSize" value="16"/>
                
        <property name="perNodeParallelBatchCount" value="4"/>
                
        <property name="prefetchBlocks" value="8"/>
    </bean>

    <bean class="org.apache.ignite.configuration.CacheConfiguration">
                
                <property name="memoryMode" value="ONHEAP_TIERED"/> 
                
                <property name="offHeapMaxMemory" value="10737418240"/>
                
                <property name="evictionPolicy">
                        <bean
class="org.apache.ignite.cache.eviction.igfs.IgfsPerBlockLruEvictionPolicy">
                                
                                <property name="maxSize" value="32212254720"/>
                        </bean>
                </property>

                
                <property name="swapEnabled" value="true"/>

        </bean>

    
    <bean id="dataCacheCfgBase"
class="org.apache.ignite.configuration.CacheConfiguration" abstract="true">
        <property name="cacheMode" value="PARTITIONED"/>
        <property name="atomicityMode" value="TRANSACTIONAL"/>
        <property name="writeSynchronizationMode" value="FULL_SYNC"/>
        <property name="backups" value="0"/>
        <property name="affinityMapper">
            <bean
class="org.apache.ignite.igfs.IgfsGroupDataBlocksKeyMapper">
                <constructor-arg value="512"/>
            </bean>
        </property>
    </bean>

    
    <bean id="metaCacheCfgBase"
class="org.apache.ignite.configuration.CacheConfiguration" abstract="true">
        <property name="cacheMode" value="REPLICATED"/>
        <property name="atomicityMode" value="TRANSACTIONAL"/>
        <property name="writeSynchronizationMode" value="FULL_SYNC"/>
    </bean>

    
    <bean id="grid.cfg"
class="org.apache.ignite.configuration.IgniteConfiguration">





        
        <property name="hadoopConfiguration">
            <bean
class="org.apache.ignite.configuration.HadoopConfiguration">
                
                <property name="finishedJobInfoTtl" value="300000"/>
            </bean>
        </property>

        
        <property name="connectorConfiguration">
            <bean
class="org.apache.ignite.configuration.ConnectorConfiguration">
                <property name="port" value="11211"/>
                <property name="idleTimeout" value="300000"/>
            </bean>
        </property>

        
        <property name="fileSystemConfiguration">
            <list>
                <bean
class="org.apache.ignite.configuration.FileSystemConfiguration"
parent="igfsCfgBase">
                    <property name="name" value="igfs"/>

                    
                    <property name="metaCacheName" value="igfs-meta"/>
                    <property name="dataCacheName" value="igfs-data"/>

                    
                    <property name="ipcEndpointConfiguration">
                        <bean
class="org.apache.ignite.igfs.IgfsIpcEndpointConfiguration">
                            <property name="type" value="TCP" />
                            <property name="host" value="0.0.0.0" />
                            <property name="port" value="10500" />
                        </bean>
                    </property>

                    
                    <property name="secondaryFileSystem">
                        <bean
class="org.apache.ignite.hadoop.fs.IgniteHadoopIgfsSecondaryFileSystem">
                            <constructor-arg name="uri"
value="hdfs://10.8.12.16:9000"/>
                            <constructor-arg
name="cfgPath"><null/></constructor-arg>
                            <constructor-arg name="userName"
value="client-user-name"/>
                        </bean>
                    </property>
                </bean>
            </list>
        </property>

        
        <property name="cacheConfiguration">
            <list>
                
                <bean
class="org.apache.ignite.configuration.CacheConfiguration"
parent="metaCacheCfgBase">
                    <property name="name" value="igfs-meta"/>
                </bean>

                
                <bean
class="org.apache.ignite.configuration.CacheConfiguration"
parent="dataCacheCfgBase">
                    <property name="name" value="igfs-data"/>
                </bean>
            </list>
        </property>

        
        <property name="includeEventTypes">
            <list>
                <util:constant
static-field="org.apache.ignite.events.EventType.EVT_TASK_FAILED"/>
                <util:constant
static-field="org.apache.ignite.events.EventType.EVT_TASK_FINISHED"/>
                <util:constant
static-field="org.apache.ignite.events.EventType.EVT_JOB_MAPPED"/>
            </list>
        </property>

        
                
                
        
        <property name="communicationSpi">
                <bean 
class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
                        <property name="socketWriteTimeout" value="600000"/>
                </bean>
        </property>
 
    </bean>
</beans>



The only difference in configuration among the hosts are "eviction maxsize".
host1 use small "maxSize" because there is not enough memory to use on
host1.

host2 and host3's only difference in configuration:

                <property name="evictionPolicy">
                        <bean
class="org.apache.ignite.cache.eviction.igfs.IgfsPerBlockLruEvictionPolicy">
                                
                                <property name="maxSize" value="64424509440"/>
                        </bean>
                </property>




The only difference in ignite.sh is the JVM option(I use JDK8):

host1:

if [ -z "$JVM_OPTS" ] ; then
    if [[ `"$JAVA" -version 2>&1 | egrep "1\.[7]\."` ]]; then
        JVM_OPTS="-Xms1g -Xmx1g -server -XX:+AggressiveOpts
-XX:MaxPermSize=256m"
    else
        JVM_OPTS="-Xms5g -Xmx30g -server -XX:+AggressiveOpts
-XX:MaxMetaspaceSize=10g"
    fi
fi


host2 and host3:
if [ -z "$JVM_OPTS" ] ; then
    if [[ `"$JAVA" -version 2>&1 | egrep "1\.[7]\."` ]]; then
        JVM_OPTS="-Xms1g -Xmx1g -server -XX:+AggressiveOpts
-XX:MaxPermSize=256m"
    else
        JVM_OPTS="-Xms8g -Xmx60g -server -XX:+AggressiveOpts
-XX:MaxMetaspaceSize=10g"
    fi
fi



Our production map-reduce job usually will finish in 30 minutes. However,
when using IGFS, I have waiting for about 2 hours.


The job is stuck in :

...
Nov 28, 2016 4:58:52 PM
org.apache.ignite.internal.client.impl.connection.GridClientNioTcpConnection
<init>
INFO: Client TCP connection established: /10.8.12.16:11211
Nov 28, 2016 4:58:52 PM
org.apache.ignite.internal.client.impl.GridClientImpl <init>
INFO: Client started [id=337ab72f-4c01-4fc6-849c-1e3920dbfcbb, protocol=TCP]
16/11/28 16:58:52 WARN mapreduce.JobResourceUploader: Hadoop command-line
option parsing not performed. Implement the Tool interface and execute your
application with ToolRunner to remedy this.
16/11/28 16:58:53 INFO input.FileInputFormat: Total input paths to process :
25
16/11/28 16:58:54 INFO mapreduce.JobSubmitter: number of splits:75
16/11/28 16:58:54 INFO mapreduce.JobSubmitter: Submitting tokens for job:
job_02392669-843d-4953-b6d0-03782e740c2c_0001
Nov 28, 2016 4:58:54 PM
org.apache.ignite.internal.client.impl.connection.GridClientNioTcpConnection
<init>
INFO: Client TCP connection established: /10.8.12.18:11211
16/11/28 16:58:55 INFO mapreduce.Job: The url to track the job: N/A
Nov 28, 2016 4:58:55 PM
org.apache.ignite.internal.client.impl.connection.GridClientNioTcpConnection
<init>
INFO: Client TCP connection established: /10.8.12.17:11211
Nov 28, 2016 5:03:57 PM
org.apache.ignite.internal.client.impl.connection.GridClientNioTcpConnection
close
INFO: Client TCP connection closed: /10.8.12.18:11211
Nov 28, 2016 5:03:58 PM
org.apache.ignite.internal.client.impl.connection.GridClientNioTcpConnection
close
INFO: Client TCP connection closed: /10.8.12.17:11211
...
....


log on host1(using  cat ignite-1a4fad25.log | grep -m40 -A 5 WARN to filter
some info):

PS: There is no error or exception in the logs.


[16:58:28,789][WARN ][main][NoopCheckpointSpi] Checkpoints are disabled (to
enable configure any GridCheckpointSpi implementation)
[16:58:28,807][WARN ][main][GridCollisionManager] Collision resolution is
disabled (all jobs will be activated upon arrival).
[16:58:28,809][WARN ][main][NoopSwapSpaceSpi] Swap space is disabled. To
enable use FileSwapSpaceSpi.
[16:58:28,810][INFO ][main][IgniteKernal] Security status
[authentication=off, tls/ssl=off]
[16:58:29,029][INFO ][main][GridTcpRestProtocol] Command protocol
successfully started [name=TCP binary, host=0.0.0.0/0.0.0.0, port=11211]
[16:58:29,111][INFO ][main][IpcServerTcpEndpoint] IPC server loopback
endpoint started [port=10500]
[16:58:29,112][INFO ][main][IpcServerTcpEndpoint] IPC server loopback
endpoint started [port=11400]
[16:58:29,123][INFO ][main][HadoopProcessor] HADOOP_HOME is set to
/home/appadmin/hadoop-2.7.2
--
[16:58:29,134][WARN ][main][NativeCodeLoader] Unable to load native-hadoop
library for your platform... using builtin-java classes where applicable
[16:58:29,189][INFO ][main][TcpDiscoverySpi] Successfully bound to TCP port
[port=47500, localHost=0.0.0.0/0.0.0.0]
[16:58:29,193][WARN ][main][TcpDiscoveryMulticastIpFinder]
TcpDiscoveryMulticastIpFinder has no pre-configured addresses (it is
recommended in production to specify at least one address in
TcpDiscoveryMulticastIpFinder.getAddresses() configuration property)
[16:58:30,594][INFO ][main][GridCacheProcessor] Started cache
[name=igfs-data, mode=PARTITIONED]
[16:58:30,602][INFO ][main][GridCacheProcessor] Started cache
[name=ignite-hadoop-mr-sys-cache, mode=REPLICATED]
[16:58:30,603][INFO ][main][GridCacheProcessor] Started cache
[name=ignite-sys-cache, mode=REPLICATED]
[16:58:30,604][INFO ][main][GridCacheProcessor] Started cache
[name=ignite-atomics-sys-cache, mode=PARTITIONED]
[16:58:30,607][INFO ][main][GridCacheProcessor] Started cache
[name=igfs-meta, mode=REPLICATED]
--
[16:58:54,556][WARN ][pub-#7%null%][NativeCodeLoader] Unable to load
native-hadoop library for your platform... using builtin-java classes where
applicable
[16:58:55,197][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-SETUP-0-0-#380%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:55,258][INFO
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-SETUP-0-0-#380%null%][FileOutputCommitter]
File Output Committer Algorithm version is 1
[16:58:55,959][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-34-0-#398%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:55,991][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-33-0-#397%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,009][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-38-0-#402%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,010][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-44-0-#408%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,011][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-30-0-#394%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,042][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-31-0-#395%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,087][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-48-0-#412%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,089][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-32-0-#396%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,091][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-37-0-#401%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,094][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-49-0-#413%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,094][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-45-0-#409%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,094][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-35-0-#399%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,095][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-46-0-#410%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,096][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-39-0-#403%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,096][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-51-0-#415%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,096][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-43-0-#407%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,096][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-42-0-#406%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,096][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-40-0-#404%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,097][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-47-0-#411%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,097][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-41-0-#405%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,097][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-50-0-#414%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,097][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-29-0-#393%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,098][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-36-0-#400%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,312][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-29-0-#393%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,411][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-29-0-#393%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,500][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-29-0-#393%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,587][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-29-0-#393%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:58:56,678][WARN
][Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-29-0-#393%null%][NativeCodeLoader]
Unable to load native-hadoop library for your platform... using builtin-java
classes where applicable
[16:59:30,726][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1a4fad25, name=null, uptime=00:01:00:014]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.1%, avg=12.27%, GC=0%]
--
[17:00:30,731][WARN
][grid-timeout-worker-#201%null%][GridCachePartitionExchangeManager] Found
long running transaction [startTime=16:59:19.949, curTime=17:00:30.721,
tx=GridNearTxLocal [mappings=IgniteTxMappingsSingleImpl
[mapping=GridDistributedTxMapping [entries=[IgniteTxEntry
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], cacheId=-1448311745, partId=-1, txKey=IgniteTxKey
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], cacheId=-1448311745], val=[op=TRANSFORM, val=null],
prevVal=[op=NOOP, val=null], entryProcessorsCol=[IgniteBiTuple
[val1=o.a.i.i.processors.hadoop.jobtracker.HadoopJobTracker$RemoveMappersProcessor@315b1fca,
val2=[Ljava.lang.Object;@13ad14dd]], ttl=-1, conflictExpireTime=-1,
conflictVer=null, explicitVer=null, dhtVer=null, filters=[],
filtersPassed=false, filtersSet=true, entry=GridDhtDetachedCacheEntry
[super=GridDistributedCacheEntry [super=GridCacheMapEntry
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], val=null, startVer=1480323571002, ver=GridCacheVersion
[topVer=91803513, time=1480323559950, order=1480323571002, nodeOrder=1],
hash=-1251234477, extras=null, flags=0]]], prepared=0, locked=false,
nodeId=1988aad6-d1e1-4d5e-a26d-a86e8cba774e, locMapped=false,
expiryPlc=null, transferExpiryPlc=false, flags=0, partUpdateCntr=0,
serReadVer=null, xidVer=GridCacheVersion [topVer=91803513,
time=1480323559950, order=1480323571000, nodeOrder=1]]], explicitLock=false,
dhtVer=null, last=false, near=false, clientFirst=false,
node=1988aad6-d1e1-4d5e-a26d-a86e8cba774e]], nearLocallyMapped=false,
colocatedLocallyMapped=false, needCheckBackup=null, hasRemoteLocks=false,
thread=Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-32-0-#396%null%,
mappings=IgniteTxMappingsSingleImpl [mapping=GridDistributedTxMapping
[entries=[IgniteTxEntry [key=KeyCacheObjectImpl
[val=02392669-843d-4953-b6d0-03782e740c2c_1, hasValBytes=true],
cacheId=-1448311745, partId=-1, txKey=IgniteTxKey [key=KeyCacheObjectImpl
[val=02392669-843d-4953-b6d0-03782e740c2c_1, hasValBytes=true],
cacheId=-1448311745], val=[op=TRANSFORM, val=null], prevVal=[op=NOOP,
val=null], entryProcessorsCol=[IgniteBiTuple
[val1=o.a.i.i.processors.hadoop.jobtracker.HadoopJobTracker$RemoveMappersProcessor@315b1fca,
val2=[Ljava.lang.Object;@13ad14dd]], ttl=-1, conflictExpireTime=-1,
conflictVer=null, explicitVer=null, dhtVer=null, filters=[],
filtersPassed=false, filtersSet=true, entry=GridDhtDetachedCacheEntry
[super=GridDistributedCacheEntry [super=GridCacheMapEntry
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], val=null, startVer=1480323571002, ver=GridCacheVersion
[topVer=91803513, time=1480323559950, order=1480323571002, nodeOrder=1],
hash=-1251234477, extras=null, flags=0]]], prepared=0, locked=false,
nodeId=1988aad6-d1e1-4d5e-a26d-a86e8cba774e, locMapped=false,
expiryPlc=null, transferExpiryPlc=false, flags=0, partUpdateCntr=0,
serReadVer=null, xidVer=GridCacheVersion [topVer=91803513,
time=1480323559950, order=1480323571000, nodeOrder=1]]], explicitLock=false,
dhtVer=null, last=false, near=false, clientFirst=false,
node=1988aad6-d1e1-4d5e-a26d-a86e8cba774e]], super=GridDhtTxLocalAdapter
[nearOnOriginatingNode=false, nearNodes=[], dhtNodes=[], explicitLock=false,
super=IgniteTxLocalAdapter [completedBase=null, sndTransformedVals=false,
depEnabled=false, txState=IgniteTxImplicitSingleStateImpl [init=true],
super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=91803513,
time=1480323559950, order=1480323571000, nodeOrder=1], writeVer=null,
implicit=true, loc=true, threadId=460, startTime=1480323559949,
nodeId=1a4fad25-d0fe-4dbd-80ad-5370dcb697ff, startVer=GridCacheVersion
[topVer=91803513, time=1480323559950, order=1480323571000, nodeOrder=1],
endVer=null, isolation=READ_COMMITTED, concurrency=OPTIMISTIC, timeout=0,
sysInvalidate=false, sys=true, plc=2, commitVer=GridCacheVersion
[topVer=91803513, time=1480323559950, order=1480323571000, nodeOrder=1],
finalizing=NONE, preparing=false, invalidParts=null, state=PREPARING,
timedOut=false, topVer=AffinityTopologyVersion [topVer=3, minorTopVer=1],
duration=70772ms, onePhaseCommit=false], size=1]]]]
[17:00:30,733][WARN
][grid-timeout-worker-#201%null%][GridCachePartitionExchangeManager] Found
long running transaction [startTime=16:59:20.856, curTime=17:00:30.721,
tx=GridNearTxLocal [mappings=IgniteTxMappingsSingleImpl
[mapping=GridDistributedTxMapping [entries=[IgniteTxEntry
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], cacheId=-1448311745, partId=-1, txKey=IgniteTxKey
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], cacheId=-1448311745], val=[op=TRANSFORM, val=null],
prevVal=[op=NOOP, val=null], entryProcessorsCol=[IgniteBiTuple
[val1=o.a.i.i.processors.hadoop.jobtracker.HadoopJobTracker$RemoveMappersProcessor@7d46700a,
val2=[Ljava.lang.Object;@75833b77]], ttl=-1, conflictExpireTime=-1,
conflictVer=null, explicitVer=null, dhtVer=null, filters=[],
filtersPassed=false, filtersSet=true, entry=GridDhtDetachedCacheEntry
[super=GridDistributedCacheEntry [super=GridCacheMapEntry
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], val=null, startVer=1480323571195, ver=GridCacheVersion
[topVer=91803513, time=1480323560861, order=1480323571195, nodeOrder=1],
hash=-1251234477, extras=null, flags=0]]], prepared=0, locked=false,
nodeId=1988aad6-d1e1-4d5e-a26d-a86e8cba774e, locMapped=false,
expiryPlc=null, transferExpiryPlc=false, flags=0, partUpdateCntr=0,
serReadVer=null, xidVer=GridCacheVersion [topVer=91803513,
time=1480323560861, order=1480323571193, nodeOrder=1]]], explicitLock=false,
dhtVer=null, last=false, near=false, clientFirst=false,
node=1988aad6-d1e1-4d5e-a26d-a86e8cba774e]], nearLocallyMapped=false,
colocatedLocallyMapped=false, needCheckBackup=null, hasRemoteLocks=false,
thread=Hadoop-task-02392669-843d-4953-b6d0-03782e740c2c_1-MAP-33-0-#397%null%,
mappings=IgniteTxMappingsSingleImpl [mapping=GridDistributedTxMapping
[entries=[IgniteTxEntry [key=KeyCacheObjectImpl
[val=02392669-843d-4953-b6d0-03782e740c2c_1, hasValBytes=true],
cacheId=-1448311745, partId=-1, txKey=IgniteTxKey [key=KeyCacheObjectImpl
[val=02392669-843d-4953-b6d0-03782e740c2c_1, hasValBytes=true],
cacheId=-1448311745], val=[op=TRANSFORM, val=null], prevVal=[op=NOOP,
val=null], entryProcessorsCol=[IgniteBiTuple
[val1=o.a.i.i.processors.hadoop.jobtracker.HadoopJobTracker$RemoveMappersProcessor@7d46700a,
val2=[Ljava.lang.Object;@75833b77]], ttl=-1, conflictExpireTime=-1,
conflictVer=null, explicitVer=null, dhtVer=null, filters=[],
filtersPassed=false, filtersSet=true, entry=GridDhtDetachedCacheEntry
[super=GridDistributedCacheEntry [super=GridCacheMapEntry
[key=KeyCacheObjectImpl [val=02392669-843d-4953-b6d0-03782e740c2c_1,
hasValBytes=true], val=null, startVer=1480323571195, ver=GridCacheVersion
[topVer=91803513, time=1480323560861, order=1480323571195, nodeOrder=1],
hash=-1251234477, extras=null, flags=0]]], prepared=0, locked=false,
nodeId=1988aad6-d1e1-4d5e-a26d-a86e8cba774e, locMapped=false,
expiryPlc=null, transferExpiryPlc=false, flags=0, partUpdateCntr=0,
serReadVer=null, xidVer=GridCacheVersion [topVer=91803513,
time=1480323560861, order=1480323571193, nodeOrder=1]]], explicitLock=false,
dhtVer=null, last=false, near=false, clientFirst=false,
node=1988aad6-d1e1-4d5e-a26d-a86e8cba774e]], super=GridDhtTxLocalAdapter
[nearOnOriginatingNode=false, nearNodes=[], dhtNodes=[], explicitLock=false,
super=IgniteTxLocalAdapter [completedBase=null, sndTransformedVals=false,
depEnabled=false, txState=IgniteTxImplicitSingleStateImpl [init=true],
super=IgniteTxAdapter [xidVer=GridCacheVersion [topVer=91803513,
time=1480323560861, order=1480323571193, nodeOrder=1], writeVer=null,
implicit=true, loc=true, threadId=461, startTime=1480323560856,
nodeId=1a4fad25-d0fe-4dbd-80ad-5370dcb697ff, startVer=GridCacheVersion
[topVer=91803513, time=1480323560861, order=1480323571193, nodeOrder=1],
endVer=null, isolation=READ_COMMITTED, concurrency=OPTIMISTIC, timeout=0,
sysInvalidate=false, sys=true, plc=2, commitVer=GridCacheVersion
[topVer=91803513, time=1480323560861, order=1480323571193, nodeOrder=1],
finalizing=NONE, preparing=false, invalidParts=null, state=PREPARING,
timedOut=false, topVer=AffinityTopologyVersion [topVer=3, minorTopVer=1],
duration=69875ms, onePhaseCommit=false], size=1]]]]

...many same WARNs

[WARN ][grid-timeout-worker-#201%null%][GridCachePartitionExchangeManager]
Found long running transaction

...many same WARNs


many INFO like this:
[18:22:31,122][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1a4fad25, name=null, uptime=01:24:00:415]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0%, avg=0.16%, GC=0%]
    ^-- Heap [used=13923MB, free=49.01%, comm=21575MB]
    ^-- Non heap [used=148MB, free=98.71%, comm=153MB]
    ^-- Public thread pool [active=0, idle=80, qSize=0]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:23:31,130][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1a4fad25, name=null, uptime=01:25:00:417]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.16%, GC=0%]
    ^-- Heap [used=13939MB, free=48.95%, comm=21575MB]
    ^-- Non heap [used=148MB, free=98.71%, comm=153MB]
    ^-- Public thread pool [active=0, idle=80, qSize=0]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:24:31,133][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1a4fad25, name=null, uptime=01:26:00:423]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.16%, GC=0%]
    ^-- Heap [used=13948MB, free=48.92%, comm=21575MB]
    ^-- Non heap [used=148MB, free=98.71%, comm=153MB]
    ^-- Public thread pool [active=0, idle=80, qSize=0]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:25:31,139][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1a4fad25, name=null, uptime=01:27:00:431]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.16%, GC=0%]
    ^-- Heap [used=13957MB, free=48.89%, comm=21575MB]
    ^-- Non heap [used=148MB, free=98.71%, comm=153MB]
    ^-- Public thread pool [active=0, idle=80, qSize=0]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]


log status on host2 is the same as on host1.

logs on host3 has a little different at the tail:

[18:09:40,754][WARN ][grid-timeout-worker-#201%null%][IgniteKernal] Possible
thread pool starvation detected (no task completed in last 30000ms, is
executorService pool size large enough?)
[18:10:40,597][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:12:00:362]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.21%, GC=0%]
    ^-- Heap [used=19868MB, free=63.62%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:11:40,597][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:13:00:362]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0%, avg=0.21%, GC=0%]
    ^-- Heap [used=19874MB, free=63.61%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:12:40,600][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:14:00:363]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.21%, GC=0%]
    ^-- Heap [used=19882MB, free=63.59%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:13:40,598][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:15:00:363]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.2%, GC=0%]
    ^-- Heap [used=19898MB, free=63.56%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:14:40,605][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:16:00:366]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0%, avg=0.2%, GC=0%]
    ^-- Heap [used=19906MB, free=63.55%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:14:40,802][WARN ][grid-timeout-worker-#201%null%][IgniteKernal] Possible
thread pool starvation detected (no task completed in last 30000ms, is
executorService pool size large enough?)
[18:15:40,610][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:17:00:375]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.2%, GC=0%]
    ^-- Heap [used=19913MB, free=63.54%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:16:40,617][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:18:00:383]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.2%, GC=0%]
    ^-- Heap [used=19920MB, free=63.52%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:17:40,626][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:19:00:392]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0%, avg=0.19%, GC=0%]
    ^-- Heap [used=19931MB, free=63.5%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:18:40,636][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:20:00:399]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0%, avg=0.19%, GC=0%]
    ^-- Heap [used=19938MB, free=63.49%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:19:40,644][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:21:00:407]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.19%, GC=0%]
    ^-- Heap [used=19946MB, free=63.48%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:19:40,848][WARN ][grid-timeout-worker-#201%null%][IgniteKernal] Possible
thread pool starvation detected (no task completed in last 30000ms, is
executorService pool size large enough?)
[18:20:40,652][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:22:00:414]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.19%, GC=0%]
    ^-- Heap [used=19953MB, free=63.46%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:21:40,655][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:23:00:420]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.18%, GC=0%]
    ^-- Heap [used=19962MB, free=63.45%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:22:40,663][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:24:00:426]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.18%, GC=0%]
    ^-- Heap [used=19970MB, free=63.43%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:23:40,669][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:25:00:426]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.18%, GC=0%]
    ^-- Heap [used=19977MB, free=63.42%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:24:40,663][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:26:00:426]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.18%, GC=0%]
    ^-- Heap [used=19985MB, free=63.41%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:24:40,887][WARN ][grid-timeout-worker-#201%null%][IgniteKernal] Possible
thread pool starvation detected (no task completed in last 30000ms, is
executorService pool size large enough?)
[18:25:40,668][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:27:00:430]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.18%, GC=0%]
    ^-- Heap [used=19992MB, free=63.39%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]
[18:26:40,674][INFO ][grid-timeout-worker-#201%null%][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=1988aad6, name=null, uptime=01:28:00:439]
    ^-- H/N/C [hosts=3, nodes=3, CPUs=120]
    ^-- CPU [cur=0.03%, avg=0.18%, GC=0%]
    ^-- Heap [used=19999MB, free=63.38%, comm=39744MB]
    ^-- Non heap [used=148MB, free=98.58%, comm=153MB]
    ^-- Public thread pool [active=80, idle=0, qSize=944]
    ^-- System thread pool [active=0, idle=80, qSize=0]
    ^-- Outbound messages queue [size=0]




How to solve this problem and what cause this?


>From the log info, I find free heap space diminish by 0.02% every several
minutes. And "thread pool starvation detected" warnning appeared now and
then.




--
View this message in context: 
http://apache-ignite-users.70518.x6.nabble.com/MapReduce-Job-stuck-when-using-ignite-hadoop-accelerator-tp9216.html
Sent from the Apache Ignite Users mailing list archive at Nabble.com.

Reply via email to