Sure, my bad.

For context, we run it as distributed key/value storage and don't run any
computations or SQL on it.

*Ignite XML config for the server pods:*

```
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring bean definition of the IgniteConfiguration used by the Ignite server pods. -->
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="
        http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean class="org.apache.ignite.configuration.IgniteConfiguration">
        <property name="failureDetectionTimeout" value="10000"/>
        <property name="metricsLogFrequency" value="5000"/>

        <property name="segmentationPolicy" value="NOOP"/>

        <!-- Discovery SPI: Kubernetes IP finder pointed at service "ignite" in namespace "buildcache". -->
        <property name="discoverySpi">
            <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
                <property name="ipFinder">
                    <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.kubernetes.TcpDiscoveryKubernetesIpFinder">
                        <property name="namespace" value="buildcache"/>
                        <property name="serviceName" value="ignite"/>
                    </bean>
                </property>

                <property name="joinTimeout" value="10000"/>
            </bean>
        </property>

        <!-- Communication SPI: bounded message queues. -->
        <property name="communicationSpi">
            <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
                <property name="messageQueueLimit" value="1024"/>

                <property name="slowClientQueueLimit" value="512"/>
            </bean>
        </property>

        <!-- Data storage: one default region named "Default_Region". -->
        <property name="dataStorageConfiguration">
            <bean class="org.apache.ignite.configuration.DataStorageConfiguration">
                <property name="pageSize" value="8192"/>

                <property name="defaultDataRegionConfiguration">
                    <bean class="org.apache.ignite.configuration.DataRegionConfiguration">
                        <property name="name" value="Default_Region"/>

                        <!-- 24 GiB; initialSize and maxSize are deliberately identical. -->
                        <property name="initialSize" value="#{24L * 1024 * 1024 * 1024}"/>
                        <property name="maxSize" value="#{24L * 1024 * 1024 * 1024}"/>

                        <property name="pageEvictionMode" value="RANDOM_2_LRU"/>

                        <property name="emptyPagesPoolSize" value="25600"/>

                        <property name="metricsEnabled" value="true"/>
                    </bean>
                </property>
            </bean>
        </property>

        <!-- The single cache "buildcache", stored in "Default_Region". -->
        <property name="cacheConfiguration">
            <bean class="org.apache.ignite.configuration.CacheConfiguration">
                <property name="name" value="buildcache"/>
                <property name="dataRegionName" value="Default_Region"/>

                <property name="cacheMode" value="PARTITIONED"/>

                <property name="writeSynchronizationMode" value="FULL_ASYNC"/>
            </bean>
        </property>
    </bean>
</beans>
```

*Ignite Client XML config* (embedded into a JVM app that acts as HTTP server
and reads/writes key/value pairs from Ignite):

```
<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring bean definition of the IgniteConfiguration used by the embedded Ignite client. -->
<beans xmlns="http://www.springframework.org/schema/beans"
       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="
            http://www.springframework.org/schema/beans
            http://www.springframework.org/schema/beans/spring-beans.xsd">

    <bean class="org.apache.ignite.configuration.IgniteConfiguration">
        <!-- This node joins the cluster in client mode. -->
        <property name="clientMode" value="true"/>

        <property name="failureDetectionTimeout" value="10000"/>

        <!-- Discovery SPI: Kubernetes IP finder pointed at service "ignite" in namespace "buildcache". -->
        <property name="discoverySpi">
            <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
                <property name="ipFinder">
                    <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.kubernetes.TcpDiscoveryKubernetesIpFinder">
                        <property name="namespace" value="buildcache"/>
                        <property name="serviceName" value="ignite"/>
                    </bean>
                </property>

                <property name="joinTimeout" value="4500"/>

                <property name="networkTimeout" value="4000"/>
            </bean>
        </property>

        <!-- Communication SPI: bounded message queue. -->
        <property name="communicationSpi">
            <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
                <property name="messageQueueLimit" value="512"/>
            </bean>
        </property>

        <!-- Data storage: one default region named "Default_Region". -->
        <property name="dataStorageConfiguration">
            <bean class="org.apache.ignite.configuration.DataStorageConfiguration">
                <property name="pageSize" value="8192"/>

                <property name="defaultDataRegionConfiguration">
                    <bean class="org.apache.ignite.configuration.DataRegionConfiguration">
                        <property name="name" value="Default_Region"/>

                        <!-- 50 MiB; initialSize and maxSize are deliberately identical. -->
                        <property name="initialSize" value="#{50 * 1024 * 1024}"/>
                        <property name="maxSize" value="#{50 * 1024 * 1024}"/>
                    </bean>
                </property>
            </bean>
        </property>
    </bean>
</beans>
```

*Kubernetes Ignite StatefulSet*

```
# Service "ignite" in namespace "buildcache"; selects pods labelled app=ignite-node
# and exposes three named ports.
apiVersion: v1
kind: Service
metadata:
  namespace: buildcache
  name: ignite
spec:
  type: LoadBalancer
  selector:
    app: ignite-node
  ports:
    - port: 10900
      name: thinclients
    - port: 47100
      name: spicommunication
    - port: 47500
      name: discoveryspi
---
# StatefulSet allows us to start/stop pods one by one (important for Ignite)
# (and add disk persistence in future).
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: ignite
  namespace: buildcache
spec:
  replicas: 12
  selector:
    matchLabels:
      app: ignite-node
  serviceName: ignite
  template:
    metadata:
      labels:
        app: ignite-node
    spec:
      serviceAccountName: ignite
      volumes:
      # Server XML config taken from the "ignite-config" ConfigMap.
      - name: config
        configMap:
          name: ignite-config
          items:
          - key: ignite-server-xml
            path: ignite-server-config.xml
      containers:
      - name: ignite-node
        image: apacheignite/ignite:2.6.0
        env:
        - name: OPTION_LIBS
          value: "ignite-kubernetes,ignite-rest-http"
        - name: JVM_OPTS
          # See https://apacheignite.readme.io/docs/jvm-and-system-tuning
          # 24 GB is actual data limit (off heap), see ignite-config-map.
          # 25 GB as JVM off heap limit (JVM can also use off heap for its needs).
          # 4  GB as heap limit for intermediate operations (normally Ignite uses
          #       about 500 MB, but can jump to 1GB+ on topology change or load spike).
          # 30 GB as container limit (+1 GB of expected normal memory use just in case).
          #
          # The double-quoted value is folded by YAML: each line break below
          # becomes a single space, so the JVM receives one flat option string.
          value: "-server -Djava.net.preferIPv4Stack=true
            -XX:+UnlockExperimentalVMOptions -XX:+UseCGroupMemoryLimitForHeap
            -XX:MaxDirectMemorySize=25g -Xms4g -Xmx4g -XX:+UseG1GC -XX:+AlwaysPreTouch
            -XX:+ScavengeBeforeFullGC -XX:+DisableExplicitGC -XX:MaxGCPauseMillis=50
            -XX:G1NewSizePercent=20"
        - name: CONFIG_URI
          # Path of the mounted server config (see volumeMounts below).
          value: /config/ignite-server-config.xml
        - name: IGNITE_QUIET
          value: "false"
        resources:
          requests:
            memory: "30G"
            cpu: 1
          limits:
            memory: "30G"
            cpu: 2
        volumeMounts:
        - mountPath: /config
          name: config
        ports:
        - containerPort: 8080  # Ignite REST API.
        - containerPort: 47100 # Ignite communication SPI port.
        - containerPort: 47500 # Ignite discovery SPI port number.
        - containerPort: 10900 # Ignite Thin Clients Protocol.
        # Both probes query the REST API topology command on port 8080.
        readinessProbe:
          httpGet:
            path: /ignite?cmd=top
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 1
          periodSeconds: 5
          failureThreshold: 3
        # NOTE(review): failureThreshold 1 with timeoutSeconds 1 means one slow or
        # failed response is enough to fail the liveness check; confirm this is intended.
        livenessProbe:
          httpGet:
            path: /ignite?cmd=top
            port: 8080
          initialDelaySeconds: 20
          timeoutSeconds: 1
          periodSeconds: 2
          failureThreshold: 1
```

The Java service that acts as an Ignite client has similar GC tuning options,
and we currently run 4 client instances.

So far I see two problems with this deployment:

1) Ignite clients see sudden latency spikes for a subset of cache reads while
other reads are served fine. GC doesn't seem to be the issue, as I've not
noticed it in the logs and the heap has gigabytes available at that time.

2) Ignite server nodes fail one by one because they think that topology
segmentation has occurred; the logs look like the excerpt below. Note that
there is enough Java heap and the CPU load is low on the Ignite nodes.

At this point I think it's either a networking problem (but we allow Ignite
ports like 47100, 47500, 10900) or a bug in Ignite 2.6.0.

```
[19:12:12,527][INFO][grid-timeout-worker-#23][IgniteKernal] FreeList
[name=null, buckets=256, dataPages=36186, reusePages=702]
[19:12:13,852][INFO][tcp-disco-sock-reader-#116][TcpDiscoverySpi] Finished
serving remote node connection [rmtAddr=/10.42.46.176:56691, rmtPort=56691
[19:12:17,534][INFO][grid-timeout-worker-#23][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=8a06dcdd, uptime=01:07:04.517]
    ^-- H/N/C [hosts=7, nodes=7, CPUs=56]
    ^-- CPU [cur=1.97%, avg=1.2%, GC=0.1%]
    ^-- PageMemory [pages=2824373]
    ^-- Heap [used=511MB, free=87.51%, comm=4096MB]
    ^-- Non heap [used=75MB, free=-1%, comm=77MB]
    ^-- Outbound messages queue [size=0]
    ^-- Public thread pool [active=0, idle=2, qSize=0]
    ^-- System thread pool [active=0, idle=8, qSize=0]
[19:12:17,534][INFO][grid-timeout-worker-#23][IgniteKernal] FreeList
[name=null, buckets=256, dataPages=35867, reusePages=702]
[19:12:22,536][INFO][grid-timeout-worker-#23][IgniteKernal] 
Metrics for local node (to disable set 'metricsLogFrequency' to 0)
    ^-- Node [id=8a06dcdd, uptime=01:07:09.517]
    ^-- H/N/C [hosts=7, nodes=7, CPUs=56]
    ^-- CPU [cur=0.6%, avg=1.2%, GC=0%]
    ^-- PageMemory [pages=2824373]
    ^-- Heap [used=658MB, free=83.92%, comm=4096MB]
    ^-- Non heap [used=75MB, free=-1%, comm=77MB]
    ^-- Outbound messages queue [size=0]
    ^-- Public thread pool [active=0, idle=0, qSize=0]
    ^-- System thread pool [active=0, idle=8, qSize=0]
[19:12:22,536][INFO][grid-timeout-worker-#23][IgniteKernal] FreeList
[name=null, buckets=256, dataPages=36632, reusePages=702]
[19:12:24,699][INFO][tcp-comm-worker-#1][TcpDiscoverySpi] Pinging node:
74330756-9301-401b-bd7f-7d840e921d04
[19:12:24,700][INFO][tcp-comm-worker-#1][TcpDiscoverySpi] Finished node ping
[nodeId=74330756-9301-401b-bd7f-7d840e921d04, res=true, time=5ms]
[19:12:24,957][INFO][tcp-disco-srvr-#3][TcpDiscoverySpi] TCP discovery
accepted incoming connection [rmtAddr=/10.42.66.84, rmtPort=60023]
[19:12:24,957][INFO][tcp-disco-srvr-#3][TcpDiscoverySpi] TCP discovery
spawning a new thread for connection [rmtAddr=/10.42.66.84, rmtPort=60023]
[19:12:24,957][INFO][tcp-disco-sock-reader-#117][TcpDiscoverySpi] Started
serving remote node connection [rmtAddr=/10.42.66.84:60023, rmtPort=60023]
[19:12:24,958][WARNING][tcp-disco-msg-worker-#2][TcpDiscoverySpi] Node is
out of topology (probably, due to short-time network problems).
[19:12:24,958][INFO][tcp-disco-sock-reader-#117][TcpDiscoverySpi] Finished
serving remote node connection [rmtAddr=/10.42.66.84:60023, rmtPort=60023
[19:12:24,959][WARNING][disco-event-worker-#41][GridDiscoveryManager] Local
node SEGMENTED: TcpDiscoveryNode [id=8a06dcdd-3d20-4625-b23c-f7d33a2cfceb,
addrs=[10.42.32.222, 127.0.0.1],
sockAddrs=[ignite-0.ignite.buildcache.svc.cluster.local/10.42.32.222:47500,
/127.0.0.1:47500], discPort=47500, order=1, intOrder=1,
lastExchangeTime=1537989144955, loc=true, ver=2.6.0#20180710-sha1:669feacc,
isClient=false]
[19:12:24,960][SEVERE][tcp-disco-srvr-#3][] Critical system error detected.
Will be handled accordingly to configured handler [hnd=class
o.a.i.failure.StopNodeOrHaltFailureHandler, failureCtx=FailureContext
[type=SYSTEM_WORKER_TERMINATION, err=java.lang.IllegalStateException: Thread
tcp-disco-srvr-#3 is terminated unexpectedly.]]
java.lang.IllegalStateException: Thread tcp-disco-srvr-#3 is terminated
unexpectedly.
        at
org.apache.ignite.spi.discovery.tcp.ServerImpl$TcpServer.body(ServerImpl.java:5686)
        at org.apache.ignite.spi.IgniteSpiThread.run(IgniteSpiThread.java:62)
[19:12:24,961][SEVERE][tcp-disco-srvr-#3][] JVM will be halted immediately
due to the failure: [failureCtx=FailureContext
[type=SYSTEM_WORKER_TERMINATION, err=java.lang.IllegalStateException: Thread
tcp-disco-srvr-#3 is terminated unexpectedly.]]
```



--
Sent from: http://apache-ignite-users.70518.x6.nabble.com/

Reply via email to