[ https://issues.apache.org/jira/browse/RATIS-2143?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
weiming updated RATIS-2143: --------------------------- Description: In our ozone cluster, a DN was found in the SCM page to be in the DEAD state. When restarting, the DN could not start normally, and an off-heap memory OOM was found in the log. ENV: ratis version release-3.0.1 JDK: openjdk 17.0.2 2022-01-18 OpenJDK Runtime Environment (build 17.0.2+8-86) OpenJDK 64-Bit Server VM (build 17.0.2+8-86, mixed mode, sharing) Ozone DN JVM param: {code:java} // code placeholder export OZONE_DATANODE_OPTS="-Xms24g -Xmx48g -Xmn16g -XX:MetaspaceSize=512m -XX:MaxDirectMemorySize=48g -XX:+UseG1GC -XX:MaxGCPauseMillis=60 -XX:ParallelGCThreads=32 -XX:ConcGCThreads=16 -XX:+AlwaysPreTouch -XX:+TieredCompilation -XX:+UseStringDeduplication -XX:+OptimizeStringConcat -XX:G1HeapRegionSize=32M -XX:+ParallelRefProcEnabled -XX:ReservedCodeCacheSize=1024M -XX:+UnlockExperimentalVMOptions -XX:G1MixedGCLiveThresholdPercent=85 -XX:G1HeapWastePercent=10 -XX:InitiatingHeapOccupancyPercent=40 -XX:-G1UseAdaptiveIHOP -verbose:gc -XX:+PrintGCDetails -XX:+PrintGC -XX:+ExitOnOutOfMemoryError -Dorg.apache.ratis.thirdparty.io.netty.tryReflectionSetAccessible=true -Xlog:gc*=info:file=${OZONE_LOG_DIR}/dn_gc-%p.log:time,level,tags:filecount=50,filesize=100M -XX:NativeMemoryTracking=detail " {code} ERROR LOG: java.lang.OutOfMemoryError: Cannot reserve 8192 bytes of direct buffer memory (allocated: 51539599490, limit: 51539607552) at java.base/java.nio.Bits.reserveMemory(Bits.java:178) at java.base/java.nio.DirectByteBuffer.<init>(DirectByteBuffer.java:121) at java.base/java.nio.ByteBuffer.allocateDirect(ByteBuffer.java:332) at java.base/sun.nio.ch.Util.getTemporaryDirectBuffer(Util.java:243) at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:293) at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:273) at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:232) at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:65) at 
java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:107) at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:101) at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244) at java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:284) at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:343) at java.base/java.io.FilterInputStream.read(FilterInputStream.java:132) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogReader$LimitedInputStream.read(SegmentedRaftLogReader.java:96) at java.base/java.io.DataInputStream.read(DataInputStream.java:151) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogReader.verifyHeader(SegmentedRaftLogReader.java:172) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream.init(SegmentedRaftLogInputStream.java:95) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream.nextEntry(SegmentedRaftLogInputStream.java:122) at org.apache.ratis.server.raftlog.segmented.LogSegment.readSegmentFile(LogSegment.java:131) at org.apache.ratis.server.raftlog.segmented.LogSegment$LogEntryLoader.load(LogSegment.java:236) at org.apache.ratis.server.raftlog.segmented.LogSegment.loadCache(LogSegment.java:346) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLog.get(SegmentedRaftLog.java:295) at org.apache.ratis.server.impl.StateMachineUpdater.applyLog(StateMachineUpdater.java:236) at org.apache.ratis.server.impl.StateMachineUpdater.run(StateMachineUpdater.java:186) at java.base/java.lang.Thread.run(Thread.java:833) !image-2024-08-21-15-17-45-705.png! 
was: ENV: ratis version release-3.0.1 JDK: openjdk 17.0.2 2022-01-18 OpenJDK Runtime Environment (build 17.0.2+8-86) OpenJDK 64-Bit Server VM (build 17.0.2+8-86, mixed mode, sharing) Ozone DN JVM param: {code:java} // code placeholder export OZONE_DATANODE_OPTS="-Xms24g -Xmx48g -Xmn16g -XX:MetaspaceSize=512m -XX:MaxDirectMemorySize=48g -XX:+UseG1GC -XX:MaxGCPauseMillis=60 -XX:ParallelGCThreads=32 -XX:ConcGCThreads=16 -XX:+AlwaysPreTouch -XX:+TieredCompilation -XX:+UseStringDeduplication -XX:+OptimizeStringConcat -XX:G1HeapRegionSize=32M -XX:+ParallelRefProcEnabled -XX:ReservedCodeCacheSize=1024M -XX:+UnlockExperimentalVMOptions -XX:G1MixedGCLiveThresholdPercent=85 -XX:G1HeapWastePercent=10 -XX:InitiatingHeapOccupancyPercent=40 -XX:-G1UseAdaptiveIHOP -verbose:gc -XX:+PrintGCDetails -XX:+PrintGC -XX:+ExitOnOutOfMemoryError -Dorg.apache.ratis.thirdparty.io.netty.tryReflectionSetAccessible=true -Xlog:gc*=info:file=${OZONE_LOG_DIR}/dn_gc-%p.log:time,level,tags:filecount=50,filesize=100M -XX:NativeMemoryTracking=detail " {code} ERROR LOG: java.lang.OutOfMemoryError: Cannot reserve 8192 bytes of direct buffer memory (allocated: 51539599490, limit: 51539607552) at java.base/java.nio.Bits.reserveMemory(Bits.java:178) at java.base/java.nio.DirectByteBuffer.<init>(DirectByteBuffer.java:121) at java.base/java.nio.ByteBuffer.allocateDirect(ByteBuffer.java:332) at java.base/sun.nio.ch.Util.getTemporaryDirectBuffer(Util.java:243) at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:293) at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:273) at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:232) at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:65) at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:107) at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:101) at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244) at java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:284) at 
java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:343) at java.base/java.io.FilterInputStream.read(FilterInputStream.java:132) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogReader$LimitedInputStream.read(SegmentedRaftLogReader.java:96) at java.base/java.io.DataInputStream.read(DataInputStream.java:151) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogReader.verifyHeader(SegmentedRaftLogReader.java:172) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream.init(SegmentedRaftLogInputStream.java:95) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream.nextEntry(SegmentedRaftLogInputStream.java:122) at org.apache.ratis.server.raftlog.segmented.LogSegment.readSegmentFile(LogSegment.java:131) at org.apache.ratis.server.raftlog.segmented.LogSegment$LogEntryLoader.load(LogSegment.java:236) at org.apache.ratis.server.raftlog.segmented.LogSegment.loadCache(LogSegment.java:346) at org.apache.ratis.server.raftlog.segmented.SegmentedRaftLog.get(SegmentedRaftLog.java:295) at org.apache.ratis.server.impl.StateMachineUpdater.applyLog(StateMachineUpdater.java:236) at org.apache.ratis.server.impl.StateMachineUpdater.run(StateMachineUpdater.java:186) at java.base/java.lang.Thread.run(Thread.java:833) !image-2024-08-21-15-17-45-705.png! > Off-heap memory oom issue in SegmentedRaftLogReader > --------------------------------------------------- > > Key: RATIS-2143 > URL: https://issues.apache.org/jira/browse/RATIS-2143 > Project: Ratis > Issue Type: Bug > Affects Versions: 3.0.1 > Reporter: weiming > Priority: Major > Attachments: image-2024-08-21-15-17-45-705.png > > > In our ozone cluster, a DN was found in the SCM page to be in the DEAD state. > When restarting, the DN could not start normally, and an off-heap memory OOM > was found in the log. 
> > ENV: > ratis version release-3.0.1 > > JDK: > openjdk 17.0.2 2022-01-18 > OpenJDK Runtime Environment (build 17.0.2+8-86) > OpenJDK 64-Bit Server VM (build 17.0.2+8-86, mixed mode, sharing) > > Ozone DN JVM param: > {code:java} > // code placeholder > export OZONE_DATANODE_OPTS="-Xms24g -Xmx48g -Xmn16g -XX:MetaspaceSize=512m > -XX:MaxDirectMemorySize=48g -XX:+UseG1GC -XX:MaxGCPauseMillis=60 > -XX:ParallelGCThreads=32 -XX:ConcGCThreads=16 > -XX:+AlwaysPreTouch -XX:+TieredCompilation -XX:+UseStringDeduplication > -XX:+OptimizeStringConcat -XX:G1HeapRegionSize=32M > -XX:+ParallelRefProcEnabled -XX:ReservedCodeCacheSize=1024M > -XX:+UnlockExperimentalVMOptions > -XX:G1MixedGCLiveThresholdPercent=85 -XX:G1HeapWastePercent=10 > -XX:InitiatingHeapOccupancyPercent=40 -XX:-G1UseAdaptiveIHOP -verbose:gc > -XX:+PrintGCDetails -XX:+PrintGC -XX:+ExitOnOutOfMemoryError > -Dorg.apache.ratis.thirdparty.io.netty.tryReflectionSetAccessible=true > -Xlog:gc*=info:file=${OZONE_LOG_DIR}/dn_gc-%p.log:time,level,tags:filecount=50,filesize=100M > -XX:NativeMemoryTracking=detail " {code} > > ERROR LOG: > > java.lang.OutOfMemoryError: Cannot reserve 8192 bytes of direct buffer memory > (allocated: 51539599490, limit: 51539607552) > at java.base/java.nio.Bits.reserveMemory(Bits.java:178) > at java.base/java.nio.DirectByteBuffer.<init>(DirectByteBuffer.java:121) > at java.base/java.nio.ByteBuffer.allocateDirect(ByteBuffer.java:332) > at java.base/sun.nio.ch.Util.getTemporaryDirectBuffer(Util.java:243) > at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:293) > at java.base/sun.nio.ch.IOUtil.read(IOUtil.java:273) > at java.base/sun.nio.ch.FileChannelImpl.read(FileChannelImpl.java:232) > at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:65) > at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:107) > at java.base/sun.nio.ch.ChannelInputStream.read(ChannelInputStream.java:101) > at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:244) > at 
java.base/java.io.BufferedInputStream.read1(BufferedInputStream.java:284) > at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:343) > at java.base/java.io.FilterInputStream.read(FilterInputStream.java:132) > at > org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogReader$LimitedInputStream.read(SegmentedRaftLogReader.java:96) > at java.base/java.io.DataInputStream.read(DataInputStream.java:151) > at > org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogReader.verifyHeader(SegmentedRaftLogReader.java:172) > at > org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream.init(SegmentedRaftLogInputStream.java:95) > at > org.apache.ratis.server.raftlog.segmented.SegmentedRaftLogInputStream.nextEntry(SegmentedRaftLogInputStream.java:122) > at > org.apache.ratis.server.raftlog.segmented.LogSegment.readSegmentFile(LogSegment.java:131) > at > org.apache.ratis.server.raftlog.segmented.LogSegment$LogEntryLoader.load(LogSegment.java:236) > at > org.apache.ratis.server.raftlog.segmented.LogSegment.loadCache(LogSegment.java:346) > at > org.apache.ratis.server.raftlog.segmented.SegmentedRaftLog.get(SegmentedRaftLog.java:295) > at > org.apache.ratis.server.impl.StateMachineUpdater.applyLog(StateMachineUpdater.java:236) > at > org.apache.ratis.server.impl.StateMachineUpdater.run(StateMachineUpdater.java:186) > at java.base/java.lang.Thread.run(Thread.java:833) > !image-2024-08-21-15-17-45-705.png! -- This message was sent by Atlassian Jira (v8.20.10#820010)