[ https://issues.apache.org/jira/browse/FLINK-16417?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17051901#comment-17051901 ]
Robert Metzger commented on FLINK-16417: ---------------------------------------- The calculated off-heap size is 130 MB. The test passes on JDK 11 when setting it to 270 MB. The test fails on JDK 11 when setting it to 135 MB. The test passes on JDK 11 when setting it to 160 MB. I will open a PR that sets it to 160 MB, ok? > ConnectedComponents iterations with high parallelism end-to-end test fails > with OutOfMemoryError: Direct buffer memory > ---------------------------------------------------------------------------------------------------------------------- > > Key: FLINK-16417 > URL: https://issues.apache.org/jira/browse/FLINK-16417 > Project: Flink > Issue Type: Bug > Components: API / DataSet, Tests > Reporter: Robert Metzger > Assignee: Robert Metzger > Priority: Major > Labels: test-stability > > Logs: > https://dev.azure.com/georgeryan1322/Flink/_build/results?buildId=74&view=logs&j=1f3ed471-1849-5d3c-a34c-19792af4ad16&t=ce095137-3e3b-5f73-4b79-c42d3d5f8283 > {code} > 2020-03-04T08:03:46.0786078Z 2020-03-04 08:03:42,628 INFO > org.apache.flink.runtime.iterative.task.IterationIntermediateTask [] - > starting iteration [1]: Reduce (MIN(1), at > main(HighParallelismIterationsTestProgram.java:61) (12/25) > 2020-03-04T08:03:46.0787503Z 2020-03-04 08:03:42,875 ERROR > org.apache.flink.runtime.io.network.netty.PartitionRequestQueue [] - > Encountered error while consuming partitions > 2020-03-04T08:03:46.0788060Z java.lang.OutOfMemoryError: Direct buffer memory > 2020-03-04T08:03:46.0788460Z at java.nio.Bits.reserveMemory(Bits.java:175) > ~[?:?] > 2020-03-04T08:03:46.0788904Z at > java.nio.DirectByteBuffer.<init>(DirectByteBuffer.java:118) ~[?:?] > 2020-03-04T08:03:46.0789537Z at > java.nio.ByteBuffer.allocateDirect(ByteBuffer.java:317) ~[?:?] 
> 2020-03-04T08:03:46.0790381Z at > org.apache.flink.shaded.netty4.io.netty.buffer.PoolArena$DirectArena.allocateDirect(PoolArena.java:772) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0791491Z at > org.apache.flink.shaded.netty4.io.netty.buffer.PoolArena$DirectArena.newChunk(PoolArena.java:748) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0792483Z at > org.apache.flink.shaded.netty4.io.netty.buffer.PoolArena.allocateNormal(PoolArena.java:245) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0793416Z at > org.apache.flink.shaded.netty4.io.netty.buffer.PoolArena.allocate(PoolArena.java:215) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0794359Z at > org.apache.flink.shaded.netty4.io.netty.buffer.PoolArena.allocate(PoolArena.java:147) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0795385Z at > org.apache.flink.shaded.netty4.io.netty.buffer.PooledByteBufAllocator.newDirectBuffer(PooledByteBufAllocator.java:342) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0796471Z at > org.apache.flink.shaded.netty4.io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:187) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0797575Z at > org.apache.flink.shaded.netty4.io.netty.buffer.AbstractByteBufAllocator.directBuffer(AbstractByteBufAllocator.java:178) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0798718Z at > org.apache.flink.shaded.netty4.io.netty.channel.unix.PreferredDirectByteBufAllocator.ioBuffer(PreferredDirectByteBufAllocator.java:53) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0799951Z at > org.apache.flink.shaded.netty4.io.netty.channel.DefaultMaxMessagesRecvByteBufAllocator$MaxMessageHandle.allocate(DefaultMaxMessagesRecvByteBufAllocator.java:114) > 
~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0801172Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollRecvByteAllocatorHandle.allocate(EpollRecvByteAllocatorHandle.java:75) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0802572Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady(AbstractEpollStreamChannel.java:779) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0803719Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:424) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0804763Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:326) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0806007Z at > org.apache.flink.shaded.netty4.io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:918) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0807050Z at > org.apache.flink.shaded.netty4.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0807612Z at java.lang.Thread.run(Thread.java:834) [?:?] > 2020-03-04T08:03:46.0808499Z 2020-03-04 08:03:43,572 ERROR > org.apache.flink.runtime.operators.BatchTask [] - Error in > task code: Reduce (MIN(1), at > main(HighParallelismIterationsTestProgram.java:61) (5/25) > 2020-03-04T08:03:46.0810179Z java.lang.Exception: The data preparation for > task 'Reduce (MIN(1), at main(HighParallelismIterationsTestProgram.java:61)' > , caused an error: Error obtaining the sorted input: Thread 'SortMerger > Reading Thread' terminated due to an exception: readAddress(..) 
failed: > Connection reset by peer (connection to '10.1.0.4/10.1.0.4:44453') > 2020-03-04T08:03:46.0811472Z at > org.apache.flink.runtime.operators.BatchTask.run(BatchTask.java:480) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0813477Z at > org.apache.flink.runtime.iterative.task.AbstractIterativeTask.run(AbstractIterativeTask.java:157) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0814813Z at > org.apache.flink.runtime.iterative.task.IterationIntermediateTask.run(IterationIntermediateTask.java:107) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0816257Z at > org.apache.flink.runtime.operators.BatchTask.invoke(BatchTask.java:369) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0817111Z at > org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:717) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0817911Z at > org.apache.flink.runtime.taskmanager.Task.run(Task.java:541) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0818381Z at java.lang.Thread.run(Thread.java:834) [?:?] > 2020-03-04T08:03:46.0819353Z Caused by: java.lang.RuntimeException: Error > obtaining the sorted input: Thread 'SortMerger Reading Thread' terminated due > to an exception: readAddress(..) 
failed: Connection reset by peer (connection > to '10.1.0.4/10.1.0.4:44453') > 2020-03-04T08:03:46.0820498Z at > org.apache.flink.runtime.operators.sort.UnilateralSortMerger.getIterator(UnilateralSortMerger.java:650) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0821448Z at > org.apache.flink.runtime.operators.BatchTask.getInput(BatchTask.java:1110) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0822376Z at > org.apache.flink.runtime.operators.GroupReduceDriver.prepare(GroupReduceDriver.java:99) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0823248Z at > org.apache.flink.runtime.operators.BatchTask.run(BatchTask.java:474) > [flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0823661Z ... 6 more > 2020-03-04T08:03:46.0824426Z Caused by: java.io.IOException: Thread > 'SortMerger Reading Thread' terminated due to an exception: readAddress(..) > failed: Connection reset by peer (connection to '10.1.0.4/10.1.0.4:44453') > 2020-03-04T08:03:46.0825507Z at > org.apache.flink.runtime.operators.sort.UnilateralSortMerger$ThreadBase.run(UnilateralSortMerger.java:831) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0826579Z Caused by: > org.apache.flink.runtime.io.network.netty.exception.LocalTransportException: > readAddress(..) 
failed: Connection reset by peer (connection to > '10.1.0.4/10.1.0.4:44453') > 2020-03-04T08:03:46.0827970Z at > org.apache.flink.runtime.io.network.netty.CreditBasedPartitionRequestClientHandler.exceptionCaught(CreditBasedPartitionRequestClientHandler.java:165) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0829232Z at > org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0830423Z at > org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:276) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0831611Z at > org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:268) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0832773Z at > org.apache.flink.shaded.netty4.io.netty.channel.DefaultChannelPipeline$HeadContext.exceptionCaught(DefaultChannelPipeline.java:1388) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0834969Z at > org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:297) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0836413Z at > org.apache.flink.shaded.netty4.io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:276) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0838310Z at > org.apache.flink.shaded.netty4.io.netty.channel.DefaultChannelPipeline.fireExceptionCaught(DefaultChannelPipeline.java:918) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0839629Z at > 
org.apache.flink.shaded.netty4.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.handleReadException(AbstractEpollStreamChannel.java:730) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0841070Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.AbstractEpollStreamChannel$EpollStreamUnsafe.epollInReady(AbstractEpollStreamChannel.java:820) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0842211Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.processReady(EpollEventLoop.java:424) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0843214Z at > org.apache.flink.shaded.netty4.io.netty.channel.epoll.EpollEventLoop.run(EpollEventLoop.java:326) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0844284Z at > org.apache.flink.shaded.netty4.io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:918) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0845351Z at > org.apache.flink.shaded.netty4.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) > ~[flink-dist_2.11-1.11-SNAPSHOT.jar:1.11-SNAPSHOT] > 2020-03-04T08:03:46.0845828Z ... 1 more > 2020-03-04T08:03:46.0846253Z Caused by: > org.apache.flink.shaded.netty4.io.netty.channel.unix.Errors$NativeIoException: > readAddress(..) failed: Connection reset by peer > {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)