[ https://issues.apache.org/jira/browse/HDFS-16292?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17437084#comment-17437084 ]

tomscut edited comment on HDFS-16292 at 11/2/21, 12:57 AM:
-----------------------------------------------------------

Coincidentally, our cluster also hit this problem yesterday, even though our 
version is 3.1.0, which already includes the HDFS-10223 patch.

Client stack:

!image-2021-11-02-08-54-27-273.png|width=607,height=341!
{code:java}
"Executor task launch worker for task 2690" #47 daemon prio=5 os_prio=0 tid=0x00007f3730286800 nid=0x1abc4 runnable [0x00007f37109ed000]
   java.lang.Thread.State: RUNNABLE
	at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
	at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
	at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
	at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
	- locked <0x00000006cb9cf3a0> (a sun.nio.ch.Util$2)
	- locked <0x00000006cb9cf390> (a java.util.Collections$UnmodifiableSet)
	- locked <0x00000006cb9cf168> (a sun.nio.ch.EPollSelectorImpl)
	at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
	at org.apache.hadoop.net.SocketIOWithTimeout$SelectorPool.select(SocketIOWithTimeout.java:335)
	at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:157)
	at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:161)
	at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:131)
	at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:118)
	at java.io.FilterInputStream.read(FilterInputStream.java:83)
	at org.apache.hadoop.hdfs.protocolPB.PBHelperClient.vintPrefixed(PBHelperClient.java:547)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderRemote.newBlockReader(BlockReaderRemote.java:407)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReader(BlockReaderFactory.java:853)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.getRemoteBlockReaderFromTcp(BlockReaderFactory.java:749)
	at org.apache.hadoop.hdfs.client.impl.BlockReaderFactory.build(BlockReaderFactory.java:379)
	at org.apache.hadoop.hdfs.DFSInputStream.getBlockReader(DFSInputStream.java:669)
	at org.apache.hadoop.hdfs.DFSInputStream.actualGetFromOneDataNode(DFSInputStream.java:1117)
	at org.apache.hadoop.hdfs.DFSInputStream.fetchBlockByteRange(DFSInputStream.java:1069)
	at org.apache.hadoop.hdfs.DFSInputStream.pread(DFSInputStream.java:1501)
	at org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:1465)
	at org.apache.hadoop.fs.FSInputStream.readFully(FSInputStream.java:121)
	at org.apache.hadoop.fs.FSDataInputStream.readFully(FSDataInputStream.java:111)
	at org.apache.orc.impl.RecordReaderUtils.readDiskRanges(RecordReaderUtils.java:566)
	at org.apache.orc.impl.RecordReaderUtils$DefaultDataReader.readRowIndex(RecordReaderUtils.java:219)
	at org.apache.orc.impl.RecordReaderImpl.readRowIndex(RecordReaderImpl.java:1419)
	at org.apache.orc.impl.RecordReaderImpl.readRowIndex(RecordReaderImpl.java:1402)
	at org.apache.orc.impl.RecordReaderImpl.pickRowGroups(RecordReaderImpl.java:1056)
	at org.apache.orc.impl.RecordReaderImpl.readStripe(RecordReaderImpl.java:1087)
	at org.apache.orc.impl.RecordReaderImpl.advanceStripe(RecordReaderImpl.java:1254)
	at org.apache.orc.impl.RecordReaderImpl.advanceToNextRow(RecordReaderImpl.java:1289)
	at org.apache.orc.impl.RecordReaderImpl.nextBatch(RecordReaderImpl.java:1325)
	at org.apache.spark.sql.execution.datasources.orc.OrcColumnarBatchReader.nextBatch(OrcColumnarBatchReader.java:196)
	at org.apache.spark.sql.execution.datasources.orc.OrcColumnarBatchReader.nextKeyValue(OrcColumnarBatchReader.java:99)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1$$anon$2.getNext(FileScanRDD.scala:145)
	at org.apache.spark.util.NextIterator.hasNext(NextIterator.scala:73)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:492)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.shuffle.sort.UnsafeShuffleWriter.write(UnsafeShuffleWriter.java:181)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$386/1797753089.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

   Locked ownable synchronizers:
	- <0x00000006c06fa4c8> (a java.util.concurrent.ThreadPoolExecutor$Worker)
{code}



> The DFS Input Stream is waiting to be read
> ------------------------------------------
>
>                 Key: HDFS-16292
>                 URL: https://issues.apache.org/jira/browse/HDFS-16292
>             Project: Hadoop HDFS
>          Issue Type: Improvement
>          Components: datanode
>    Affects Versions: 2.5.2
>            Reporter: Hualong Zhang
>            Priority: Minor
>         Attachments: HDFS-16292.path, image-2021-11-01-18-36-54-329.png, 
> image-2021-11-02-08-54-27-273.png
>
>
> The input stream has been waiting to read. The problem seems to be that 
> BlockReaderPeer#peer does not set a read timeout or a write timeout. We can 
> solve this by setting the timeouts in BlockReaderFactory#nextTcpPeer (see the 
> sketch below).
> Jstack as follows:
> !image-2021-11-01-18-36-54-329.png!
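
To illustrate the approach described above, here is a minimal sketch (not the actual HDFS-16292 patch): apply the client's socket timeout to a freshly connected peer before it is handed to the block reader. It assumes the Peer interface's setReadTimeout/setWriteTimeout methods and DfsClientConf#getSocketTimeout(); the helper class name is made up for illustration.
{code:java}
import java.io.IOException;

import org.apache.hadoop.hdfs.client.impl.DfsClientConf;
import org.apache.hadoop.hdfs.net.Peer;

// Hypothetical helper, for illustration only.
public final class PeerTimeoutUtil {
  private PeerTimeoutUtil() {}

  /**
   * Apply the client-side socket timeout to a newly connected peer so a
   * stalled DataNode cannot leave the reader blocked indefinitely in
   * SocketInputStream.read() (the state seen in the jstack above).
   */
  public static void applySocketTimeouts(Peer peer, DfsClientConf conf)
      throws IOException {
    int timeoutMs = conf.getSocketTimeout(); // dfs.client.socket-timeout
    peer.setReadTimeout(timeoutMs);
    peer.setWriteTimeout(timeoutMs);
  }
}
{code}
Something along these lines would be invoked from BlockReaderFactory#nextTcpPeer right after the peer is created, so that a remote block read fails with a SocketTimeoutException instead of hanging.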


