[ https://issues.apache.org/jira/browse/HDFS-15240?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17240532#comment-17240532 ]
Hadoop QA commented on HDFS-15240: ---------------------------------- | (x) *{color:red}-1 overall{color}* | \\ \\ || Vote || Subsystem || Runtime || Logfile || Comment || | {color:blue}0{color} | {color:blue} reexec {color} | {color:blue} 1m 20s{color} | {color:blue}{color} | {color:blue} Docker mode activated. {color} | || || || || {color:brown} Prechecks {color} || || | {color:green}+1{color} | {color:green} dupname {color} | {color:green} 0m 0s{color} | {color:green}{color} | {color:green} No case conflicting files found. {color} | | {color:green}+1{color} | {color:green} @author {color} | {color:green} 0m 0s{color} | {color:green}{color} | {color:green} The patch does not contain any @author tags. {color} | | {color:green}+1{color} | {color:green} {color} | {color:green} 0m 0s{color} | {color:green}test4tests{color} | {color:green} The patch appears to include 2 new or modified test files. {color} | || || || || {color:brown} trunk Compile Tests {color} || || | {color:blue}0{color} | {color:blue} mvndep {color} | {color:blue} 0m 51s{color} | {color:blue}{color} | {color:blue} Maven dependency ordering for branch {color} | | {color:green}+1{color} | {color:green} mvninstall {color} | {color:green} 22m 36s{color} | {color:green}{color} | {color:green} trunk passed {color} | | {color:green}+1{color} | {color:green} compile {color} | {color:green} 21m 15s{color} | {color:green}{color} | {color:green} trunk passed with JDK Ubuntu-11.0.9.1+1-Ubuntu-0ubuntu1.18.04 {color} | | {color:green}+1{color} | {color:green} compile {color} | {color:green} 18m 0s{color} | {color:green}{color} | {color:green} trunk passed with JDK Private Build-1.8.0_275-8u275-b01-0ubuntu1~18.04-b01 {color} | | {color:green}+1{color} | {color:green} checkstyle {color} | {color:green} 2m 42s{color} | {color:green}{color} | {color:green} trunk passed {color} | | {color:green}+1{color} | {color:green} mvnsite {color} | {color:green} 2m 51s{color} | {color:green}{color} | {color:green} trunk passed {color} | | {color:green}+1{color} | {color:green} shadedclient {color} | {color:green} 22m 48s{color} | {color:green}{color} | {color:green} branch has no errors when building and testing our client artifacts. {color} | | {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 1m 57s{color} | {color:green}{color} | {color:green} trunk passed with JDK Ubuntu-11.0.9.1+1-Ubuntu-0ubuntu1.18.04 {color} | | {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 2m 58s{color} | {color:green}{color} | {color:green} trunk passed with JDK Private Build-1.8.0_275-8u275-b01-0ubuntu1~18.04-b01 {color} | | {color:blue}0{color} | {color:blue} spotbugs {color} | {color:blue} 3m 18s{color} | {color:blue}{color} | {color:blue} Used deprecated FindBugs config; considering switching to SpotBugs. {color} | | {color:green}+1{color} | {color:green} findbugs {color} | {color:green} 5m 30s{color} | {color:green}{color} | {color:green} trunk passed {color} | || || || || {color:brown} Patch Compile Tests {color} || || | {color:blue}0{color} | {color:blue} mvndep {color} | {color:blue} 0m 22s{color} | {color:blue}{color} | {color:blue} Maven dependency ordering for patch {color} | | {color:green}+1{color} | {color:green} mvninstall {color} | {color:green} 2m 3s{color} | {color:green}{color} | {color:green} the patch passed {color} | | {color:green}+1{color} | {color:green} compile {color} | {color:green} 20m 40s{color} | {color:green}{color} | {color:green} the patch passed with JDK Ubuntu-11.0.9.1+1-Ubuntu-0ubuntu1.18.04 {color} | | {color:green}+1{color} | {color:green} javac {color} | {color:green} 20m 40s{color} | {color:green}{color} | {color:green} the patch passed {color} | | {color:green}+1{color} | {color:green} compile {color} | {color:green} 18m 1s{color} | {color:green}{color} | {color:green} the patch passed with JDK Private Build-1.8.0_275-8u275-b01-0ubuntu1~18.04-b01 {color} | | {color:green}+1{color} | {color:green} javac {color} | {color:green} 18m 1s{color} | {color:green}{color} | {color:green} the patch passed {color} | | {color:green}+1{color} | {color:green} checkstyle {color} | {color:green} 2m 47s{color} | {color:green}{color} | {color:green} the patch passed {color} | | {color:green}+1{color} | {color:green} mvnsite {color} | {color:green} 3m 24s{color} | {color:green}{color} | {color:green} the patch passed {color} | | {color:green}+1{color} | {color:green} whitespace {color} | {color:green} 0m 1s{color} | {color:green}{color} | {color:green} The patch has no whitespace issues. {color} | | {color:green}+1{color} | {color:green} shadedclient {color} | {color:green} 18m 9s{color} | {color:green}{color} | {color:green} patch has no errors when building and testing our client artifacts. {color} | | {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 1m 59s{color} | {color:green}{color} | {color:green} the patch passed with JDK Ubuntu-11.0.9.1+1-Ubuntu-0ubuntu1.18.04 {color} | | {color:green}+1{color} | {color:green} javadoc {color} | {color:green} 3m 18s{color} | {color:green}{color} | {color:green} the patch passed with JDK Private Build-1.8.0_275-8u275-b01-0ubuntu1~18.04-b01 {color} | | {color:green}+1{color} | {color:green} findbugs {color} | {color:green} 5m 53s{color} | {color:green}{color} | {color:green} the patch passed {color} | || || || || {color:brown} Other Tests {color} || || | {color:green}+1{color} | {color:green} unit {color} | {color:green} 9m 53s{color} | {color:green}{color} | {color:green} hadoop-common in the patch passed. {color} | | {color:red}-1{color} | {color:red} unit {color} | {color:red}126m 9s{color} | {color:red}https://ci-hadoop.apache.org/job/PreCommit-HDFS-Build/321/artifact/out/patch-unit-hadoop-hdfs-project_hadoop-hdfs.txt{color} | {color:red} hadoop-hdfs in the patch passed. {color} | | {color:green}+1{color} | {color:green} asflicense {color} | {color:green} 1m 3s{color} | {color:green}{color} | {color:green} The patch does not generate ASF License warnings. {color} | | {color:black}{color} | {color:black} {color} | {color:black}314m 22s{color} | {color:black}{color} | {color:black}{color} | \\ \\ || Reason || Tests || | Failed junit tests | hadoop.hdfs.server.namenode.ha.TestStandbyCheckpoints | | | hadoop.hdfs.server.balancer.TestBalancer | | | hadoop.hdfs.TestRollingUpgrade | \\ \\ || Subsystem || Report/Notes || | Docker | ClientAPI=1.40 ServerAPI=1.40 base: https://ci-hadoop.apache.org/job/PreCommit-HDFS-Build/321/artifact/out/Dockerfile | | JIRA Issue | HDFS-15240 | | JIRA Patch URL | https://issues.apache.org/jira/secure/attachment/13016191/HDFS-15240.012.patch | | Optional Tests | dupname asflicense compile javac javadoc mvninstall mvnsite unit shadedclient findbugs checkstyle | | uname | Linux 4a7b019fda3e 4.15.0-112-generic #113-Ubuntu SMP Thu Jul 9 23:41:39 UTC 2020 x86_64 x86_64 x86_64 GNU/Linux | | Build tool | maven | | Personality | personality/hadoop.sh | | git revision | trunk / 4d2ae5b3989 | | Default Java | Private Build-1.8.0_275-8u275-b01-0ubuntu1~18.04-b01 | | Multi-JDK versions | /usr/lib/jvm/java-11-openjdk-amd64:Ubuntu-11.0.9.1+1-Ubuntu-0ubuntu1.18.04 /usr/lib/jvm/java-8-openjdk-amd64:Private Build-1.8.0_275-8u275-b01-0ubuntu1~18.04-b01 | | Test Results | https://ci-hadoop.apache.org/job/PreCommit-HDFS-Build/321/testReport/ | | Max. process+thread count | 2828 (vs. ulimit of 5500) | | modules | C: hadoop-common-project/hadoop-common hadoop-hdfs-project/hadoop-hdfs U: . | | Console output | https://ci-hadoop.apache.org/job/PreCommit-HDFS-Build/321/console | | versions | git=2.17.1 maven=3.6.0 findbugs=4.0.6 | | Powered by | Apache Yetus 0.13.0-SNAPSHOT https://yetus.apache.org | This message was automatically generated. > Erasure Coding: dirty buffer causes reconstruction block error > -------------------------------------------------------------- > > Key: HDFS-15240 > URL: https://issues.apache.org/jira/browse/HDFS-15240 > Project: Hadoop HDFS > Issue Type: Bug > Components: datanode, erasure-coding > Reporter: HuangTao > Assignee: HuangTao > Priority: Major > Attachments: HDFS-15240.001.patch, HDFS-15240.002.patch, > HDFS-15240.003.patch, HDFS-15240.004.patch, HDFS-15240.005.patch, > HDFS-15240.006.patch, HDFS-15240.007.patch, HDFS-15240.008.patch, > HDFS-15240.009.patch, HDFS-15240.010.patch, HDFS-15240.011.patch, > HDFS-15240.012.patch, image-2020-07-16-15-56-38-608.png, > org.apache.hadoop.hdfs.TestReconstructStripedFile-output.txt, > org.apache.hadoop.hdfs.TestReconstructStripedFile.txt, > test-HDFS-15240.006.patch > > > When read some lzo files we found some blocks were broken. > I read back all internal blocks(b0-b8) of the block group(RS-6-3-1024k) from > DN directly, and choose 6(b0-b5) blocks to decode other 3(b6', b7', b8') > blocks. And find the longest common sequenece(LCS) between b6'(decoded) and > b6(read from DN)(b7'/b7 and b8'/b8). > After selecting 6 blocks of the block group in combinations one time and > iterating through all cases, I find one case that the length of LCS is the > block length - 64KB, 64KB is just the length of ByteBuffer used by > StripedBlockReader. So the corrupt reconstruction block is made by a dirty > buffer. > The following log snippet(only show 2 of 28 cases) is my check program > output. In my case, I known the 3th block is corrupt, so need other 5 blocks > to decode another 3 blocks, then find the 1th block's LCS substring is block > length - 64kb. > It means (0,1,2,4,5,6)th blocks were used to reconstruct 3th block, and the > dirty buffer was used before read the 1th block. > Must be noted that StripedBlockReader read from the offset 0 of the 1th block > after used the dirty buffer. > EDITED for readability. > {code:java} > decode from block[0, 2, 3, 4, 5, 7] to generate block[1', 6', 8'] > Check the first 131072 bytes between block[1] and block[1'], the longest > common substring length is 4 > Check the first 131072 bytes between block[6] and block[6'], the longest > common substring length is 4 > Check the first 131072 bytes between block[8] and block[8'], the longest > common substring length is 4 > decode from block[0, 2, 3, 4, 5, 6] to generate block[1', 7', 8'] > Check the first 131072 bytes between block[1] and block[1'], the longest > common substring length is 65536 > CHECK AGAIN: all 27262976 bytes between block[1] and block[1'], the longest > common substring length is 27197440 # this one > Check the first 131072 bytes between block[7] and block[7'], the longest > common substring length is 4 > Check the first 131072 bytes between block[8] and block[8'], the longest > common substring length is 4{code} > Now I know the dirty buffer causes reconstruction block error, but how does > the dirty buffer come about? > After digging into the code and DN log, I found this following DN log is the > root reason. > {code:java} > [INFO] [stripedRead-1017] : Interrupted while waiting for IO on channel > java.nio.channels.SocketChannel[connected local=/xxxxxxxx:52586 > remote=/xxxxxxxx:50010]. 180000 millis timeout left. > [WARN] [StripedBlockReconstruction-199] : Failed to reconstruct striped > block: BP-714356632-xxxxxxxx-1519726836856:blk_-YYYYYYYYYYYYYY_3472979393 > java.lang.NullPointerException > at > org.apache.hadoop.hdfs.util.StripedBlockUtil.getNextCompletedStripedRead(StripedBlockUtil.java:314) > at > org.apache.hadoop.hdfs.server.datanode.erasurecode.StripedReader.doReadMinimumSources(StripedReader.java:308) > at > org.apache.hadoop.hdfs.server.datanode.erasurecode.StripedReader.readMinimumSources(StripedReader.java:269) > at > org.apache.hadoop.hdfs.server.datanode.erasurecode.StripedBlockReconstructor.reconstruct(StripedBlockReconstructor.java:94) > at > org.apache.hadoop.hdfs.server.datanode.erasurecode.StripedBlockReconstructor.run(StripedBlockReconstructor.java:60) > at > java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515) > at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:834) {code} > Reading from DN may timeout(hold by a future(F)) and output the INFO log, but > the futures that contains the future(F) is cleared, > {code:java} > return new StripingChunkReadResult(futures.remove(future), > StripingChunkReadResult.CANCELLED); {code} > futures.remove(future) cause NPE. So the EC reconstruction is failed. In the > finally phase, the code snippet in *getStripedReader().close()* > {code:java} > reconstructor.freeBuffer(reader.getReadBuffer()); > reader.freeReadBuffer(); > reader.closeBlockReader(); {code} > free buffer firstly, but the StripedBlockReader still holds the buffer and > write it, that pollute the buffer of BufferPool. -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org