[ https://issues.apache.org/jira/browse/HDFS-13596?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16826162#comment-16826162 ]
Yuriy Malygin edited comment on HDFS-13596 at 4/25/19 3:31 PM: --------------------------------------------------------------- I'm testing last patch on a cluster with Kerberos: # download sources from [https://github.com/apache/hadoop] # apply patch _HDFS-13596.007.patch_ # build patched version of _hadoop-3.3.0-SNAPSHOT_ # prepare running cluster (_hadoop-2.7.3_) to _rollingUpgrade_ # run _hadoop-3.3.0-SNAPSHOT_ in _rollingUpgrade_ mode # upload test data to HDFS # restart NN and all is fine # stop cluster for rollback to hadoop-2.7.3 # hadoop-2.7.3 NN start fails with _ArrayIndexOutOfBoundsException_: {code:java} INFO | jvm 1 | 2019/04/25 18:01:24 | STARTUP_MSG: build = https://git-wip-us.apache.org/repos/asf/hadoop.git -r baa91f7c6bc9cb92be5982de4719c1c8af91ccff; compiled by 'vinodkv' on 2016-08-18T01:01Z INFO | jvm 1 | 2019/04/25 18:01:24 | STARTUP_MSG: java = 1.8.0_102 INFO | jvm 1 | 2019/04/25 18:01:24 | ************************************************************/ INFO | jvm 1 | 2019/04/25 18:01:24 | 2019-04-25 18:01:24,767 INFO [Thread-3] NameNode - registered UNIX signal handlers for [TERM, HUP, INT] INFO | jvm 1 | 2019/04/25 18:01:24 | 2019-04-25 18:01:24,769 INFO [Thread-3] NameNode - createNameNode [-rollingUpgrade, rollback] {code} {code:java} INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@53468702 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@5e350e83 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@cf86362 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@840d683 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 INFO [Thread-3] FSImage - Planning to load image: FSImageFile(file=/one/hadoop-data/dfs/current/fsimage_rollback_0000000000000000677, cpktTxId=0000000000000000677) INFO | jvm 1 | 2019/04/25 18:01:30 | Total time for which application threads were stopped: 0.0007557 seconds, Stopping threads took: 0.0001181 seconds INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,377 ERROR [Thread-3] FSImage - Failed to load image from FSImageFile(file=/one/hadoop-data/dfs/current/fsimage_rollback_0000000000000000677, cpktTxId=0000000000000000677) INFO | jvm 1 | 2019/04/25 18:01:30 | java.lang.ArrayIndexOutOfBoundsException: 536870913 INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.loadStringTableSection(FSImageFormatProtobuf.java:318) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.loadInternal(FSImageFormatProtobuf.java:251) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.load(FSImageFormatProtobuf.java:182) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormat$LoaderDelegator.load(FSImageFormat.java:226) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:963) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:947) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImageFile(FSImage.java:746) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:677) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:294) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:976) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:681) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:585) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:645) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:812) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:796) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1493) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1559) {code} {code:java} INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,726 ERROR [Thread-3] NameNode - Failed to start namenode. INFO | jvm 1 | 2019/04/25 18:01:30 | java.io.IOException: Failed to load FSImage file, see error(s) above for more info. INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:692) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:294) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:976) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:681) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:585) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:645) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:812) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:796) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1493) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1559) INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,728 INFO [Thread-3] ExitUtil - Exiting with status 1 {code} was (Author: _ph): I'm testing last patch on a cluster with Kerberos: # download sources from [https://github.com/apache/hadoop] # apply patch _HDFS-13596.007.patch_ # build patched version of _hadoop-3.3.0-SNAPSHOT_ # prepare running cluster (_hadoop-2.7.3_) to _rollingUpgrade_ # run _hadoop-3.3.0-SNAPSHOT_ in _rollingUpgrade_ mode # upload test data to HDFS # restart NN and all is fine # stop cluster for rollback to hadoop-2.7.3 # hadoop-2.7.3 NN start fails: {code:java} INFO | jvm 1 | 2019/04/25 18:01:24 | STARTUP_MSG: build = https://git-wip-us.apache.org/repos/asf/hadoop.git -r baa91f7c6bc9cb92be5982de4719c1c8af91ccff; compiled by 'vinodkv' on 2016-08-18T01:01Z INFO | jvm 1 | 2019/04/25 18:01:24 | STARTUP_MSG: java = 1.8.0_102 INFO | jvm 1 | 2019/04/25 18:01:24 | ************************************************************/ INFO | jvm 1 | 2019/04/25 18:01:24 | 2019-04-25 18:01:24,767 INFO [Thread-3] NameNode - registered UNIX signal handlers for [TERM, HUP, INT] INFO | jvm 1 | 2019/04/25 18:01:24 | 2019-04-25 18:01:24,769 INFO [Thread-3] NameNode - createNameNode [-rollingUpgrade, rollback] {code} {code:java} INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@53468702 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@5e350e83 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@cf86362 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 DEBUG [Thread-3] FSImage - Planning to load edit log stream: org.apache.hadoop.hdfs.server.namenode.RedundantEditLogInputStream@840d683 INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,342 INFO [Thread-3] FSImage - Planning to load image: FSImageFile(file=/one/hadoop-data/dfs/current/fsimage_rollback_0000000000000000677, cpktTxId=0000000000000000677) INFO | jvm 1 | 2019/04/25 18:01:30 | Total time for which application threads were stopped: 0.0007557 seconds, Stopping threads took: 0.0001181 seconds INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,377 ERROR [Thread-3] FSImage - Failed to load image from FSImageFile(file=/one/hadoop-data/dfs/current/fsimage_rollback_0000000000000000677, cpktTxId=0000000000000000677) INFO | jvm 1 | 2019/04/25 18:01:30 | java.lang.ArrayIndexOutOfBoundsException: 536870913 INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.loadStringTableSection(FSImageFormatProtobuf.java:318) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.loadInternal(FSImageFormatProtobuf.java:251) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf$Loader.load(FSImageFormatProtobuf.java:182) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImageFormat$LoaderDelegator.load(FSImageFormat.java:226) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:963) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:947) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImageFile(FSImage.java:746) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:677) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:294) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:976) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:681) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:585) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:645) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:812) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:796) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1493) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1559) {code} {code:java} INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,726 ERROR [Thread-3] NameNode - Failed to start namenode. INFO | jvm 1 | 2019/04/25 18:01:30 | java.io.IOException: Failed to load FSImage file, see error(s) above for more info. INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:692) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:294) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:976) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:681) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:585) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:645) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:812) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:796) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1493) INFO | jvm 1 | 2019/04/25 18:01:30 | at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1559) INFO | jvm 1 | 2019/04/25 18:01:30 | 2019-04-25 18:01:30,728 INFO [Thread-3] ExitUtil - Exiting with status 1 {code} > NN restart fails after RollingUpgrade from 2.x to 3.x > ----------------------------------------------------- > > Key: HDFS-13596 > URL: https://issues.apache.org/jira/browse/HDFS-13596 > Project: Hadoop HDFS > Issue Type: Bug > Components: hdfs > Reporter: Hanisha Koneru > Assignee: Fei Hui > Priority: Critical > Attachments: HDFS-13596.001.patch, HDFS-13596.002.patch, > HDFS-13596.003.patch, HDFS-13596.004.patch, HDFS-13596.005.patch, > HDFS-13596.006.patch, HDFS-13596.007.patch > > > After rollingUpgrade NN from 2.x and 3.x, if the NN is restarted, it fails > while replaying edit logs. > * After NN is started with rollingUpgrade, the layoutVersion written to > editLogs (before finalizing the upgrade) is the pre-upgrade layout version > (so as to support downgrade). > * When writing transactions to log, NN writes as per the current layout > version. In 3.x, erasureCoding bits are added to the editLog transactions. > * So any edit log written after the upgrade and before finalizing the > upgrade will have the old layout version but the new format of transactions. > * When NN is restarted and the edit logs are replayed, the NN reads the old > layout version from the editLog file. When parsing the transactions, it > assumes that the transactions are also from the previous layout and hence > skips parsing the erasureCoding bits. > * This cascades into reading the wrong set of bits for other fields and > leads to NN shutting down. > Sample error output: > {code:java} > java.lang.IllegalArgumentException: Invalid clientId - length is 0 expected > length 16 > at com.google.common.base.Preconditions.checkArgument(Preconditions.java:88) > at org.apache.hadoop.ipc.RetryCache$CacheEntry.<init>(RetryCache.java:74) > at org.apache.hadoop.ipc.RetryCache$CacheEntry.<init>(RetryCache.java:86) > at > org.apache.hadoop.ipc.RetryCache$CacheEntryWithPayload.<init>(RetryCache.java:163) > at > org.apache.hadoop.ipc.RetryCache.addCacheEntryWithPayload(RetryCache.java:322) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.addCacheEntryWithPayload(FSNamesystem.java:960) > at > org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.applyEditLogOp(FSEditLogLoader.java:397) > at > org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:249) > at > org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:158) > at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:888) > at > org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:745) > at > org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:323) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1086) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:714) > at > org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:632) > at > org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:694) > at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:937) > at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:910) > at > org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1643) > at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1710) > 2018-05-17 19:10:06,522 WARN > org.apache.hadoop.hdfs.server.namenode.FSNamesystem: Encountered exception > loading fsimage > java.io.IOException: java.lang.IllegalStateException: Cannot skip to less > than the current value (=16389), where newValue=16388 > at > org.apache.hadoop.hdfs.server.namenode.FSDirectory.resetLastInodeId(FSDirectory.java:1945) > at > org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadEditRecords(FSEditLogLoader.java:298) > at > org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.loadFSEdits(FSEditLogLoader.java:158) > at org.apache.hadoop.hdfs.server.namenode.FSImage.loadEdits(FSImage.java:888) > at > org.apache.hadoop.hdfs.server.namenode.FSImage.loadFSImage(FSImage.java:745) > at > org.apache.hadoop.hdfs.server.namenode.FSImage.recoverTransitionRead(FSImage.java:323) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFSImage(FSNamesystem.java:1086) > at > org.apache.hadoop.hdfs.server.namenode.FSNamesystem.loadFromDisk(FSNamesystem.java:714) > at > org.apache.hadoop.hdfs.server.namenode.NameNode.loadNamesystem(NameNode.java:632) > at > org.apache.hadoop.hdfs.server.namenode.NameNode.initialize(NameNode.java:694) > at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:937) > at org.apache.hadoop.hdfs.server.namenode.NameNode.<init>(NameNode.java:910) > at > org.apache.hadoop.hdfs.server.namenode.NameNode.createNameNode(NameNode.java:1643) > at org.apache.hadoop.hdfs.server.namenode.NameNode.main(NameNode.java:1710) > Caused by: java.lang.IllegalStateException: Cannot skip to less than the > current value (=16389), where newValue=16388 > at org.apache.hadoop.util.SequentialNumber.skipTo(SequentialNumber.java:58) > at > org.apache.hadoop.hdfs.server.namenode.FSDirectory.resetLastInodeId(FSDirectory.java:1943) > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: hdfs-issues-unsubscr...@hadoop.apache.org For additional commands, e-mail: hdfs-issues-h...@hadoop.apache.org