Author: todd Date: Wed Apr 4 23:21:43 2012 New Revision: 1309623 URL: http://svn.apache.org/viewvc?rev=1309623&view=rev Log: HDFS-1378. Edit log replay should track and report file offsets in case of errors. Backport by Colin Patrick McCabe.
Added: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java Modified: hadoop/common/branches/branch-1/CHANGES.txt hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java Modified: hadoop/common/branches/branch-1/CHANGES.txt URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1309623&r1=1309622&r2=1309623&view=diff ============================================================================== --- hadoop/common/branches/branch-1/CHANGES.txt (original) +++ hadoop/common/branches/branch-1/CHANGES.txt Wed Apr 4 23:21:43 2012 @@ -84,6 +84,9 @@ Release 1.1.0 - unreleased HDFS-3131. Improve TestStorageRestore. (Brandon Li via atm) + HDFS-1378. Edit log replay should track and report file offsets in case of + errors. (atm and todd, backport by Colin Patrick McCabe via todd) + BUG FIXES MAPREDUCE-4087. [Gridmix] GenerateDistCacheData job of Gridmix can Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java?rev=1309623&r1=1309622&r2=1309623&view=diff ============================================================================== --- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java (original) +++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java Wed Apr 4 23:21:43 2012 @@ -25,9 +25,12 @@ import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; +import java.io.FilterInputStream; +import java.io.InputStream; import java.io.IOException; import java.io.RandomAccessFile; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.lang.Math; import java.nio.channels.FileChannel; @@ -502,7 +505,14 @@ public class FSEditLog { long highestGenStamp = -1; long startTime = FSNamesystem.now(); - DataInputStream in = new DataInputStream(new BufferedInputStream(edits)); + // Keep track of the file offsets of the last several opcodes. + // This is handy when manually recovering corrupted edits files. + PositionTrackingInputStream tracker = + new PositionTrackingInputStream(new BufferedInputStream(edits)); + long recentOpcodeOffsets[] = new long[4]; + Arrays.fill(recentOpcodeOffsets, -1); + + DataInputStream in = new DataInputStream(tracker); try { // Read log file version. Could be missing. in.mark(4); @@ -542,6 +552,8 @@ public class FSEditLog { } catch (EOFException e) { break; // no more transactions } + recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] = + tracker.getPos(); numEdits++; switch (opcode) { case OP_ADD: @@ -850,21 +862,34 @@ public class FSEditLog { } } } - } catch (IOException ex) { - // Failed to load 0.20.203 version edits during upgrade. This version has - // conflicting opcodes with the later releases. The editlog must be - // emptied by restarting the namenode, before proceeding with the upgrade. + } catch (Throwable t) { + // Catch Throwable because in the case of a truly corrupt edits log, any + // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.) if (Storage.is203LayoutVersion(logVersion) && logVersion != FSConstants.LAYOUT_VERSION) { + // Failed to load 0.20.203 version edits during upgrade. This version has + // conflicting opcodes with the later releases. The editlog must be + // emptied by restarting the namenode, before proceeding with the upgrade. String msg = "During upgrade, failed to load the editlog version " + - logVersion + " from release 0.20.203. Please go back to the old " + - " release and restart the namenode. This empties the editlog " + - " and saves the namespace. Resume the upgrade after this step."; - throw new IOException(msg, ex); - } else { - throw ex; + logVersion + " from release 0.20.203. Please go back to the old " + + " release and restart the namenode. This empties the editlog " + + " and saves the namespace. Resume the upgrade after this step."; + throw new IOException(msg, t); } - + StringBuilder sb = new StringBuilder(); + sb.append("Error replaying edit log at offset " + tracker.getPos()); + if (recentOpcodeOffsets[0] != -1) { + Arrays.sort(recentOpcodeOffsets); + sb.append("\nRecent opcode offsets:"); + for (long offset : recentOpcodeOffsets) { + if (offset != -1) { + sb.append(' ').append(offset); + } + } + } + String errorMessage = sb.toString(); + FSImage.LOG.error(errorMessage); + throw new IOException(errorMessage, t); } finally { in.close(); } @@ -1407,4 +1432,52 @@ public class FSEditLog { } return blocks; } + + /** + * Stream wrapper that keeps track of the current file position. + */ + private static class PositionTrackingInputStream extends FilterInputStream { + private long curPos = 0; + private long markPos = -1; + + public PositionTrackingInputStream(InputStream is) { + super(is); + } + + public int read() throws IOException { + int ret = super.read(); + if (ret != -1) curPos++; + return ret; + } + + public int read(byte[] data) throws IOException { + int ret = super.read(data); + if (ret > 0) curPos += ret; + return ret; + } + + public int read(byte[] data, int offset, int length) throws IOException { + int ret = super.read(data, offset, length); + if (ret > 0) curPos += ret; + return ret; + } + + public void mark(int limit) { + super.mark(limit); + markPos = curPos; + } + + public void reset() throws IOException { + if (markPos == -1) { + throw new IOException("Not marked!"); + } + super.reset(); + curPos = markPos; + markPos = -1; + } + + public long getPos() { + return curPos; + } + } } Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java?rev=1309623&r1=1309622&r2=1309623&view=diff ============================================================================== --- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java (original) +++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java Wed Apr 4 23:21:43 2012 @@ -1001,4 +1001,9 @@ public class MiniDFSCluster { public String getDataDirectory() { return data_dir.getAbsolutePath(); } + + public static File getBaseDir() { + return new File(System.getProperty( + "test.build.data", "build/test/data"), "dfs/"); + } } Added: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java?rev=1309623&view=auto ============================================================================== --- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java (added) +++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java Wed Apr 4 23:21:43 2012 @@ -0,0 +1,79 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import static org.junit.Assert.*; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.junit.Test; + +public class TestEditLogLoading { + + private static final int NUM_DATA_NODES = 0; + + @Test + public void testDisplayRecentEditLogOpCodes() throws IOException { + // start a cluster + Configuration conf = new Configuration(); + conf.set("dfs.name.dir", new File(MiniDFSCluster.getBaseDir(), "name").getPath()); + + MiniDFSCluster cluster = null; + FileSystem fileSys = null; + cluster = new MiniDFSCluster(0, conf, NUM_DATA_NODES, true, false, null, null); + cluster.waitActive(); + fileSys = cluster.getFileSystem(); + final FSNamesystem namesystem = cluster.getNameNode().getNamesystem(); + + FSImage fsimage = namesystem.getFSImage(); + final FSEditLog editLog = fsimage.getEditLog(); + for (int i = 0; i < 20; i++) { + fileSys.mkdirs(new Path("/tmp/tmp" + i)); + } + File editFile = editLog.getFsEditName(); + System.out.println("edit log file: " + editFile); + editLog.close(); + cluster.shutdown(); + + // Corrupt the edits file. + long fileLen = editFile.length(); + RandomAccessFile rwf = new RandomAccessFile(editFile, "rw"); + rwf.seek(fileLen - 40); + for (int i = 0; i < 20; i++) { + rwf.write((byte) 2); // FSEditLog.DELETE + } + rwf.close(); + + String expectedErrorMessage = "^Error replaying edit log at offset \\d+\n"; + expectedErrorMessage += "Recent opcode offsets: (\\d+\\s*){4}$"; + try { + cluster = new MiniDFSCluster(0, conf, NUM_DATA_NODES, false, false, null, null); + cluster.waitActive(); + fail("should not be able to start"); + } catch (IOException e) { + assertTrue("error message contains opcodes message", + e.getMessage().matches(expectedErrorMessage)); + } + } +}