Author: todd
Date: Wed Apr  4 23:21:43 2012
New Revision: 1309623

URL: http://svn.apache.org/viewvc?rev=1309623&view=rev
Log:
HDFS-1378. Edit log replay should track and report file offsets in case of 
errors. Backport by Colin Patrick McCabe.

Added:
    hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java
Modified:
    hadoop/common/branches/branch-1/CHANGES.txt
    hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
    hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java

Modified: hadoop/common/branches/branch-1/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/CHANGES.txt?rev=1309623&r1=1309622&r2=1309623&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/CHANGES.txt (original)
+++ hadoop/common/branches/branch-1/CHANGES.txt Wed Apr  4 23:21:43 2012
@@ -84,6 +84,9 @@ Release 1.1.0 - unreleased
 
     HDFS-3131. Improve TestStorageRestore. (Brandon Li via atm)
 
+    HDFS-1378. Edit log replay should track and report file offsets in case of
+    errors. (atm and todd, backport by Colin Patrick McCabe via todd)
+
   BUG FIXES
 
     MAPREDUCE-4087. [Gridmix] GenerateDistCacheData job of Gridmix can

Modified: hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java?rev=1309623&r1=1309622&r2=1309623&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java (original)
+++ hadoop/common/branches/branch-1/src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java Wed Apr  4 23:21:43 2012
@@ -25,9 +25,12 @@ import java.io.EOFException;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
+import java.io.FilterInputStream;
+import java.io.InputStream;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.lang.Math;
 import java.nio.channels.FileChannel;
@@ -502,7 +505,14 @@ public class FSEditLog {
     long highestGenStamp = -1;
     long startTime = FSNamesystem.now();
 
-    DataInputStream in = new DataInputStream(new BufferedInputStream(edits));
+    // Keep track of the file offsets of the last several opcodes.
+    // This is handy when manually recovering corrupted edits files.
+    PositionTrackingInputStream tracker = 
+      new PositionTrackingInputStream(new BufferedInputStream(edits));
+    long recentOpcodeOffsets[] = new long[4];
+    Arrays.fill(recentOpcodeOffsets, -1);
+
+    DataInputStream in = new DataInputStream(tracker);
     try {
       // Read log file version. Could be missing. 
       in.mark(4);
@@ -542,6 +552,8 @@ public class FSEditLog {
         } catch (EOFException e) {
           break; // no more transactions
         }
+        recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] =
+          tracker.getPos();
         numEdits++;
         switch (opcode) {
         case OP_ADD:
@@ -850,21 +862,34 @@ public class FSEditLog {
         }
         }
       }
-    } catch (IOException ex) {
-      // Failed to load 0.20.203 version edits during upgrade. This version has
-      // conflicting opcodes with the later releases. The editlog must be 
-      // emptied by restarting the namenode, before proceeding with the upgrade.
+    } catch (Throwable t) {
+      // Catch Throwable because in the case of a truly corrupt edits log, any
+      // sort of error might be thrown (NumberFormat, NullPointer, EOF, etc.)
       if (Storage.is203LayoutVersion(logVersion) &&
           logVersion != FSConstants.LAYOUT_VERSION) {
+        // Failed to load 0.20.203 version edits during upgrade. This version has
+        // conflicting opcodes with the later releases. The editlog must be 
+        // emptied by restarting the namenode, before proceeding with the upgrade.
         String msg = "During upgrade, failed to load the editlog version " + 
-        logVersion + " from release 0.20.203. Please go back to the old " + 
-        " release and restart the namenode. This empties the editlog " +
-        " and saves the namespace. Resume the upgrade after this step.";
-        throw new IOException(msg, ex);
-      } else {
-        throw ex;
+          logVersion + " from release 0.20.203. Please go back to the old " + 
+          " release and restart the namenode. This empties the editlog " +
+          " and saves the namespace. Resume the upgrade after this step.";
+        throw new IOException(msg, t);
       }
-      
+      StringBuilder sb = new StringBuilder();
+      sb.append("Error replaying edit log at offset " + tracker.getPos());
+      if (recentOpcodeOffsets[0] != -1) {
+        Arrays.sort(recentOpcodeOffsets);
+        sb.append("\nRecent opcode offsets:");
+        for (long offset : recentOpcodeOffsets) {
+          if (offset != -1) {
+            sb.append(' ').append(offset);
+          }
+        }
+      }
+      String errorMessage = sb.toString();
+      FSImage.LOG.error(errorMessage);
+      throw new IOException(errorMessage, t);
     } finally {
       in.close();
     }
@@ -1407,4 +1432,52 @@ public class FSEditLog {
     }
     return blocks;
   }
+
+  /**
+   * Stream wrapper that keeps track of the current file position.
+   */
+  private static class PositionTrackingInputStream extends FilterInputStream {
+    private long curPos = 0;
+    private long markPos = -1;
+
+    public PositionTrackingInputStream(InputStream is) {
+      super(is);
+    }
+
+    public int read() throws IOException {
+      int ret = super.read();
+      if (ret != -1) curPos++;
+      return ret;
+    }
+
+    public int read(byte[] data) throws IOException {
+      int ret = super.read(data);
+      if (ret > 0) curPos += ret;
+      return ret;
+    }
+
+    public int read(byte[] data, int offset, int length) throws IOException {
+      int ret = super.read(data, offset, length);
+      if (ret > 0) curPos += ret;
+      return ret;
+    }
+
+    public void mark(int limit) {
+      super.mark(limit);
+      markPos = curPos;
+    }
+
+    public void reset() throws IOException {
+      if (markPos == -1) {
+        throw new IOException("Not marked!");
+      }
+      super.reset();
+      curPos = markPos;
+      markPos = -1;
+    }
+
+    public long getPos() {
+      return curPos;
+    }
+  }
 }

Modified: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java?rev=1309623&r1=1309622&r2=1309623&view=diff
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java (original)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/MiniDFSCluster.java Wed Apr  4 23:21:43 2012
@@ -1001,4 +1001,9 @@ public class MiniDFSCluster {
   public String getDataDirectory() {
     return data_dir.getAbsolutePath();
   }
+
+  public static File getBaseDir() {
+    return new File(System.getProperty(
+      "test.build.data", "build/test/data"), "dfs/");
+  }
 }

Added: hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java?rev=1309623&view=auto
==============================================================================
--- hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java (added)
+++ hadoop/common/branches/branch-1/src/test/org/apache/hadoop/hdfs/server/namenode/TestEditLogLoading.java Wed Apr  4 23:21:43 2012
@@ -0,0 +1,79 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.junit.Test;
+
+public class TestEditLogLoading {
+
+  private static final int NUM_DATA_NODES = 0;
+
+  @Test
+  public void testDisplayRecentEditLogOpCodes() throws IOException {
+    // start a cluster
+    Configuration conf = new Configuration();
+    conf.set("dfs.name.dir", new File(MiniDFSCluster.getBaseDir(), "name").getPath());
+
+    MiniDFSCluster cluster = null;
+    FileSystem fileSys = null;
+    cluster = new MiniDFSCluster(0, conf, NUM_DATA_NODES, true, false, null, null);
+    cluster.waitActive();
+    fileSys = cluster.getFileSystem();
+    final FSNamesystem namesystem = cluster.getNameNode().getNamesystem();
+
+    FSImage fsimage = namesystem.getFSImage();
+    final FSEditLog editLog = fsimage.getEditLog();
+    for (int i = 0; i < 20; i++) {
+      fileSys.mkdirs(new Path("/tmp/tmp" + i));
+    }
+    File editFile = editLog.getFsEditName();
+    System.out.println("edit log file: " + editFile);
+    editLog.close();
+    cluster.shutdown();
+
+    // Corrupt the edits file.
+    long fileLen = editFile.length();
+    RandomAccessFile rwf = new RandomAccessFile(editFile, "rw");
+    rwf.seek(fileLen - 40);
+    for (int i = 0; i < 20; i++) {
+      rwf.write((byte) 2); // FSEditLog.DELETE
+    }
+    rwf.close();
+
+    String expectedErrorMessage = "^Error replaying edit log at offset \\d+\n";
+    expectedErrorMessage += "Recent opcode offsets: (\\d+\\s*){4}$";
+    try {
+      cluster = new MiniDFSCluster(0, conf, NUM_DATA_NODES, false, false, null, null);
+      cluster.waitActive();
+      fail("should not be able to start");
+    } catch (IOException e) {
+      assertTrue("error message contains opcodes message",
+          e.getMessage().matches(expectedErrorMessage));
+    }
+  }
+}


Reply via email to