http://git-wip-us.apache.org/repos/asf/hbase/blob/0bf97bac/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java new file mode 100644 index 0000000..ce70682 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestWALLockup.java @@ -0,0 +1,279 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver; + + +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.CountDownLatch; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.CellScanner; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HTableDescriptor; +import org.apache.hadoop.hbase.Server; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Durability; +import org.apache.hadoop.hbase.client.Put; +import org.apache.hadoop.hbase.regionserver.wal.FSHLog; +import org.apache.hadoop.hbase.regionserver.wal.WALEdit; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper; +import org.apache.hadoop.hbase.util.Threads; +import org.apache.hadoop.hbase.wal.WAL; +import org.apache.hadoop.hbase.wal.WALKey; +import org.apache.hadoop.hbase.wal.WALProvider.Writer; +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.TestName; +import org.mockito.Mockito; + +/** + * Testing for lock up of WAL subsystem. + * Copied from TestHRegion. + */ +@Category({MediumTests.class}) +public class TestWALLockup { + private static final Log LOG = LogFactory.getLog(TestWALLockup.class); + @Rule public TestName name = new TestName(); + + private static final String COLUMN_FAMILY = "MyCF"; + private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY); + + HRegion region = null; + // Do not run unit tests in parallel (? Why not? It don't work? Why not? St.Ack) + private static HBaseTestingUtility TEST_UTIL; + private static Configuration CONF ; + private String dir; + + // Test names + protected TableName tableName; + + @Before + public void setup() throws IOException { + TEST_UTIL = HBaseTestingUtility.createLocalHTU(); + CONF = TEST_UTIL.getConfiguration(); + // Disable block cache. + CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f); + dir = TEST_UTIL.getDataTestDir("TestHRegion").toString(); + tableName = TableName.valueOf(name.getMethodName()); + } + + @After + public void tearDown() throws Exception { + EnvironmentEdgeManagerTestHelper.reset(); + LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir()); + TEST_UTIL.cleanupTestDir(); + } + + String getName() { + return name.getMethodName(); + } + + /** + * Reproduce locking up that happens when we get an inopportune sync during setup for + * zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because + * it is locked up. + * <p>First I need to set up some mocks for Server and RegionServerServices. I also need to + * set up a dodgy WAL that will throw an exception when we go to append to it. + */ + @Test (timeout=30000) + public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException { + // A WAL that we can have throw exceptions when a flag is set. + class DodgyFSLog extends FSHLog { + // Set this when want the WAL to start throwing exceptions. + volatile boolean throwException = false; + + // Latch to hold up processing until after another operation has had time to run. + CountDownLatch latch = new CountDownLatch(1); + + public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf) + throws IOException { + super(fs, root, logDir, conf); + } + + @Override + protected void afterCreatingZigZagLatch() { + // If throwException set, then append will throw an exception causing the WAL to be + // rolled. We'll come in here. Hold up processing until a sync can get in before + // the zigzag has time to complete its setup and get its own sync in. This is what causes + // the lock up we've seen in production. + if (throwException) { + try { + LOG.info("LATCHED"); + this.latch.await(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + @Override + protected void beforeWaitOnSafePoint() { + if (throwException) { + LOG.info("COUNTDOWN"); + // Don't countdown latch until someone waiting on it otherwise, the above + // afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll + // be stuck; test won't go down + while (this.latch.getCount() <= 0) Threads.sleep(1); + this.latch.countDown(); + } + } + + @Override + protected Writer createWriterInstance(Path path) throws IOException { + final Writer w = super.createWriterInstance(path); + return new Writer() { + @Override + public void close() throws IOException { + w.close(); + } + + @Override + public void sync() throws IOException { + if (throwException) { + throw new IOException("FAKE! Failed to replace a bad datanode...SYNC"); + } + w.sync(); + } + + @Override + public void append(Entry entry) throws IOException { + if (throwException) { + throw new IOException("FAKE! Failed to replace a bad datanode...APPEND"); + } + w.append(entry); + } + + @Override + public long getLength() throws IOException { + return w.getLength(); + } + }; + } + } + + // Mocked up server and regionserver services. Needed below. + Server server = Mockito.mock(Server.class); + Mockito.when(server.getConfiguration()).thenReturn(CONF); + Mockito.when(server.isStopped()).thenReturn(false); + Mockito.when(server.isAborted()).thenReturn(false); + RegionServerServices services = Mockito.mock(RegionServerServices.class); + + // OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test. + FileSystem fs = FileSystem.get(CONF); + Path rootDir = new Path(dir + getName()); + DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF); + Path originalWAL = dodgyWAL.getCurrentFileName(); + // I need a log roller running. + LogRoller logRoller = new LogRoller(server, services); + logRoller.addWAL(dodgyWAL); + // There is no 'stop' once a logRoller is running.. it just dies. + logRoller.start(); + // Now get a region and start adding in edits. + HTableDescriptor htd = new HTableDescriptor(TableName.META_TABLE_NAME); + final HRegion region = initHRegion(tableName, null, null, dodgyWAL); + byte [] bytes = Bytes.toBytes(getName()); + try { + // First get something into memstore. Make a Put and then pull the Cell out of it. Will + // manage append and sync carefully in below to manufacture hang. We keep adding same + // edit. WAL subsystem doesn't care. + Put put = new Put(bytes); + put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes); + WALKey key = new WALKey(region.getRegionInfo().getEncodedNameAsBytes(), htd.getTableName()); + WALEdit edit = new WALEdit(); + List<Cell> cells = new ArrayList<Cell>(); + for (CellScanner cs = put.cellScanner(); cs.advance();) { + edit.add(cs.current()); + cells.add(cs.current()); + } + // Put something in memstore and out in the WAL. Do a big number of appends so we push + // out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL + for (int i = 0; i < 1000; i++) { + dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true, + cells); + } + // Set it so we start throwing exceptions. + dodgyWAL.throwException = true; + // This append provokes a WAL roll. + dodgyWAL.append(htd, region.getRegionInfo(), key, edit, region.getSequenceId(), true, cells); + boolean exception = false; + try { + dodgyWAL.sync(); + } catch (Exception e) { + exception = true; + } + assertTrue("Did not get sync exception", exception); + + // Get a memstore flush going too so we have same hung profile as up in the issue over + // in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up + // by the zigzaglatch waiting on syncs to come home. + Thread t = new Thread ("flusher") { + public void run() { + try { + region.flush(false); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + }; + }; + t.setDaemon(true); + t.start(); + // Wait till it gets into flushing. It will get stuck on getSequenceId. Then proceed. + while (!region.writestate.flushing) Threads.sleep(1); + // Now assert I got a new WAL file put in place even though loads of errors above. + assertTrue(originalWAL != dodgyWAL.getCurrentFileName()); + // Can I append to it? + dodgyWAL.throwException = false; + region.put(put); + } finally { + // To stop logRoller, its server has to say it is stopped. + Mockito.when(server.isStopped()).thenReturn(true); + if (logRoller != null) logRoller.interrupt(); + if (region != null) region.close(); + if (dodgyWAL != null) dodgyWAL.close(); + } + } + + /** + * @return A region on which you must call + * {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)} when done. + */ + public HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey, WAL wal) + throws IOException { + return TEST_UTIL.createLocalHRegion(tableName.getName(), startKey, stopKey, + getName(), CONF, false, Durability.SYNC_WAL, + wal, COLUMN_FAMILY_BYTES); + } +} \ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hbase/blob/0bf97bac/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestLogRolling.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestLogRolling.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestLogRolling.java index caf05a4..daca637 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestLogRolling.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestLogRolling.java @@ -89,10 +89,6 @@ public class TestLogRolling { private MiniHBaseCluster cluster; private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); - /** - * constructor - * @throws Exception - */ public TestLogRolling() { this.server = null; this.tableName = null; @@ -527,7 +523,16 @@ public class TestLogRolling { // flush all regions for (Region r: server.getOnlineRegionsLocalContext()) { - r.flush(true); + try { + r.flush(true); + } catch (Exception e) { + // This try/catch was added by HBASE-14317. It is needed + // because this issue tightened up the semantic such that + // a failed append could not be followed by a successful + // sync. What is coming out here is a failed sync, a sync + // that used to 'pass'. + LOG.info(e); + } } ResultScanner scanner = table.getScanner(new Scan());