HIVE-11934 Transaction lock retry logic results in infinite loop (Eugene Koifman, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0d43e876
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0d43e876
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0d43e876

Branch: refs/heads/llap
Commit: 0d43e876be9c36156a28bd2c2b9493f986841dd7
Parents: edd6300
Author: Eugene Koifman <ekoif...@hortonworks.com>
Authored: Wed Sep 30 16:05:34 2015 -0700
Committer: Eugene Koifman <ekoif...@hortonworks.com>
Committed: Wed Sep 30 16:05:34 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/metastore/txn/TxnHandler.java   | 117 +++++++++----------
 1 file changed, 57 insertions(+), 60 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/0d43e876/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java b/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
index 0b19368..cc7e2c6 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/txn/TxnHandler.java
@@ -91,8 +91,8 @@ public class TxnHandler {
   /**
    * Number of consecutive deadlocks we have seen
    */
-  protected int deadlockCnt;
-  private long deadlockRetryInterval;
+  private int deadlockCnt;
+  private final long deadlockRetryInterval;
   protected HiveConf conf;
 
   protected DatabaseProduct dbProduct;
@@ -115,10 +115,8 @@ public class TxnHandler {
   //
   // All public methods that write to the database have to check for deadlocks when a SQLException
   // comes back and handle it if they see one. This has to be done with the connection pooling
-  // in mind. To do this they should call detectDeadlock AFTER rolling back the db transaction,
-  // and then in an outer loop they should catch DeadlockException. In the catch for this they
-  // should increment the deadlock counter and recall themselves. See commitTxn for an example.
-  // the connection has been closed and returned to the pool.
+  // in mind. To do this they should call checkRetryable() AFTER rolling back the db transaction,
+  // and then they should catch RetryException and call themselves recursively. See commitTxn for an example.
 
   public TxnHandler(HiveConf conf) {
     this.conf = conf;
@@ -135,7 +133,6 @@ public class TxnHandler {
     }
 
     timeout = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HIVE_TXN_TIMEOUT, TimeUnit.MILLISECONDS);
-    deadlockCnt = 0;
     buildJumpTable();
     retryInterval = HiveConf.getTimeVar(conf, HiveConf.ConfVars.HMSHANDLERINTERVAL,
         TimeUnit.MILLISECONDS);
@@ -280,7 +277,6 @@ public class TxnHandler {
   }
 
   public OpenTxnsResponse openTxns(OpenTxnRequest rqst) throws MetaException {
-    deadlockCnt = 0; // Reset deadlock count since this is a new transaction
     int numTxns = rqst.getNum_txns();
     try {
       Connection dbConn = null;
@@ -420,7 +416,6 @@ public class TxnHandler {
 
   public LockResponse lock(LockRequest rqst)
     throws NoSuchTxnException, TxnAbortedException, MetaException {
-    deadlockCnt = 0;
     try {
       Connection dbConn = null;
       try {
@@ -636,8 +631,6 @@ public class TxnHandler {
       }
     } catch (RetryException e) {
       heartbeat(ids);
-    } finally {
-      deadlockCnt = 0;
     }
   }
 
@@ -903,14 +896,14 @@ public class TxnHandler {
   void rollbackDBConn(Connection dbConn) {
     try {
-      if (dbConn != null) dbConn.rollback();
+      if (dbConn != null && !dbConn.isClosed()) dbConn.rollback();
     } catch (SQLException e) {
       LOG.warn("Failed to rollback db connection " + getMessage(e));
     }
   }
 
   protected void closeDbConn(Connection dbConn) {
     try {
-      if (dbConn != null) dbConn.close();
+      if (dbConn != null && !dbConn.isClosed()) dbConn.close();
     } catch (SQLException e) {
       LOG.warn("Failed to close db connection " + getMessage(e));
     }
@@ -922,7 +915,7 @@ public class TxnHandler {
    */
   protected void closeStmt(Statement stmt) {
     try {
-      if (stmt != null) stmt.close();
+      if (stmt != null && !stmt.isClosed()) stmt.close();
     } catch (SQLException e) {
       LOG.warn("Failed to close statement " + getMessage(e));
     }
@@ -952,15 +945,14 @@ public class TxnHandler {
     closeDbConn(dbConn);
   }
   /**
-   * Determine if an exception was such that it makse sense to retry. Unfortunately there is no standard way to do
+   * Determine if an exception was such that it makes sense to retry. Unfortunately there is no standard way to do
    * this, so we have to inspect the error messages and catch the telltale signs for each
-   * different database.
+   * different database. This method will throw {@code RetryException}
+   * if the error is retry-able.
    * @param conn database connection
    * @param e exception that was thrown.
-   * @param caller name of the method calling this
-   * @throws org.apache.hadoop.hive.metastore.txn.TxnHandler.RetryException when deadlock
-   * detected and retry count has not been exceeded.
-   * TODO: make "caller" more elaborate like include lockId for example
+   * @param caller name of the method calling this (and other info useful to log)
+   * @throws org.apache.hadoop.hive.metastore.txn.TxnHandler.RetryException when the operation should be retried
    */
   protected void checkRetryable(Connection conn,
                                 SQLException e,
@@ -973,53 +965,57 @@ public class TxnHandler {
     // so I've tried to capture the different error messages (there appear to be fewer different
     // error messages than SQL states).
     // Derby and newer MySQL driver use the new SQLTransactionRollbackException
-    if (dbProduct == null && conn != null) {
-      determineDatabaseProduct(conn);
-    }
-    if (e instanceof SQLTransactionRollbackException ||
-        ((dbProduct == DatabaseProduct.MYSQL || dbProduct == DatabaseProduct.POSTGRES ||
-            dbProduct == DatabaseProduct.SQLSERVER) && e.getSQLState().equals("40001")) ||
-        (dbProduct == DatabaseProduct.POSTGRES && e.getSQLState().equals("40P01")) ||
-        (dbProduct == DatabaseProduct.ORACLE && (e.getMessage().contains("deadlock detected")
-            || e.getMessage().contains("can't serialize access for this transaction")))) {
-      if (deadlockCnt++ < ALLOWED_REPEATED_DEADLOCKS) {
-        long waitInterval = deadlockRetryInterval * deadlockCnt;
-        LOG.warn("Deadlock detected in " + caller + ". Will wait " + waitInterval +
-          "ms try again up to " + (ALLOWED_REPEATED_DEADLOCKS - deadlockCnt + 1) + " times.");
-        // Pause for a just a bit for retrying to avoid immediately jumping back into the deadlock.
-        try {
-          Thread.sleep(waitInterval);
-        } catch (InterruptedException ie) {
-          // NOP
-        }
-        throw new RetryException();
-      } else {
-        LOG.error("Too many repeated deadlocks in " + caller + ", giving up.");
-        deadlockCnt = 0;
+    boolean sendRetrySignal = false;
+    try {
+      if (dbProduct == null && conn != null) {
+        determineDatabaseProduct(conn);
       }
-    }
-    else if(isRetryable(e)) {
-      //in MSSQL this means Communication Link Failure
-      if(retryNum++ < retryLimit) {
-        LOG.warn("Retryable error detected in " + caller + ". Will wait " + retryInterval +
-          "ms and retry up to " + (retryLimit - retryNum + 1) + " times. Error: " + getMessage(e));
-        try {
-          Thread.sleep(retryInterval);
+      if (e instanceof SQLTransactionRollbackException ||
+        ((dbProduct == DatabaseProduct.MYSQL || dbProduct == DatabaseProduct.POSTGRES ||
+          dbProduct == DatabaseProduct.SQLSERVER) && e.getSQLState().equals("40001")) ||
+        (dbProduct == DatabaseProduct.POSTGRES && e.getSQLState().equals("40P01")) ||
+        (dbProduct == DatabaseProduct.ORACLE && (e.getMessage().contains("deadlock detected")
+          || e.getMessage().contains("can't serialize access for this transaction")))) {
+        if (deadlockCnt++ < ALLOWED_REPEATED_DEADLOCKS) {
+          long waitInterval = deadlockRetryInterval * deadlockCnt;
+          LOG.warn("Deadlock detected in " + caller + ". Will wait " + waitInterval +
+            "ms try again up to " + (ALLOWED_REPEATED_DEADLOCKS - deadlockCnt + 1) + " times.");
+          // Pause for a just a bit for retrying to avoid immediately jumping back into the deadlock.
+          try {
+            Thread.sleep(waitInterval);
+          } catch (InterruptedException ie) {
+            // NOP
+          }
+          sendRetrySignal = true;
+        } else {
+          LOG.error("Too many repeated deadlocks in " + caller + ", giving up.");
         }
-        catch(InterruptedException ex) {
-          //
+      } else if (isRetryable(e)) {
+        //in MSSQL this means Communication Link Failure
+        if (retryNum++ < retryLimit) {
+          LOG.warn("Retryable error detected in " + caller + ". Will wait " + retryInterval +
+            "ms and retry up to " + (retryLimit - retryNum + 1) + " times. Error: " + getMessage(e));
+          try {
+            Thread.sleep(retryInterval);
+          } catch (InterruptedException ex) {
+            //
+          }
+          sendRetrySignal = true;
+        } else {
+          LOG.error("Fatal error. Retry limit (" + retryLimit + ") reached. Last error: " + getMessage(e));
         }
-        throw new RetryException();
       }
-      else {
-        LOG.error("Fatal error. Retry limit (" + retryLimit + ") reached. Last error: " + getMessage(e));
+    }
+    finally {
+      /*if this method ends with anything except a retry signal, the caller should fail the operation
+      and propagate the error up to the its caller (Metastore client); thus must reset retry counters*/
+      if(!sendRetrySignal) {
+        deadlockCnt = 0;
        retryNum = 0;
      }
    }
-    else {
-      //if here, we got something that will propagate the error (rather than retry), so reset counters
-      deadlockCnt = 0;
-      retryNum = 0;
+    if(sendRetrySignal) {
+      throw new RetryException();
    }
  }
 
@@ -2100,6 +2096,7 @@ public class TxnHandler {
         //in MSSQL this means Communication Link Failure
         return true;
       }
+      //see https://issues.apache.org/jira/browse/HIVE-9938
     }
     return false;
   }
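
To make the new contract concrete, here is a minimal caller-side sketch of the pattern the updated class comment describes (and that the heartbeat() catch block in the diff already follows): roll back the db transaction first, let checkRetryable() decide whether to signal a retry, and retry by having the method call itself. This is an illustration only, not the committed Hive code; getDbConn(), updateTxnState() and CommitRequest are hypothetical stand-ins, while rollbackDBConn(), checkRetryable(), closeDbConn(), RetryException and getMessage() are the real members touched by this patch.

  // Sketch of a TxnHandler write method using the retry protocol from this patch.
  // Hypothetical pieces: getDbConn(), updateTxnState(), CommitRequest.
  public void commitTxn(CommitRequest rqst) throws MetaException {
    Connection dbConn = null;
    try {
      try {
        dbConn = getDbConn();               // pooled JDBC connection
        updateTxnState(dbConn, rqst);       // the actual database work
        dbConn.commit();
      } catch (SQLException e) {
        rollbackDBConn(dbConn);             // roll back BEFORE calling checkRetryable()
        checkRetryable(dbConn, e, "commitTxn"); // throws RetryException if retry-able
        throw new MetaException("Unable to commit transaction: " + getMessage(e));
      } finally {
        closeDbConn(dbConn);                // return the connection to the pool
      }
    } catch (RetryException e) {
      commitTxn(rqst);                      // retry recursively; no counter reset here
    }
  }

This shape also suggests where the infinite loop of HIVE-11934 came from: before the patch, lock() and openTxns() reset deadlockCnt to 0 on entry (the lines removed above), so each recursive retry restarted the count. After the patch the counters belong to checkRetryable() alone, which resets them in its finally block only when it is not sending the retry signal, i.e. exactly when the error is about to propagate up to the metastore client.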