[ https://issues.apache.org/jira/browse/TRAFODION-2468?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15935310#comment-15935310 ]
ASF GitHub Bot commented on TRAFODION-2468: ------------------------------------------- Github user sbroeder commented on a diff in the pull request: https://github.com/apache/incubator-trafodion/pull/993#discussion_r107271115 --- Diff: core/sqf/src/seatrans/tm/hbasetmlib2/src/main/java/org/trafodion/dtm/HBaseTxClient.java --- @@ -1232,8 +1203,218 @@ public void run() { } if(LOG.isDebugEnabled()) LOG.debug("Exiting recovery thread for tm ID: " + tmID); } - } + + private Map<Long, TransactionState> getTransactionsFromRegions( + Map<String, byte[]> regions) + throws IOException, KeeperException, + DeserializationException + { + if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD: in-doubt region size " + regions.size()); + for (Map.Entry<String, byte[]> regionEntry : regions.entrySet()) { + Map<Long, TransactionState> transactionStates = + new HashMap<Long, TransactionState>(); + List<Long> TxRecoverList = new ArrayList<Long>(); + String hostnamePort = regionEntry.getKey(); + byte[] regionBytes = regionEntry.getValue(); + if (LOG.isDebugEnabled()) + LOG.debug("TRAF RCOV THREAD:Recovery Thread Processing region: " + new String(regionBytes)); + if (recoveryIterations == 0) { + if(LOG.isWarnEnabled()) { + // Let's get the host name + final byte [] delimiter = ",".getBytes(); + String[] hostname = hostnamePort.split(new String(delimiter), 3); + if (hostname.length < 2) { + throw new IllegalArgumentException("hostnamePort format is incorrect"); + } + + LOG.warn ("TRAF RCOV THREAD:Starting recovery with " + regions.size() + + " regions to recover. 
First region hostname: " + hostnamePort + + " Recovery iterations: " + recoveryIterations); + } + } + else { + if(recoveryIterations % 10 == 0) { + if(LOG.isWarnEnabled()) { + // Let's get the host name + final byte [] delimiter = ",".getBytes(); + String[] hostname = hostnamePort.split(new String(delimiter), 3); + if (hostname.length < 2) { + throw new IllegalArgumentException("hostnamePort format is incorrect"); + } + LOG.warn("TRAF RCOV THREAD:Recovery thread encountered " + regions.size() + + " regions to recover. First region hostname: " + hostnamePort + + " Recovery iterations: " + recoveryIterations); + } + } + } + try { + TxRecoverList = txnManager.recoveryRequest(hostnamePort, regionBytes, tmID); + } + catch (IOException e) { + // For all cases of Exception, we rely on the region to redrive the request. + // Likely there is nothing to recover, due to a stale region entry, but it is always safe to redrive. + // We log a warning event and delete the ZKNode entry. + LOG.warn("TRAF RCOV THREAD:Exception calling txnManager.recoveryRequest. " + "TM: " + + tmID + " regionBytes: [" + regionBytes + "]. Deleting zookeeper region entry. \n exception: ", e); + zookeeper.deleteRegionEntry(regionEntry); + + // In the case of NotServingRegionException we will repost the ZKNode after refreshing the table. 
+ if ((e instanceof NotServingRegionException) || (e.getCause() instanceof NotServingRegionException)){ + // Create a local HTable object using the regionInfo + HTable table = new HTable(config, HRegionInfo.parseFrom(regionBytes).getTable().getNameAsString()); + // Repost a zookeeper entry for all current regions in the table + zookeeper.postAllRegionEntries(table); + } + } // IOException + + if (TxRecoverList != null) { + if (LOG.isDebugEnabled()) LOG.trace("TRAF RCOV THREAD:size of TxRecoverList " + TxRecoverList.size()); + if (TxRecoverList.size() == 0) { + // First delete the zookeeper entry + LOG.warn("TRAF RCOV THREAD:Leftover Znode calling txnManager.recoveryRequest. " + "TM: " + + tmID + " regionBytes: [" + regionBytes + "]. Deleting zookeeper region entry. "); + zookeeper.deleteRegionEntry(regionEntry); + } + for (Long txid : TxRecoverList) { + TransactionState ts = transactionStates.get(txid); + if (ts == null) { + ts = new TransactionState(txid); + + //Identify if DDL is part of this transaction and valid + if(hbtx.useDDLTrans){ + TmDDL tmDDL = hbtx.getTmDDL(); + StringBuilder state = new StringBuilder (); + tmDDL.getState(txid,state); + if(state.toString().equals("VALID")) + ts.setDDLTx(true); + } + } + this.addRegionToTS(hostnamePort, regionBytes, ts); + transactionStates.put(txid, ts); + } + } + else if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD:size od TxRecoverList is NULL "); + + return transactionStates; + } + return null; + } + + private Map<Long, TransactionState> getTransactionsFromTmDDL() + throws IOException + { + if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD: Checking for DDL only recovery"); + + //Access TMDDL, return null if not enabled. + if(! hbtx.useDDLTrans) + return null; + + Map<Long, TransactionState> transactionStates = null; + TmDDL tmDDL = hbtx.getTmDDL(); + List<Long> txIdList = tmDDL.getTxIdList(tmID); + + //This list of txID is specific to tmID owner. + //This list may include txId that are: + //1. 
currently in ACTIVE state. RecoverTransactions() call takes care of + //ignoring TxId which are currently actively in progress. + //2. Txids regions which have not yet requested for help(regions requesting help + //from zookeeper) , probably will, could be timing. + //3. Txids regions which have already requested for help. + //4. Txids whose regions have already serviced, but only require recovery + //from DDL perspective. + //For 2 and 3 use cases above, those regions will ultimately seek help if + //they need help. So no need to handle those regions here. We are only + //interested to handle use case 4. If usecase 4 also involves DML regions + //it is ok to recover the DDL only here and not dependent on DML regions. + // + //Note that recoverTransactions() attempts recovery, its a no-op if those + //txids are completed for some reason, some of the regions might have completed + //processing, ignoreUnknownTransactionException is enabled. + if(txIdList != null && txIdList.size() > 0) + { + transactionStates = new HashMap<Long, TransactionState>(); + for (Long txid : txIdList) + { + //build ts object + TransactionState ts = new TransactionState(txid); + ts.setDDLTx(true); + transactionStates.put(txid, ts); + } + } + return transactionStates; + } + + private void recoverTransactions(Map<Long, TransactionState> transactionStates) throws IOException + { + if (LOG.isDebugEnabled()) LOG.debug("TRAF RCOV THREAD: in-doubt transaction size " + transactionStates.size()); + + for (Map.Entry<Long, TransactionState> tsEntry : transactionStates.entrySet()) { + int isTransactionStillAlive = 0; + TransactionState ts = tsEntry.getValue(); + Long txID = ts.getTransactionId(); + // TransactionState ts = new TransactionState(txID); + + //It is possible for long prepare situations that involve multiple DDL + //operations, multiple prompts from RS is received. Hence check to see if there + //is a TS object in main TS list and transaction is still active. 
+ //Note that tsEntry is local TS object. + if (hbtx.mapTransactionStates.get(txID) != null) { + if (hbtx.mapTransactionStates.get(txID).getStatus().toString().contains("ACTIVE")) { + isTransactionStillAlive = 1; + } + if (LOG.isInfoEnabled()) + LOG.info("TRAF RCOV THREAD: TID " + txID + + " still has TS object in TM memory. TS details: " + + hbtx.mapTransactionStates.get(txID).toString() + + " transactionAlive: " + isTransactionStillAlive); + if(isTransactionStillAlive == 1) + continue; //for loop + } + + try { + audit.getTransactionState(ts); + if (ts.getStatus().equals(TransState.STATE_COMMITTED.toString())) { + if (LOG.isDebugEnabled()) + LOG.debug("TRAF RCOV THREAD:Redriving commit for " + txID + " number of regions " + ts.getParticipatingRegions().size() + + " and tolerating UnknownTransactionExceptions"); + txnManager.doCommit(ts, true /*ignore UnknownTransactionException*/); + if(useTlog && useForgotten) { + long nextAsn = tLog.getNextAuditSeqNum((int)TransactionState.getNodeId(txID)); + tLog.putSingleRecord(txID, ts.getCommitId(), "FORGOTTEN", null, forceForgotten, nextAsn); + } + } else if (ts.getStatus().equals(TransState.STATE_ABORTED.toString())) { + if (LOG.isDebugEnabled()) + LOG.debug("TRAF RCOV THREAD:Redriving abort for " + txID); + txnManager.abort(ts); --- End diff -- OK. Sounds good. > Recovery of DDL transaction may fail upon TM restart > ---------------------------------------------------- > > Key: TRAFODION-2468 > URL: https://issues.apache.org/jira/browse/TRAFODION-2468 > Project: Apache Trafodion > Issue Type: Bug > Components: dtm > Affects Versions: 2.1-incubating > Reporter: Prashanth Vasudev > Assignee: Prashanth Vasudev > > Depending on the timing of the TM process going down and restarting, if there is a > DDL operation in flight that has not been registered in the TMDDL table, recovery of > the DDL operation may go unnoticed. -- This message was sent by Atlassian JIRA (v6.3.15#6346)