osaf/libs/common/immsv/immpbe_dump.cc | 9 +++++---- osaf/services/saf/immsv/immpbed/immpbe_daemon.cc | 12 ++++++++---- 2 files changed, 13 insertions(+), 8 deletions(-)
An SMF campaign enables the PBE (with 2PBE) and immediately attempts to update a PRTA. This fails because the slave PBE (PBE-B) has not completed its initialization when it receives the prepare message (for the PRTA update). This causes the PRTA update to be rejected. It also causes the PBE slave to exit and restart due to an erroneous abort of an empty SQLite transaction.

The following syslog excerpt shows the failure:

Mar 31 10:33:19 SC-2-1 osafimmnd[13967]: NO ERR_BAD_OPERATION: Mismatch on administrative owner '' != 'safImmService'
Mar 31 10:33:19 SC-2-1 osafimmpbed: WA Start prepare for ccb: 100000078/4294967416 towards slave PBE returned: '20' from Immsv
Mar 31 10:33:19 SC-2-1 osafimmpbed: WA PBE-A failed to prepare PRTA update Ccb:100000078/4294967416 towards PBE-B
Mar 31 10:33:19 SC-2-1 osafimmpbed: NO 2PBE Error (20) in PRTA update (ccbId:100000078)
Mar 31 10:33:19 SC-2-1 osafimmnd[13967]: WA update of PERSISTENT runtime attributes in object 'safSmfCampaign=ERIC-TestAppInstall,safApp=safSmfService' REVERTED. PBE rc:20
Mar 31 10:33:22 SC-2-2 osafimmpbed: IN PBE slave waiting for prepare from primary on PRTA update ccb:100000078
Mar 31 10:33:22 SC-2-2 osafimmnd[5243]: WA update of PERSISTENT runtime attributes in object 'safSmfCampaign=ERIC-TestAppInstall,safApp=safSmfService' REVERTED. PBE rc:20
Mar 31 10:33:24 SC-2-2 osafimmpbed: IN PBE slave waiting for prepare from primary on PRTA update ccb:100000078
Mar 31 10:33:24 SC-2-2 osafimmpbed: NO Slave PBE time-out in waiting on porepare for PRTA update ccb:100000078 dn:safSmfCampaign=ERIC-TestAppInstall,safApp=safSmfService
Mar 31 10:33:24 SC-2-2 osafimmpbed: ER SQL statement ('ROLLBACK') failed because: cannot rollback - no transaction is active
Mar 31 10:33:24 SC-2-2 osafimmpbed: ER Exiting (line:2827)

The problem is the time gap between the creation of the RTO representing the slave PBE and the setting of admin-owner by the slave PBE for that RTO. Admin owner must be set for an admin-operation on the object to succeed. A prepare request that arrives inside this gap is rejected by the IMM service with ERR_BAD_OPERATION (rc 20, "Mismatch on administrative owner"), as seen in the first log line above.
The fix is to have the primary PBE be tolerant of receiving ERR_BAD_OPERATION on the prepare request, treating it the same way it treats ERR_NOT_EXIST for the slave PBE RTO not existing, or ERR_TRY_AGAIN for the slave PBE still being busy with some other transaction. A fix is also made to the pbeAbortTrans function to do nothing if the transaction is empty. diff --git a/osaf/libs/common/immsv/immpbe_dump.cc b/osaf/libs/common/immsv/immpbe_dump.cc --- a/osaf/libs/common/immsv/immpbe_dump.cc +++ b/osaf/libs/common/immsv/immpbe_dump.cc @@ -2818,6 +2818,11 @@ void pbeAbortTrans(void* db_handle) char *execErr=NULL; int rc=0; + if(sqliteTransLock == 0) { + LOG_WA("pbeAbortTrans was called when sqliteTransLock==0 -- ignoring abort"); + return; + } + rc = sqlite3_exec(dbHandle, "ROLLBACK", NULL, NULL, &execErr); if(rc != SQLITE_OK) { LOG_ER("SQL statement ('ROLLBACK') failed because:\n %s", @@ -2828,10 +2833,6 @@ void pbeAbortTrans(void* db_handle) exit(1); } - if(sqliteTransLock == 0) { - LOG_WA("pbeAbortTrans was called when sqliteTransLock==0"); - } - switch(sqliteTransLock) { case 3: --sqliteTransLock; diff --git a/osaf/services/saf/immsv/immpbed/immpbe_daemon.cc b/osaf/services/saf/immsv/immpbed/immpbe_daemon.cc --- a/osaf/services/saf/immsv/immpbed/immpbe_daemon.cc +++ b/osaf/services/saf/immsv/immpbed/immpbe_daemon.cc @@ -242,14 +242,18 @@ static bool pbe2_start_prepare_ccb_A_to_ rc2B = saImmOmAdminOperationInvoke_2(sOwnerHandle, &slavePbeRtObjName, 0, OPENSAF_IMM_PBE_CCB_PREPARE, params, &slavePbeRtReply, SA_TIME_ONE_SECOND * 10); - if(rc2B == SA_AIS_ERR_TRY_AGAIN || (rc2B==SA_AIS_OK && slavePbeRtReply==SA_AIS_ERR_TRY_AGAIN)) { + if(rc2B == SA_AIS_ERR_TRY_AGAIN || rc2B == SA_AIS_ERR_BAD_OPERATION || + (rc2B==SA_AIS_OK && slavePbeRtReply==SA_AIS_ERR_TRY_AGAIN)) { usleep(sleep_delay_ms * 1000); msecs_waited += sleep_delay_ms; LOG_NO("Slave PBE %u or Immsv (%u) replied with TRY_AGAIN on prepare for ccb:%llx/%llu", rc2B, slavePbeRtReply, ccbId, ccbId); } - /* Adjust 
the waiting time,a bove & below to be more appropriate .... */ - } while (((rc2B == SA_AIS_ERR_TRY_AGAIN) || (slavePbeRtReply == SA_AIS_ERR_TRY_AGAIN)) && (msecs_waited < 3000)); + /* Adjust the waiting time, above & below to be more appropriate .... + SA_AIS_ERR_BAD_OPERATION from immsv can happen ar slave PBE startup when + slave has created its RTO, but not yet set admin-owner for it. */ + } while (((rc2B == SA_AIS_ERR_TRY_AGAIN) || (rc2B == SA_AIS_ERR_BAD_OPERATION) || + (rc2B==SA_AIS_OK && slavePbeRtReply == SA_AIS_ERR_TRY_AGAIN)) && (msecs_waited < 3000)); if(rc2B != SA_AIS_OK) { if((rc2B == SA_AIS_ERR_NOT_EXIST) && (sNoStdFlags & OPENSAF_IMM_FLAG_2PBE1_ALLOW)) { @@ -294,7 +298,7 @@ static SaAisErrorT pbe2_ok_to_prepare_cc The runtime thread thus reads from the immutils structure only from a stable pointer verified to point at a ccb record with corrrect ccb-id. */ - LOG_WA("Missmatch on record for ccbId:%llx/%llu - thread interference problems ?", ccbId, ccbId); + LOG_WA("Mismatch on record for ccbId:%llx/%llu - thread interference problems ?", ccbId, ccbId); s2PbeBCcbUtilCcbData = NULL; rc = SA_AIS_ERR_TRY_AGAIN; goto done; ------------------------------------------------------------------------------ _______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel