Ack, not tested. Will push it. Mathi.
> -----Original Message----- > From: Alex Jones [mailto:[email protected]] > Sent: Wednesday, June 03, 2015 2:07 AM > To: Mathivanan Naickan Palanivelu > Cc: [email protected] > Subject: [PATCH 1 of 1] plm: fix blade extraction mechanism [#1378] > > osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c | 204 +++++++----------- > ---- > osaf/services/saf/plmsv/plms/plms_amf.c | 19 +-- > 2 files changed, 71 insertions(+), 152 deletions(-) > > > Blade extraction does not work consistently. If you extract a blade that AMF > should reject (because it cannot fail over the services), the blade will > sometimes still deactivate. > > The standby plm daemon is not calling saHpiHotSwapPolicyCancel and > saHpiAutoExtractTimeoutSet for the resources. When openhpid on the > standby controller gets the pending extraction message from the shelf > manager, it happily allows the extraction to proceed because it doesn't have > the extraction policy set. The openhpid on the active controller has been > programmed by the active plm daemon to cancel the hot swap policy, and > set the auto extract timeout for the resource. Now there is a race condition: > which openhpid does the shelf manager respond to first? > > Both active and standby plm daemons need to call > saHpiHotSwapPolicyCancel and saHpiAutoExtractTimeoutSet, to make sure > that openhpid on both the active and standby controllers has the same > auto extract policy. 
> > diff --git a/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c > b/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c > --- a/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c > +++ b/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c > @@ -71,7 +71,6 @@ static SaUint32T hsm_get_idr_chassis_inf > SaHpiIdrIdT idr_id, > PLMS_INV_DATA *inv_data); > static SaUint32T hsm_session_reopen(); > -SaUint32T plms_hsm_session_close(); > static SaUint32T hsm_discover_and_dispatch(); static void *plms_hsm(); > > @@ -446,28 +445,10 @@ static void *plms_hsm(void) > SaHpiPowerStateT power_state; > SaUint32T retriev_idr_info = 0; > SaInt32T rc,ret; > - SaInt32T got_new_active = false; > + SaInt32T active = false; > > TRACE_ENTER(); > > - rc = pthread_mutex_lock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM: Failed to take hsm_ha_state lock, > exiting \ > - the thread, ret value:%d err:%s", rc, > strerror(errno)); > - assert(0); > - } > - if(hsm_ha_state.state != SA_AMF_HA_ACTIVE){ > - TRACE("HSM: Thread going to block till Active state is set"); > - > pthread_cond_wait(&hsm_ha_state.cond,&hsm_ha_state.mutex); > - } > - > - rc = pthread_mutex_unlock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM:Failed to unlock hsm_ha_state > lock,exiting \ > - the thread, ret value:%d err:%s", rc, > strerror(errno)); > - assert(0); > - } > - > /* Subscribe to receive events on this HPI session */ > rc = saHpiSubscribe(cb->session_id); > if( SA_OK != rc ){ > @@ -493,68 +474,9 @@ static void *plms_hsm(void) > > TRACE("HSM:Blocking to receive events on HPI session"); > while(true){ > - rc = pthread_mutex_lock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM: Failed to take hsm_ha_state lock, > exiting \ > - the thread, ret value:%d err:%s", rc, strerror(errno)); > - assert(0); > - } > - if(hsm_ha_state.state != SA_AMF_HA_ACTIVE){ > - /* Wait on condition variable for the HA role from > PLMS main thread */ > - TRACE("HSM:Received Standby state,thread going to > block till Active state is set"); > 
- > pthread_cond_wait(&hsm_ha_state.cond,&hsm_ha_state.mutex); > - got_new_active = true; > - } > - rc = pthread_mutex_unlock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM:Failed to unlock hsm_ha_state > lock,exiting \ > - the thread, ret value:%d err:%s", rc, strerror(errno)); > - assert(0); > - } > - if(got_new_active){ > - /* Open the session on New active*/ > - hsm_session_reopen(); > - > - /* Rediscover the resources */ > - hsm_discover_and_dispatch(); > - > - got_new_active = false; > - > - /* PLMC initialize */ > - if( !plms_cb->plmc_initialized ) { > - rc = > plmc_initialize(plms_plmc_connect_cbk,plms_plmc_udp_cbk,plms_plmc_err > or_cbk); > - if (rc) { > - LOG_ER("PLMC initialize failed"); > - rc = NCSCC_RC_FAILURE; > - exit(0); > - } > - plms_cb->plmc_initialized = true; > - TRACE("PLMC initialization Success."); > - } > - } > - > ret = saHpiEventGet(cb->session_id, > SAHPI_TIMEOUT_BLOCK, > &event, &rdr, &rpt_entry, NULL); > > - plms_send_hpi_evt_ntf(event.EventType, &event, > &(rpt_entry)); > - rc = pthread_mutex_lock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM: Failed to take hsm_ha_state > lock,exiting thread, ret value:%d err:%s",rc,strerror(errno)); > - assert(0); > - } > - if(hsm_ha_state.state != SA_AMF_HA_ACTIVE){ > - rc = pthread_mutex_unlock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM:Failed to unlock > hsm_ha_state,exiting thread,ret value:%d err:%s",rc,strerror(errno)); > - assert(0); > - } > - continue; > - } > - rc = pthread_mutex_unlock(&hsm_ha_state.mutex); > - if(rc){ > - LOG_CR("HSM:Failed to unlock hsm_ha_state,exiting > thread,ret value:%d err:%s",rc,strerror(errno)); > - assert(0); > - } > if( SA_OK != ret ){ > LOG_ER("HSM:saHpiEventGet failed, ret val > is:%d",rc); > /* Reopen the session */ > @@ -566,6 +488,21 @@ static void *plms_hsm(void) > > TRACE("HSM:Receieved event for res_id:%u Evt type:%u > ",rpt_entry.ResourceId,event.EventType); > > + rc = pthread_mutex_lock(&hsm_ha_state.mutex); > + if(rc){ > + LOG_CR("HSM: 
Failed to take hsm_ha_state > lock,exiting thread, ret value:%d err:%s",rc,strerror(errno)); > + assert(0); > + } > + active = (hsm_ha_state.state == SA_AMF_HA_ACTIVE) ? > true : false; > + rc = pthread_mutex_unlock(&hsm_ha_state.mutex); > + if(rc){ > + LOG_CR("HSM:Failed to unlock hsm_ha_state,exiting > thread,ret value:%d err:%s",rc,strerror(errno)); > + assert(0); > + } > + > + if (active) > + plms_send_hpi_evt_ntf(event.EventType, &event, > &(rpt_entry)); > + > if (event.EventType == SAHPI_ET_OEM) { > /* not currently supporting OEM events */ > continue; > @@ -612,6 +549,42 @@ static void *plms_hsm(void) > } > } > > + if (event.EventType == SAHPI_ET_HOTSWAP){ > + if(hotswap_state_model == > PLMS_HPI_FULL_FIVE_HOTSWAP_MODEL){ > + if > (event.EventDataUnion.HotSwapEvent.HotSwapState == > + > SAHPI_HS_STATE_EXTRACTION_PENDING || > + > event.EventDataUnion.HotSwapEvent.HotSwapState == > + > SAHPI_HS_STATE_INSERTION_PENDING){ > + /* Cancel the hotswap polcy */ > + rc = saHpiHotSwapPolicyCancel(cb- > >session_id,rpt_entry.ResourceId); > + if (SA_OK != rc) > + LOG_ER("Error taking control > of res:%d ret val:%d", > + > rpt_entry.ResourceId,rc); > + > + /* Set the AutoExtractionTimeout */ > + rc = saHpiAutoExtractTimeoutSet(cb- > >session_id,rpt_entry.ResourceId, > + cb- > >extr_pending_timeout); > + if (SA_OK != rc) > + > LOG_ER("AutoExtractTimeoutSet failed for res:%u ret val:%d", > + > rpt_entry.ResourceId,rc); > + > + } > + } > + > + if (active) { > + > hsm_send_hotswap_event(&rpt_entry,hotswap_state_model,even > t.EventDataUnion.HotSwapEvent.HotSwapState, > + > event.EventDataUnion.HotSwapEvent.PreviousHotSwapState,retrie > v_idr_info); > + } > + } > + > + /* > + * saHpiHotSwapPolicyCancel and > saHpiAutoExtractTimeoutSet need to be set on > + * both active and standby, but anything else is only done by > active > + */ > + if (!active) > + continue; > + > + > /* If it is a resource restore event( communication lost and > got restored immediately ) ,retrieve the 
hotswap state after > communication is restored */ > @@ -638,32 +611,6 @@ static void *plms_hsm(void) > retriev_idr_info); > } > } > - > - if (event.EventType == SAHPI_ET_HOTSWAP){ > - if(hotswap_state_model == > PLMS_HPI_FULL_FIVE_HOTSWAP_MODEL){ > - if > (event.EventDataUnion.HotSwapEvent.HotSwapState == > - > SAHPI_HS_STATE_EXTRACTION_PENDING || > - > event.EventDataUnion.HotSwapEvent.HotSwapState == > - > SAHPI_HS_STATE_INSERTION_PENDING){ > - /* Cancel the hotswap polcy */ > - rc = saHpiHotSwapPolicyCancel(cb- > >session_id,rpt_entry.ResourceId); > - if (SA_OK != rc) > - LOG_ER("Error taking control > of res:%d ret val:%d", > - > rpt_entry.ResourceId,rc); > - > - /* Set the AutoExtractionTimeout */ > - rc = saHpiAutoExtractTimeoutSet(cb- > >session_id,rpt_entry.ResourceId, > - cb- > >extr_pending_timeout); > - if (SA_OK != rc) > - > LOG_ER("AutoExtractTimeoutSet failed for res:%u ret val:%d", > - > rpt_entry.ResourceId,rc); > - > - } > - } > - > hsm_send_hotswap_event(&rpt_entry,hotswap_state_model,even > t.EventDataUnion.HotSwapEvent.HotSwapState, > - > event.EventDataUnion.HotSwapEvent.PreviousHotSwapState,retrie > v_idr_info); > - > - } > } > > TRACE_LEAVE(); > @@ -698,6 +645,7 @@ static SaUint32T hsm_discover_and_dispat > SaUint32T prev_domain_op_status = NCSCC_RC_SUCCESS; > SaUint32T rc = NCSCC_RC_SUCCESS; > static SaUint32T rpt_retry_count = 0; > + bool active = false; > > TRACE_ENTER(); > > @@ -742,13 +690,21 @@ static SaUint32T hsm_discover_and_dispat > plmscb->my_entity_path = 0; > #endif > > + rc = pthread_mutex_lock(&hsm_ha_state.mutex); > + if(rc){ > + LOG_CR("HSM: Failed to take hsm_ha_state lock,exiting > thread, ret value:%d err:%s",rc,strerror(errno)); > + assert(0); > + } > + active = (hsm_ha_state.state == SA_AMF_HA_ACTIVE) ? 
true : false; > + rc = pthread_mutex_unlock(&hsm_ha_state.mutex); > + if(rc){ > + LOG_CR("HSM:Failed to unlock hsm_ha_state,exiting > thread,ret value:%d err:%s",rc,strerror(errno)); > + assert(0); > + } > + > /* Process the list of RPT entries on this session */ > next = SAHPI_FIRST_ENTRY; > do{ > - > - if(hsm_ha_state.state == SA_AMF_HA_STANDBY) > - return NCSCC_RC_FAILURE; > - > current = next; > /* Get the RPT entry */ > rc = saHpiRptEntryGet(cb->session_id, current,&next, > &rpt_entry); @@ -869,8 +825,11 @@ static SaUint32T > hsm_discover_and_dispat > retriev_idr_info = true; > > /* Send the outstanding hot_swap event*/ > - hsm_send_hotswap_event(&rpt_entry, > hotswap_state_model, state, > - previous_state,retriev_idr_info); > + if (active) { > + hsm_send_hotswap_event(&rpt_entry, > hotswap_state_model, state, > + > previous_state,retriev_idr_info); > + } > + > if(SAHPI_LAST_ENTRY == next && > NCSCC_RC_SUCCESS == prev_domain_op_status ){ > /* Get the update count of domain_info*/ > @@ -1617,26 +1576,3 @@ static SaUint32T hsm_session_reopen() > TRACE_LEAVE(); > return NCSCC_RC_SUCCESS; > } > - > /********************************************************** > ************* > -* @brief This function closes HPI session > -* > -* @param[in] > -* > -* > -* @return NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE > - > ********************************************************** > *************/ > -SaUint32T plms_hsm_session_close() > -{ > - PLMS_HSM_CB *cb = hsm_cb; > - SaUint32T rc = 0; > - /* Close the HPI session */ > - rc = saHpiSessionClose(cb->session_id); > - if (SA_OK != rc){ > - LOG_ER("HSM:Close session return error: %d:\n",rc); > - return NCSCC_RC_FAILURE; > - } > - > - /* Reset the session_id */ > - cb->session_id = 0; > - return NCSCC_RC_SUCCESS; > -} > diff --git a/osaf/services/saf/plmsv/plms/plms_amf.c > b/osaf/services/saf/plmsv/plms/plms_amf.c > --- a/osaf/services/saf/plmsv/plms/plms_amf.c > +++ b/osaf/services/saf/plmsv/plms/plms_amf.c > @@ -266,7 +266,7 @@ 
plms_amf_CSI_set_callback(SaInvocationT > pthread_mutex_unlock(&hrb_ha_state.mutex); > } > /* PLMC initialize */ > - if(!cb->hpi_cfg.hpi_support && !cb->plmc_initialized){ > + if(!cb->plmc_initialized){ > TRACE("Initializing PLMC"); > rc = plmc_initialize(plms_plmc_connect_cbk, > plms_plmc_udp_cbk, @@ > -297,23 +297,6 @@ > plms_amf_CSI_set_callback(SaInvocationT > hrb_ha_state.state = SA_AMF_HA_STANDBY; > pthread_mutex_unlock(&hrb_ha_state.mutex); > > - SaUint32T (* hsm_func_ptr)() = NULL; > - if(cb->hpi_cfg.hpi_support){ > - /* Get the hsm Init func ptr */ > - hsm_func_ptr = dlsym(cb->hpi_intf_hdl, > "plms_hsm_session_close"); > - if ( NULL == hsm_func_ptr ) { > - LOG_ER("dlsym() failed to get the > hsm_func_ptr,error %s", dlerror()); > - goto response; > - } > - > - /* Initialize HSM */ > - rc = (* hsm_func_ptr)(); > - if ( NCSCC_RC_SUCCESS != rc ) { > - LOG_ER("plms_session_close failed"); > - goto response; > - } > - } > - > /* PLMC finalize */ > if(cb->plmc_initialized){ > rc = plmc_destroy(); > ------------------------------------------------------------------------------ Monitor 25 network devices or servers for free with OpManager! OpManager is web-based network management software that monitors network devices and physical & virtual servers, alerts via email & sms for fault. Monitor 25 devices for free with no restriction. Download now http://ad.doubleclick.net/ddm/clk/292181274;119417398;o _______________________________________________ Opensaf-devel mailing list [email protected] https://lists.sourceforge.net/lists/listinfo/opensaf-devel
