Ack,
Not tested. Will push it.

Mathi.

> -----Original Message-----
> From: Alex Jones [mailto:[email protected]]
> Sent: Wednesday, June 03, 2015 2:07 AM
> To: Mathivanan Naickan Palanivelu
> Cc: [email protected]
> Subject: [PATCH 1 of 1] plm: fix blade extraction mechanism [#1378]
> 
>  osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c |  204 +++++++-----------
> ----
>  osaf/services/saf/plmsv/plms/plms_amf.c          |   19 +--
>  2 files changed, 71 insertions(+), 152 deletions(-)
> 
> 
> Blade extraction does not work consistently. If you extract a blade and AMF
> rejects the extraction (because it cannot fail over the services), the blade
> will sometimes still deactivate.
> 
> The standby plm daemon is not calling saHpiHotSwapPolicyCancel and
> saHpiAutoExtractTimeoutSet for the resources. When openhpid on the
> standby controller gets the pending extraction message from the shelf
> manager, it happily allows the extraction to proceed because it doesn't have
> the extraction policy set. The openhpid on the active controller has been
> programmed by the active plm daemon to cancel the hot swap policy, and
> set the auto extract timeout for the resource. This creates a race condition:
> which openhpid does the shelf manager respond to first?
> 
> Both the active and standby plm daemons need to call
> saHpiHotSwapPolicyCancel and saHpiAutoExtractTimeoutSet, to make sure
> that the openhpid instances on both the active and standby controllers have
> the same auto extract policy.
> 
> diff --git a/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c
> b/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c
> --- a/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c
> +++ b/osaf/services/saf/plmsv/plms/hpi_intf/plms_hsm.c
> @@ -71,7 +71,6 @@ static SaUint32T hsm_get_idr_chassis_inf
>                                       SaHpiIdrIdT       idr_id,
>                                       PLMS_INV_DATA     *inv_data);
>  static SaUint32T hsm_session_reopen();
> -SaUint32T plms_hsm_session_close();
>  static SaUint32T hsm_discover_and_dispatch();  static void *plms_hsm();
> 
> @@ -446,28 +445,10 @@ static void *plms_hsm(void)
>       SaHpiPowerStateT  power_state;
>       SaUint32T         retriev_idr_info = 0;
>       SaInt32T          rc,ret;
> -     SaInt32T          got_new_active = false;
> +     SaInt32T          active = false;
> 
>       TRACE_ENTER();
> 
> -     rc = pthread_mutex_lock(&hsm_ha_state.mutex);
> -     if(rc){
> -                        LOG_CR("HSM: Failed to take hsm_ha_state lock, 
> exiting \
> -                        the thread, ret value:%d err:%s", rc, 
> strerror(errno));
> -                        assert(0);
> -        }
> -     if(hsm_ha_state.state != SA_AMF_HA_ACTIVE){
> -             TRACE("HSM: Thread going to block till Active state is set");
> -
>       pthread_cond_wait(&hsm_ha_state.cond,&hsm_ha_state.mutex);
> -     }
> -
> -     rc = pthread_mutex_unlock(&hsm_ha_state.mutex);
> -     if(rc){
> -                        LOG_CR("HSM:Failed to unlock hsm_ha_state 
> lock,exiting \
> -                        the thread, ret value:%d err:%s", rc, 
> strerror(errno));
> -                        assert(0);
> -        }
> -
>       /* Subscribe to receive events on this HPI session */
>       rc =  saHpiSubscribe(cb->session_id);
>       if( SA_OK != rc ){
> @@ -493,68 +474,9 @@ static void *plms_hsm(void)
> 
>       TRACE("HSM:Blocking to receive events on HPI session");
>       while(true){
> -             rc = pthread_mutex_lock(&hsm_ha_state.mutex);
> -             if(rc){
> -                     LOG_CR("HSM: Failed to take hsm_ha_state lock,
> exiting \
> -                     the thread, ret value:%d err:%s", rc, strerror(errno));
> -                     assert(0);
> -             }
> -             if(hsm_ha_state.state != SA_AMF_HA_ACTIVE){
> -                     /* Wait on condition variable for the HA role from
> PLMS main thread */
> -                     TRACE("HSM:Received Standby state,thread going to
> block till Active state is set");
> -
>       pthread_cond_wait(&hsm_ha_state.cond,&hsm_ha_state.mutex);
> -                     got_new_active = true;
> -             }
> -             rc = pthread_mutex_unlock(&hsm_ha_state.mutex);
> -             if(rc){
> -                     LOG_CR("HSM:Failed to unlock hsm_ha_state
> lock,exiting \
> -                     the thread, ret value:%d err:%s", rc, strerror(errno));
> -                     assert(0);
> -             }
> -             if(got_new_active){
> -                     /* Open the session on New active*/
> -                     hsm_session_reopen();
> -
> -                     /* Rediscover the resources */
> -                     hsm_discover_and_dispatch();
> -
> -                     got_new_active = false;
> -
> -                     /* PLMC initialize */
> -                     if( !plms_cb->plmc_initialized ) {
> -                             rc =
> plmc_initialize(plms_plmc_connect_cbk,plms_plmc_udp_cbk,plms_plmc_err
> or_cbk);
> -                             if (rc) {
> -                                     LOG_ER("PLMC initialize failed");
> -                                     rc = NCSCC_RC_FAILURE;
> -                                     exit(0);
> -                             }
> -                             plms_cb->plmc_initialized = true;
> -                             TRACE("PLMC initialization Success.");
> -                     }
> -             }
> -
>               ret = saHpiEventGet(cb->session_id,
> SAHPI_TIMEOUT_BLOCK,
>                                       &event, &rdr, &rpt_entry, NULL);
> 
> -             plms_send_hpi_evt_ntf(event.EventType, &event,
> &(rpt_entry));
> -             rc = pthread_mutex_lock(&hsm_ha_state.mutex);
> -             if(rc){
> -                     LOG_CR("HSM: Failed to take hsm_ha_state
> lock,exiting thread, ret value:%d err:%s",rc,strerror(errno));
> -                     assert(0);
> -             }
> -             if(hsm_ha_state.state != SA_AMF_HA_ACTIVE){
> -                     rc = pthread_mutex_unlock(&hsm_ha_state.mutex);
> -                     if(rc){
> -                             LOG_CR("HSM:Failed to unlock
> hsm_ha_state,exiting thread,ret value:%d err:%s",rc,strerror(errno));
> -                             assert(0);
> -                     }
> -                     continue;
> -             }
> -             rc = pthread_mutex_unlock(&hsm_ha_state.mutex);
> -             if(rc){
> -                     LOG_CR("HSM:Failed to unlock hsm_ha_state,exiting
> thread,ret value:%d err:%s",rc,strerror(errno));
> -                     assert(0);
> -             }
>               if( SA_OK != ret ){
>                       LOG_ER("HSM:saHpiEventGet failed, ret val
> is:%d",rc);
>                       /* Reopen the session */
> @@ -566,6 +488,21 @@ static void *plms_hsm(void)
> 
>               TRACE("HSM:Receieved event for res_id:%u Evt type:%u
> ",rpt_entry.ResourceId,event.EventType);
> 
> +             rc = pthread_mutex_lock(&hsm_ha_state.mutex);
> +             if(rc){
> +                     LOG_CR("HSM: Failed to take hsm_ha_state
> lock,exiting thread, ret value:%d err:%s",rc,strerror(errno));
> +                     assert(0);
> +             }
> +             active = (hsm_ha_state.state == SA_AMF_HA_ACTIVE) ?
> true : false;
> +             rc = pthread_mutex_unlock(&hsm_ha_state.mutex);
> +             if(rc){
> +                     LOG_CR("HSM:Failed to unlock hsm_ha_state,exiting
> thread,ret value:%d err:%s",rc,strerror(errno));
> +                     assert(0);
> +             }
> +
> +             if (active)
> +                     plms_send_hpi_evt_ntf(event.EventType, &event,
> &(rpt_entry));
> +
>               if (event.EventType == SAHPI_ET_OEM) {
>                       /* not currently supporting OEM events */
>                       continue;
> @@ -612,6 +549,42 @@ static void *plms_hsm(void)
>                       }
>               }
> 
> +             if (event.EventType == SAHPI_ET_HOTSWAP){
> +                     if(hotswap_state_model  ==
> PLMS_HPI_FULL_FIVE_HOTSWAP_MODEL){
> +                             if
> (event.EventDataUnion.HotSwapEvent.HotSwapState ==
> +
> SAHPI_HS_STATE_EXTRACTION_PENDING ||
> +
> event.EventDataUnion.HotSwapEvent.HotSwapState ==
> +
> SAHPI_HS_STATE_INSERTION_PENDING){
> +                                     /* Cancel the hotswap polcy */
> +                                     rc = saHpiHotSwapPolicyCancel(cb-
> >session_id,rpt_entry.ResourceId);
> +                                     if (SA_OK != rc)
> +                                             LOG_ER("Error taking control
> of res:%d ret val:%d",
> +
>       rpt_entry.ResourceId,rc);
> +
> +                                     /* Set the AutoExtractionTimeout */
> +                                     rc = saHpiAutoExtractTimeoutSet(cb-
> >session_id,rpt_entry.ResourceId,
> +                                                                     cb-
> >extr_pending_timeout);
> +                                     if (SA_OK != rc)
> +
>       LOG_ER("AutoExtractTimeoutSet failed for res:%u ret val:%d",
> +
>       rpt_entry.ResourceId,rc);
> +
> +                             }
> +                     }
> +
> +                     if (active) {
> +
>       hsm_send_hotswap_event(&rpt_entry,hotswap_state_model,even
> t.EventDataUnion.HotSwapEvent.HotSwapState,
> +
>       event.EventDataUnion.HotSwapEvent.PreviousHotSwapState,retrie
> v_idr_info);
> +                     }
> +             }
> +
> +             /*
> +              * saHpiHotSwapPolicyCancel and
> saHpiAutoExtractTimeoutSet need to be set on
> +              * both active and standby, but anything else is only done by
> active
> +              */
> +             if (!active)
> +                     continue;
> +
> +
>               /* If it is a resource restore event( communication lost and
>               got restored immediately ) ,retrieve the hotswap state after
>               communication is restored */
> @@ -638,32 +611,6 @@ static void *plms_hsm(void)
>                       retriev_idr_info);
>                       }
>               }
> -
> -             if (event.EventType == SAHPI_ET_HOTSWAP){
> -                     if(hotswap_state_model  ==
> PLMS_HPI_FULL_FIVE_HOTSWAP_MODEL){
> -                             if
> (event.EventDataUnion.HotSwapEvent.HotSwapState ==
> -
> SAHPI_HS_STATE_EXTRACTION_PENDING ||
> -
> event.EventDataUnion.HotSwapEvent.HotSwapState ==
> -
> SAHPI_HS_STATE_INSERTION_PENDING){
> -                                     /* Cancel the hotswap polcy */
> -                                     rc = saHpiHotSwapPolicyCancel(cb-
> >session_id,rpt_entry.ResourceId);
> -                                     if (SA_OK != rc)
> -                                             LOG_ER("Error taking control
> of res:%d ret val:%d",
> -
>       rpt_entry.ResourceId,rc);
> -
> -                                     /* Set the AutoExtractionTimeout */
> -                                     rc = saHpiAutoExtractTimeoutSet(cb-
> >session_id,rpt_entry.ResourceId,
> -                                                                     cb-
> >extr_pending_timeout);
> -                                     if (SA_OK != rc)
> -
>       LOG_ER("AutoExtractTimeoutSet failed for res:%u ret val:%d",
> -
>       rpt_entry.ResourceId,rc);
> -
> -                             }
> -                     }
> -
>       hsm_send_hotswap_event(&rpt_entry,hotswap_state_model,even
> t.EventDataUnion.HotSwapEvent.HotSwapState,
> -
>       event.EventDataUnion.HotSwapEvent.PreviousHotSwapState,retrie
> v_idr_info);
> -
> -             }
>       }
> 
>       TRACE_LEAVE();
> @@ -698,6 +645,7 @@ static SaUint32T hsm_discover_and_dispat
>       SaUint32T         prev_domain_op_status = NCSCC_RC_SUCCESS;
>       SaUint32T         rc = NCSCC_RC_SUCCESS;
>       static SaUint32T        rpt_retry_count = 0;
> +     bool              active = false;
> 
>       TRACE_ENTER();
> 
> @@ -742,13 +690,21 @@ static SaUint32T hsm_discover_and_dispat
>       plmscb->my_entity_path = 0;
>  #endif
> 
> +     rc = pthread_mutex_lock(&hsm_ha_state.mutex);
> +     if(rc){
> +             LOG_CR("HSM: Failed to take hsm_ha_state lock,exiting
> thread, ret value:%d err:%s",rc,strerror(errno));
> +             assert(0);
> +     }
> +     active = (hsm_ha_state.state == SA_AMF_HA_ACTIVE) ? true : false;
> +     rc = pthread_mutex_unlock(&hsm_ha_state.mutex);
> +     if(rc){
> +             LOG_CR("HSM:Failed to unlock hsm_ha_state,exiting
> thread,ret value:%d err:%s",rc,strerror(errno));
> +             assert(0);
> +     }
> +
>       /* Process the list of RPT entries on this session */
>       next = SAHPI_FIRST_ENTRY;
>       do{
> -
> -     if(hsm_ha_state.state == SA_AMF_HA_STANDBY)
> -                     return NCSCC_RC_FAILURE;
> -
>               current = next;
>               /* Get the RPT entry */
>               rc = saHpiRptEntryGet(cb->session_id, current,&next,
> &rpt_entry); @@ -869,8 +825,11 @@ static SaUint32T
> hsm_discover_and_dispat
>                       retriev_idr_info = true;
> 
>               /* Send the outstanding hot_swap event*/
> -             hsm_send_hotswap_event(&rpt_entry,
> hotswap_state_model, state,
> -                                     previous_state,retriev_idr_info);
> +             if (active) {
> +                     hsm_send_hotswap_event(&rpt_entry,
> hotswap_state_model, state,
> +
>       previous_state,retriev_idr_info);
> +             }
> +
>               if(SAHPI_LAST_ENTRY == next &&
>                        NCSCC_RC_SUCCESS == prev_domain_op_status ){
>                       /* Get the update count of domain_info*/
> @@ -1617,26 +1576,3 @@ static SaUint32T hsm_session_reopen()
>       TRACE_LEAVE();
>       return NCSCC_RC_SUCCESS;
>  }
> -
> /**********************************************************
> *************
> -* @brief      This function closes HPI session
> -*
> -* @param[in]
> -*
> -*
> -* @return    NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE
> -
> **********************************************************
> *************/
> -SaUint32T plms_hsm_session_close()
> -{
> -     PLMS_HSM_CB        *cb = hsm_cb;
> -     SaUint32T          rc = 0;
> -     /* Close the HPI session */
> -        rc = saHpiSessionClose(cb->session_id);
> -        if (SA_OK != rc){
> -             LOG_ER("HSM:Close session return error: %d:\n",rc);
> -             return NCSCC_RC_FAILURE;
> -     }
> -
> -     /* Reset the session_id */
> -     cb->session_id = 0;
> -     return NCSCC_RC_SUCCESS;
> -}
> diff --git a/osaf/services/saf/plmsv/plms/plms_amf.c
> b/osaf/services/saf/plmsv/plms/plms_amf.c
> --- a/osaf/services/saf/plmsv/plms/plms_amf.c
> +++ b/osaf/services/saf/plmsv/plms/plms_amf.c
> @@ -266,7 +266,7 @@ plms_amf_CSI_set_callback(SaInvocationT
>                       pthread_mutex_unlock(&hrb_ha_state.mutex);
>               }
>                  /* PLMC initialize */
> -                if(!cb->hpi_cfg.hpi_support && !cb->plmc_initialized){
> +                if(!cb->plmc_initialized){
>                          TRACE("Initializing PLMC");
>                          rc = plmc_initialize(plms_plmc_connect_cbk,
>                                                  plms_plmc_udp_cbk, @@ 
> -297,23 +297,6 @@
> plms_amf_CSI_set_callback(SaInvocationT
>               hrb_ha_state.state = SA_AMF_HA_STANDBY;
>               pthread_mutex_unlock(&hrb_ha_state.mutex);
> 
> -             SaUint32T (* hsm_func_ptr)() = NULL;
> -             if(cb->hpi_cfg.hpi_support){
> -                     /* Get the hsm Init func ptr */
> -                     hsm_func_ptr = dlsym(cb->hpi_intf_hdl,
> "plms_hsm_session_close");
> -                     if ( NULL == hsm_func_ptr ) {
> -                             LOG_ER("dlsym() failed to get the
> hsm_func_ptr,error %s", dlerror());
> -                             goto response;
> -                     }
> -
> -                     /* Initialize HSM */
> -                     rc = (* hsm_func_ptr)();
> -                     if ( NCSCC_RC_SUCCESS != rc ) {
> -                             LOG_ER("plms_session_close failed");
> -                             goto response;
> -                     }
> -             }
> -
>               /* PLMC finalize */
>               if(cb->plmc_initialized){
>                       rc = plmc_destroy();
> 

------------------------------------------------------------------------------
Monitor 25 network devices or servers for free with OpManager!
OpManager is web-based network management software that monitors 
network devices and physical & virtual servers, alerts via email & sms 
for fault. Monitor 25 devices for free with no restriction. Download now
http://ad.doubleclick.net/ddm/clk/292181274;119417398;o
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to