Hi Hal, Attached is a new patch with several fixes for this issue. I decided to remove the check for zero in the atomic_dec after all since, as I mentioned before, clearing it is not a fix, and we will see the value in other messages in the log file.
Thanks, Yael Signed-off-by: Yael Kalka <[EMAIL PROTECTED]> Index: include/opensm/osm_vl15intf.h =================================================================== --- include/opensm/osm_vl15intf.h (revision 3704) +++ include/opensm/osm_vl15intf.h (working copy) @@ -55,11 +55,13 @@ #include <complib/cl_event.h> #include <complib/cl_thread.h> #include <complib/cl_qlist.h> +#include <complib/cl_passivelock.h> #include <opensm/osm_stats.h> #include <opensm/osm_log.h> #include <opensm/osm_madw.h> #include <opensm/osm_mad_pool.h> #include <vendor/osm_vendor.h> +#include <opensm/osm_subnet.h> #ifdef __cplusplus # define BEGIN_C_DECLS extern "C" { @@ -137,6 +139,9 @@ typedef struct _osm_vl15 osm_vendor_t *p_vend; osm_log_t *p_log; osm_stats_t *p_stats; + osm_subn_t *p_subn; + cl_disp_reg_handle_t h_disp; + cl_plock_t *p_lock; } osm_vl15_t; /* @@ -176,6 +181,15 @@ typedef struct _osm_vl15 * p_stats * Pointer to the OpenSM statistics block. * +* p_subn +* Pointer to the Subnet object for this subnet. +* +* h_disp +* Handle returned from dispatcher registration. +* +* p_lock +* Pointer to the serializing lock. +* * SEE ALSO * VL15 object *********/ @@ -265,7 +279,10 @@ osm_vl15_init( IN osm_vendor_t* const p_vend, IN osm_log_t* const p_log, IN osm_stats_t* const p_stats, - IN const int32_t max_wire_smps ); + IN const int32_t max_wire_smps, + IN osm_subn_t* const p_subn, + IN cl_dispatcher_t* const p_disp, + IN cl_plock_t* const p_lock ); /* * PARAMETERS * p_vl15 @@ -283,6 +300,15 @@ osm_vl15_init( * max_wire_smps * [in] Maximum number of MADs allowed on the wire at one time. * +* p_subn +* [in] Pointer to the subnet object. +* +* p_disp +* [in] Pointer to the dispatcher object. +* +* p_lock +* [in] Pointer to the OpenSM serializing lock. +* * RETURN VALUES * IB_SUCCESS if the VL15 object was initialized successfully. 
* Index: opensm/osm_opensm.c =================================================================== --- opensm/osm_opensm.c (revision 3704) +++ opensm/osm_opensm.c (working copy) @@ -257,7 +257,8 @@ osm_opensm_init( status = osm_vl15_init( &p_osm->vl15, p_osm->p_vendor, - &p_osm->log, &p_osm->stats, p_opt->max_wire_smps ); + &p_osm->log, &p_osm->stats, p_opt->max_wire_smps, + &p_osm->subn, &p_osm->disp, &p_osm->lock ); if( status != IB_SUCCESS ) goto Exit; Index: opensm/osm_vl15intf.c =================================================================== --- opensm/osm_vl15intf.c (revision 3704) +++ opensm/osm_vl15intf.c (working copy) @@ -157,6 +157,8 @@ __osm_vl15_poller( if( status != IB_SUCCESS ) { + uint32_t outstanding; + cl_status_t cl_status; osm_log( p_vl->p_log, OSM_LOG_ERROR, "__osm_vl15_poller: ERR 3E03: " "MAD send failed (%s).\n", @@ -166,7 +168,69 @@ __osm_vl15_poller( The MAD was never successfully sent, so fix up the pre-incremented count values. */ + /* Decrement qp0_mads_sent and qp0_mads_outstanding_on_wire + that was incremented in the code above. */ mads_sent = cl_atomic_dec( &p_vl->p_stats->qp0_mads_sent ); + if( p_madw->resp_expected == TRUE ) + cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding_on_wire ); + + /* + The following code is similar to the one in + __osm_sm_mad_ctrl_retire_trans_mad. We need to decrement the + qp0_mads_outstanding counter, and if we reached 0 - need to call + the cl_disp_post with OSM_SIGNAL_NO_PENDING_TRANSACTION (in order + to wake up the state mgr). + */ + cl_atomic_dec( &p_vl->p_stats->qp0_mads_outstanding ); + + osm_log( p_vl->p_log, OSM_LOG_DEBUG, + "__osm_vl15_poller: " + "%u QP0 MADs outstanding.\n", + p_vl->p_stats->qp0_mads_outstanding ); + + /* + Acquire the lock non-exclusively. + Other modules that send MADs grab this lock exclusively. + These modules that are in the process of sending MADs + will hold the lock until they finish posting all the MADs + they plan to send. 
While the other module is sending MADs + the outstanding count may temporarily go to zero. + Thus, by grabbing the lock ourselves, we get an accurate + view of whether or not the number of outstanding MADs is + really zero. + */ + CL_PLOCK_ACQUIRE( p_vl->p_lock ); + outstanding = p_vl->p_stats->qp0_mads_outstanding; + CL_PLOCK_RELEASE( p_vl->p_lock ); + + if( outstanding == 0 ) + { + /* + The wire is clean. + Signal the state manager. + */ + if( osm_log_is_active( p_vl->p_log, OSM_LOG_DEBUG ) ) + { + osm_log( p_vl->p_log, OSM_LOG_DEBUG, + "__osm_vl15_poller: " + "Posting Dispatcher message %s.\n", + osm_get_disp_msg_str( OSM_MSG_NO_SMPS_OUTSTANDING ) ); + } + + cl_status = cl_disp_post( p_vl->h_disp, + OSM_MSG_NO_SMPS_OUTSTANDING, + (void *)OSM_SIGNAL_NO_PENDING_TRANSACTIONS, + NULL, + NULL ); + + if( cl_status != CL_SUCCESS ) + { + osm_log( p_vl->p_log, OSM_LOG_ERROR, + "__osm_vl15_poller: ERR 3E06: " + "Dispatcher post message failed (%s).\n", + CL_STATUS_MSG( cl_status ) ); + } + } } else { @@ -232,6 +296,7 @@ osm_vl15_construct( cl_qlist_init( &p_vl->rfifo ); cl_qlist_init( &p_vl->ufifo ); cl_thread_construct( &p_vl->poller ); + p_vl->h_disp = CL_DISP_INVALID_HANDLE; } /********************************************************************** @@ -281,6 +346,8 @@ osm_vl15_destroy( p_vl->state = OSM_VL15_STATE_INIT; cl_spinlock_destroy( &p_vl->lock ); + cl_disp_unregister( p_vl->h_disp ); + OSM_LOG_EXIT( p_vl->p_log ); } @@ -292,7 +359,11 @@ osm_vl15_init( IN osm_vendor_t* const p_vend, IN osm_log_t* const p_log, IN osm_stats_t* const p_stats, - IN const int32_t max_wire_smps ) + IN const int32_t max_wire_smps, + IN osm_subn_t* const p_subn, + IN cl_dispatcher_t* const p_disp, + IN cl_plock_t* const p_lock + ) { ib_api_status_t status = IB_SUCCESS; OSM_LOG_ENTER( p_log, osm_vl15_init ); @@ -301,6 +372,8 @@ osm_vl15_init( p_vl->p_log = p_log; p_vl->p_stats = p_stats; p_vl->max_wire_smps = max_wire_smps; + p_vl->p_subn = p_subn; + p_vl->p_lock = p_lock; status = 
cl_event_init( &p_vl->signal, FALSE ); if( status != IB_SUCCESS ) @@ -321,6 +394,21 @@ osm_vl15_init( if( status != IB_SUCCESS ) goto Exit; + p_vl->h_disp = cl_disp_register( + p_disp, + CL_DISP_MSGID_NONE, + NULL, + NULL ); + + if( p_vl->h_disp == CL_DISP_INVALID_HANDLE ) + { + osm_log( p_log, OSM_LOG_ERROR, + "osm_vl15_init: ERR 3E01: " + "Dispatcher registration failed.\n" ); + status = IB_INSUFFICIENT_RESOURCES; + goto Exit; + } + Exit: OSM_LOG_EXIT( p_log ); return( status ); _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general