When OpenSM is terminated, the umad_receiver thread is still running even after the structures are destroyed and freed; this causes random (but easily reproducible) crashes. The reason is that osm_vendor_delete() does not care about thread termination. This patch adds receiver thread cancellation (by using pthread_cancel() and pthread_join()) and ensures that all mutexes are left unlocked upon termination. There is also a minor termination code consolidation - the osm_vendor_close_port() function.
Signed-off-by: Sasha Khapyorsky <[EMAIL PROTECTED]> --- osm/include/vendor/osm_vendor_ibumad.h | 6 +- osm/libvendor/osm_vendor_ibumad.c | 157 +++++++++++++++----------------- osm/libvendor/osm_vendor_ibumad_sa.c | 3 +- 3 files changed, 77 insertions(+), 89 deletions(-) diff --git a/osm/include/vendor/osm_vendor_ibumad.h b/osm/include/vendor/osm_vendor_ibumad.h index 4cbd59f..f6e3d69 100644 --- a/osm/include/vendor/osm_vendor_ibumad.h +++ b/osm/include/vendor/osm_vendor_ibumad.h @@ -39,7 +39,6 @@ #include <iba/ib_types.h> #include <complib/cl_qlist.h> -#include <complib/cl_thread.h> #include <opensm/osm_base.h> #include <opensm/osm_log.h> @@ -87,7 +86,6 @@ typedef struct _osm_ca_info ib_net64_t guid; size_t attr_size; ib_ca_attr_t *p_attr; - } osm_ca_info_t; /* * FIELDS @@ -170,8 +168,8 @@ typedef struct _osm_vendor vendor_match_tbl_t mtbl; umad_ca_t umad_ca; umad_port_t umad_port; - cl_spinlock_t cb_lock; - cl_spinlock_t match_tbl_lock; + pthread_mutex_t cb_mutex; + pthread_mutex_t match_tbl_mutex; int umad_port_id; void *receiver; int issmfd; diff --git a/osm/libvendor/osm_vendor_ibumad.c b/osm/libvendor/osm_vendor_ibumad.c index 35f127a..7320738 100644 --- a/osm/libvendor/osm_vendor_ibumad.c +++ b/osm/libvendor/osm_vendor_ibumad.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved. + * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved. * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved. * Copyright (c) 1996-2003 Intel Corporation. All rights reserved. 
* @@ -58,13 +58,11 @@ #include <unistd.h> #include <stdlib.h> -#include <signal.h> #include <fcntl.h> #include <errno.h> #include <iba/ib_types.h> #include <complib/cl_qlist.h> -#include <complib/cl_thread.h> #include <complib/cl_math.h> #include <complib/cl_debug.h> #include <opensm/osm_madw.h> @@ -97,12 +95,13 @@ typedef struct _osm_umad_bind_info typedef struct _umad_receiver { - cl_event_t signal; - cl_thread_t receiver; - osm_vendor_t *p_vend; - osm_log_t *p_log; + pthread_t tid; + osm_vendor_t *p_vend; + osm_log_t *p_log; } umad_receiver_t; +static void osm_vendor_close_port(osm_vendor_t* const p_vend); + static void clear_madw(osm_vendor_t *p_vend) { @@ -110,7 +109,7 @@ clear_madw(osm_vendor_t *p_vend) ib_net64_t old_tid; OSM_LOG_ENTER( p_vend->p_log, clear_madw ); - cl_spinlock_acquire( &p_vend->match_tbl_lock ); + pthread_mutex_lock(&p_vend->match_tbl_mutex); for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) { if (m->tid) { old_m = m; @@ -119,7 +118,7 @@ clear_madw(osm_vendor_t *p_vend) osm_mad_pool_put( ((osm_umad_bind_info_t *)((osm_madw_t *)m->v)->h_bind)->p_mad_pool, m->v); - cl_spinlock_release( &p_vend->match_tbl_lock ); + pthread_mutex_unlock(&p_vend->match_tbl_mutex); osm_log(p_vend->p_log, OSM_LOG_ERROR, "clear_madw: ERR 5401: " "evicting entry %p (tid was 0x%"PRIx64")\n", @@ -127,7 +126,7 @@ clear_madw(osm_vendor_t *p_vend) goto Exit; } } - cl_spinlock_release( &p_vend->match_tbl_lock ); + pthread_mutex_unlock(&p_vend->match_tbl_mutex); Exit: OSM_LOG_EXIT( p_vend->p_log ); @@ -147,18 +146,18 @@ get_madw(osm_vendor_t *p_vend, ib_net64_t *tid) if (mtid == 0) return 0; - cl_spinlock_acquire( &p_vend->match_tbl_lock ); + pthread_mutex_lock(&p_vend->match_tbl_mutex); for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) { if (m->tid == mtid) { m->tid = 0; *tid = mtid; res = m->v; - cl_spinlock_release( &p_vend->match_tbl_lock ); + pthread_mutex_unlock(&p_vend->match_tbl_mutex); return res; } } - cl_spinlock_release( 
&p_vend->match_tbl_lock ); + pthread_mutex_unlock(&p_vend->match_tbl_mutex); return 0; } @@ -171,13 +170,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, ib_net64_t *tid) ib_net64_t old_tid; uint32_t oldest = ~0; - cl_spinlock_acquire( &p_vend->match_tbl_lock ); + pthread_mutex_lock(&p_vend->match_tbl_mutex); for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) { if (m->tid == 0) { m->tid = *tid; m->v = p_madw; m->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version); - cl_spinlock_release( &p_vend->match_tbl_lock ); + pthread_mutex_unlock(&p_vend->match_tbl_mutex); return; } if (oldest > m->version) { @@ -191,13 +190,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, ib_net64_t *tid) p_req_madw = old_lru->v; p_bind = p_req_madw->h_bind; p_req_madw->status = IB_CANCELED; - cl_spinlock_acquire( &p_vend->cb_lock ); + pthread_mutex_lock(&p_vend->cb_mutex); (*p_bind->send_err_callback)(p_bind->client_context, old_lru->v); - cl_spinlock_release( &p_vend->cb_lock ); + pthread_mutex_unlock(&p_vend->cb_mutex); lru->tid = *tid; lru->v = p_madw; lru->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version); - cl_spinlock_release( &p_vend->match_tbl_lock ); + pthread_mutex_unlock(&p_vend->match_tbl_mutex); osm_log(p_vend->p_log, OSM_LOG_ERROR, "put_madw: ERR 5402: " "evicting entry %p (tid was 0x%"PRIx64")\n", old_lru, old_tid); @@ -237,7 +236,12 @@ swap_mad_bufs(osm_madw_t *p_madw, void *umad) return old; } -void +static void unlock_mutex(void *arg) +{ + pthread_mutex_unlock(arg); +} + +void * umad_receiver(void *p_ptr) { umad_receiver_t* const p_ur = (umad_receiver_t *)p_ptr; @@ -356,9 +360,10 @@ umad_receiver(void *p_ptr) } else { p_req_madw->status = IB_TIMEOUT; /* cb frees req_madw */ - cl_spinlock_acquire( &p_vend->cb_lock ); + pthread_mutex_lock(&p_vend->cb_mutex); + pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex); (*p_bind->send_err_callback)(p_bind->client_context, p_req_madw); - cl_spinlock_release( 
&p_vend->cb_lock ); + pthread_cleanup_pop(1); } osm_mad_pool_put(p_bind->p_mad_pool, p_madw); @@ -398,47 +403,37 @@ umad_receiver(void *p_ptr) #endif /* call the CB */ - cl_spinlock_acquire( &p_vend->cb_lock ); + pthread_mutex_lock(&p_vend->cb_mutex); + pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex); (*p_bind->mad_recv_callback)(p_madw, p_bind->client_context, p_req_madw); - cl_spinlock_release( &p_vend->cb_lock ); + pthread_cleanup_pop(1); } OSM_LOG_EXIT( p_vend->p_log ); - return; + return NULL; } -static int -umad_receiver_init(osm_vendor_t *p_vend) +static int umad_receiver_start(osm_vendor_t *p_vend) { umad_receiver_t *p_ur = p_vend->receiver; - int r = -1; - - OSM_LOG_ENTER( p_vend->p_log, umad_receiver_init ); p_ur->p_vend = p_vend; p_ur->p_log = p_vend->p_log; - cl_event_construct(&p_ur->signal); - cl_thread_construct(&p_ur->receiver); - - if (cl_event_init(&p_ur->signal, FALSE)) - goto Exit; - - /* - * Initialize the thread after all other dependent objects - * have been initialized. 
- */ - if (cl_thread_init( &p_ur->receiver, umad_receiver, p_ur, - "umad receiver" )) - goto Exit; - - r = 0; /* success */ + if (pthread_create(&p_ur->tid, NULL, umad_receiver, p_ur) < 0) + return -1; -Exit: - OSM_LOG_EXIT( p_vend->p_log ); - return r; + return 0; } +static void umad_receiver_stop(umad_receiver_t *p_ur) +{ + pthread_cancel(p_ur->tid); + pthread_join(p_ur->tid, NULL); + p_ur->tid = 0; + p_ur->p_vend = NULL; + p_ur->p_log = NULL; +} /********************************************************************** **********************************************************************/ ib_api_status_t @@ -454,23 +449,11 @@ osm_vendor_init( p_vend->p_log = p_log; p_vend->timeout = timeout; p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT; - cl_spinlock_construct( &p_vend->cb_lock ); - cl_spinlock_construct( &p_vend->match_tbl_lock ); + pthread_mutex_init(&p_vend->cb_mutex, NULL); + pthread_mutex_init(&p_vend->match_tbl_mutex, NULL); p_vend->umad_port_id = -1; p_vend->issmfd = -1; - if ((r = cl_spinlock_init( &p_vend->cb_lock ))) { - osm_log(p_vend->p_log, OSM_LOG_ERROR, - "osm_vendor_init: ERR 5435: Error initializing cb spinlock\n"); - goto Exit; - } - - if ((r = cl_spinlock_init( &p_vend->match_tbl_lock ))) { - osm_log(p_vend->p_log, OSM_LOG_ERROR, - "osm_vendor_init: ERR 5434: Error initializing match tbl spinlock\n"); - goto Exit; - } - /* * Open our instance of UMAD. */ @@ -541,29 +524,14 @@ void osm_vendor_delete( IN osm_vendor_t** const pp_vend ) { - umad_receiver_t *p_ur; - int agent_id; - - if ((*pp_vend)->umad_port_id >= 0) { - /* unregister UMAD agents */ - for (agent_id = 0; agent_id < UMAD_CA_MAX_AGENTS; agent_id++) - if ( (*pp_vend)->agents[agent_id] ) - umad_unregister((*pp_vend)->umad_port_id, - agent_id ); - umad_close_port((*pp_vend)->umad_port_id); - (*pp_vend)->umad_port_id = -1; - } + osm_vendor_close_port(*pp_vend); clear_madw( *pp_vend ); /* make sure all ports are closed */ umad_done(); - /* umad receiver thread ? 
*/ - p_ur = (*pp_vend)->receiver; - if (p_ur) - cl_event_destroy( &p_ur->signal ); - cl_spinlock_destroy( &(*pp_vend)->cb_lock ); - cl_spinlock_destroy( &(*pp_vend)->match_tbl_lock ); + pthread_mutex_destroy(&(*pp_vend)->cb_mutex); + pthread_mutex_destroy(&(*pp_vend)->match_tbl_mutex); free( *pp_vend ); *pp_vend = NULL; } @@ -780,7 +748,7 @@ osm_vendor_open_port( p_vend->umad_port_id = umad_port_id = -1; goto Exit; } - if (umad_receiver_init(p_vend) != 0) { + if (umad_receiver_start(p_vend) != 0) { osm_log( p_vend->p_log, OSM_LOG_ERROR, "osm_vendor_open_port: ERR 5420: " "umad_receiver_init failed\n" ); @@ -793,6 +761,27 @@ Exit: return umad_port_id; } +static void osm_vendor_close_port(osm_vendor_t* const p_vend) +{ + umad_receiver_t *p_ur; + int i; + + p_ur = p_vend->receiver; + p_vend->receiver = NULL; + if (p_ur) { + umad_receiver_stop(p_ur); + free(p_ur); + } + + if (p_vend->umad_port_id >= 0) { + for (i = 0; i < UMAD_CA_MAX_AGENTS; i++) + if (p_vend->agents[i]) + umad_unregister(p_vend->umad_port_id, i); + umad_close_port(p_vend->umad_port_id); + p_vend->umad_port_id = -1; + } +} + static int set_bit(int nr, void *method_mask) { int mask, retval; @@ -985,10 +974,10 @@ osm_vendor_unbind( OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unbind ); - cl_spinlock_acquire( &p_vend->cb_lock ); + pthread_mutex_lock(&p_vend->cb_mutex); p_bind->mad_recv_callback = __osm_vendor_recv_dummy_cb; p_bind->send_err_callback = __osm_vendor_send_err_dummy_cb; - cl_spinlock_release( &p_vend->cb_lock ); + pthread_mutex_unlock(&p_vend->cb_mutex); OSM_LOG_EXIT( p_vend->p_log); } @@ -1154,9 +1143,9 @@ Resp: "Send p_madw = %p of size %d failed %d (%m)\n", p_madw, sent_mad_size, ret); p_madw->status = IB_ERROR; - cl_spinlock_acquire( &p_vend->cb_lock ); + pthread_mutex_lock(&p_vend->cb_mutex); (*p_bind->send_err_callback)(p_bind->client_context, p_madw); /* cb frees madw */ - cl_spinlock_release( &p_vend->cb_lock ); + pthread_mutex_unlock(&p_vend->cb_mutex); goto Exit; } diff --git 
a/osm/libvendor/osm_vendor_ibumad_sa.c b/osm/libvendor/osm_vendor_ibumad_sa.c index e3978ef..a110e81 100644 --- a/osm/libvendor/osm_vendor_ibumad_sa.c +++ b/osm/libvendor/osm_vendor_ibumad_sa.c @@ -39,9 +39,10 @@ #include <stdlib.h> #include <string.h> +#include <sys/time.h> #include <vendor/osm_vendor_api.h> #include <vendor/osm_vendor_sa_api.h> -#include <sys/time.h> +#include <complib/cl_event.h> #define MAX_PORTS 64 -- 1.5.0.1.40.gb40d _______________________________________________ openib-general mailing list openib-general@openib.org http://openib.org/mailman/listinfo/openib-general To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general