When OpenSM is terminated umad_receiver thread still running even after
the structures are destroyed and freed, this causes to random (but easily
reproducible) crashes. The reason is that osm_vendor_delete() does not
care about thread termination. This patch adds the receiver thread
cancellation (by using pthread_cancel() and pthread_join()) and cares to
keep have all mutexes unlocked upon termination. There is also minor
termination code consolidation - osm_vendor_port_close() function.

Signed-off-by: Sasha Khapyorsky <[EMAIL PROTECTED]>
---
 osm/include/vendor/osm_vendor_ibumad.h |    6 +-
 osm/libvendor/osm_vendor_ibumad.c      |  157 +++++++++++++++-----------------
 osm/libvendor/osm_vendor_ibumad_sa.c   |    3 +-
 3 files changed, 77 insertions(+), 89 deletions(-)

diff --git a/osm/include/vendor/osm_vendor_ibumad.h 
b/osm/include/vendor/osm_vendor_ibumad.h
index 4cbd59f..f6e3d69 100644
--- a/osm/include/vendor/osm_vendor_ibumad.h
+++ b/osm/include/vendor/osm_vendor_ibumad.h
@@ -39,7 +39,6 @@
 
 #include <iba/ib_types.h>
 #include <complib/cl_qlist.h>
-#include <complib/cl_thread.h>
 #include <opensm/osm_base.h>
 #include <opensm/osm_log.h>
 
@@ -87,7 +86,6 @@ typedef struct _osm_ca_info
        ib_net64_t              guid;
        size_t                  attr_size;
        ib_ca_attr_t            *p_attr;
-
 } osm_ca_info_t;
 /*
 * FIELDS
@@ -170,8 +168,8 @@ typedef     struct _osm_vendor
        vendor_match_tbl_t mtbl;
        umad_ca_t umad_ca;
        umad_port_t umad_port;
-       cl_spinlock_t cb_lock;
-       cl_spinlock_t match_tbl_lock;
+       pthread_mutex_t cb_mutex;
+       pthread_mutex_t match_tbl_mutex;
        int umad_port_id;
        void *receiver;
        int issmfd;
diff --git a/osm/libvendor/osm_vendor_ibumad.c 
b/osm/libvendor/osm_vendor_ibumad.c
index 35f127a..7320738 100644
--- a/osm/libvendor/osm_vendor_ibumad.c
+++ b/osm/libvendor/osm_vendor_ibumad.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
  * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
@@ -58,13 +58,11 @@
 
 #include <unistd.h>
 #include <stdlib.h>
-#include <signal.h>
 #include <fcntl.h>
 #include <errno.h>
 
 #include <iba/ib_types.h>
 #include <complib/cl_qlist.h>
-#include <complib/cl_thread.h>
 #include <complib/cl_math.h>
 #include <complib/cl_debug.h>
 #include <opensm/osm_madw.h>
@@ -97,12 +95,13 @@ typedef struct _osm_umad_bind_info
 
 typedef struct _umad_receiver
 {
-       cl_event_t              signal;
-       cl_thread_t             receiver;
-       osm_vendor_t            *p_vend;
-       osm_log_t               *p_log;
+       pthread_t tid;
+       osm_vendor_t    *p_vend;
+       osm_log_t       *p_log;
 } umad_receiver_t;
 
+static void osm_vendor_close_port(osm_vendor_t* const p_vend);
+
 static void
 clear_madw(osm_vendor_t *p_vend)
 {
@@ -110,7 +109,7 @@ clear_madw(osm_vendor_t *p_vend)
        ib_net64_t old_tid;
 
        OSM_LOG_ENTER( p_vend->p_log, clear_madw );
-       cl_spinlock_acquire( &p_vend->match_tbl_lock );
+       pthread_mutex_lock(&p_vend->match_tbl_mutex);
        for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
                if (m->tid) {
                        old_m = m;
@@ -119,7 +118,7 @@ clear_madw(osm_vendor_t *p_vend)
                        osm_mad_pool_put(
                                ((osm_umad_bind_info_t *)((osm_madw_t 
*)m->v)->h_bind)->p_mad_pool,
                                m->v);
-                       cl_spinlock_release( &p_vend->match_tbl_lock );
+                       pthread_mutex_unlock(&p_vend->match_tbl_mutex);
                        osm_log(p_vend->p_log, OSM_LOG_ERROR,
                                "clear_madw: ERR 5401: "
                                "evicting entry %p (tid was 0x%"PRIx64")\n",
@@ -127,7 +126,7 @@ clear_madw(osm_vendor_t *p_vend)
                        goto Exit;
                }
        }
-       cl_spinlock_release( &p_vend->match_tbl_lock );
+       pthread_mutex_unlock(&p_vend->match_tbl_mutex);
 
 Exit:
        OSM_LOG_EXIT( p_vend->p_log );
@@ -147,18 +146,18 @@ get_madw(osm_vendor_t *p_vend, ib_net64_t *tid)
        if (mtid == 0)
                return 0;
 
-       cl_spinlock_acquire( &p_vend->match_tbl_lock );
+       pthread_mutex_lock(&p_vend->match_tbl_mutex);
        for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
                if (m->tid == mtid) {
                        m->tid = 0;
                        *tid = mtid;
                        res = m->v;
-                       cl_spinlock_release( &p_vend->match_tbl_lock );
+                       pthread_mutex_unlock(&p_vend->match_tbl_mutex);
                        return res;
                }
        }
 
-       cl_spinlock_release( &p_vend->match_tbl_lock );
+       pthread_mutex_unlock(&p_vend->match_tbl_mutex);
        return 0;
 }
 
@@ -171,13 +170,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, 
ib_net64_t *tid)
        ib_net64_t old_tid;
        uint32_t oldest = ~0;
 
-       cl_spinlock_acquire( &p_vend->match_tbl_lock );
+       pthread_mutex_lock(&p_vend->match_tbl_mutex);
        for (m = p_vend->mtbl.tbl, e = m + p_vend->mtbl.max; m < e; m++) {
                if (m->tid == 0) {
                        m->tid = *tid;
                        m->v = p_madw;
                        m->version = cl_atomic_inc((atomic32_t 
*)&p_vend->mtbl.last_version);
-                       cl_spinlock_release( &p_vend->match_tbl_lock );
+                       pthread_mutex_unlock(&p_vend->match_tbl_mutex);
                        return;
                }
                if (oldest > m->version) {
@@ -191,13 +190,13 @@ put_madw(osm_vendor_t *p_vend, osm_madw_t *p_madw, 
ib_net64_t *tid)
        p_req_madw = old_lru->v;
        p_bind = p_req_madw->h_bind;
        p_req_madw->status = IB_CANCELED;
-       cl_spinlock_acquire( &p_vend->cb_lock );
+       pthread_mutex_lock(&p_vend->cb_mutex);
        (*p_bind->send_err_callback)(p_bind->client_context, old_lru->v);
-       cl_spinlock_release( &p_vend->cb_lock );
+       pthread_mutex_unlock(&p_vend->cb_mutex);
        lru->tid = *tid;
        lru->v = p_madw;
        lru->version = cl_atomic_inc((atomic32_t *)&p_vend->mtbl.last_version);
-       cl_spinlock_release( &p_vend->match_tbl_lock );
+       pthread_mutex_unlock(&p_vend->match_tbl_mutex);
        osm_log(p_vend->p_log, OSM_LOG_ERROR,
                "put_madw: ERR 5402: "
                "evicting entry %p (tid was 0x%"PRIx64")\n", old_lru, old_tid);
@@ -237,7 +236,12 @@ swap_mad_bufs(osm_madw_t *p_madw, void *umad)
        return old;
 }
 
-void
+static void unlock_mutex(void *arg)
+{
+       pthread_mutex_unlock(arg);
+}
+
+void *
 umad_receiver(void *p_ptr)
 {
        umad_receiver_t* const p_ur = (umad_receiver_t *)p_ptr;
@@ -356,9 +360,10 @@ umad_receiver(void *p_ptr)
                        } else {
                                p_req_madw->status = IB_TIMEOUT;
                                /* cb frees req_madw */
-                               cl_spinlock_acquire( &p_vend->cb_lock );
+                               pthread_mutex_lock(&p_vend->cb_mutex);
+                               pthread_cleanup_push(unlock_mutex, 
&p_vend->cb_mutex);
                                
(*p_bind->send_err_callback)(p_bind->client_context, p_req_madw);
-                               cl_spinlock_release( &p_vend->cb_lock );
+                               pthread_cleanup_pop(1);
                        }
 
                        osm_mad_pool_put(p_bind->p_mad_pool, p_madw);
@@ -398,47 +403,37 @@ umad_receiver(void *p_ptr)
 #endif
 
                /* call the CB */
-               cl_spinlock_acquire( &p_vend->cb_lock );
+               pthread_mutex_lock(&p_vend->cb_mutex);
+               pthread_cleanup_push(unlock_mutex, &p_vend->cb_mutex);
                (*p_bind->mad_recv_callback)(p_madw, p_bind->client_context, 
p_req_madw);
-               cl_spinlock_release( &p_vend->cb_lock );
+               pthread_cleanup_pop(1);
        }
 
        OSM_LOG_EXIT( p_vend->p_log );
-       return;
+       return NULL;
 }
 
-static int
-umad_receiver_init(osm_vendor_t *p_vend)
+static int umad_receiver_start(osm_vendor_t *p_vend)
 {
        umad_receiver_t *p_ur = p_vend->receiver;
-       int r = -1;
-
-       OSM_LOG_ENTER( p_vend->p_log, umad_receiver_init );
 
        p_ur->p_vend = p_vend;
        p_ur->p_log = p_vend->p_log;
 
-       cl_event_construct(&p_ur->signal);
-       cl_thread_construct(&p_ur->receiver);
-
-       if (cl_event_init(&p_ur->signal, FALSE))
-               goto Exit;
-
-       /*
-        * Initialize the thread after all other dependent objects
-        * have been initialized.
-        */
-       if (cl_thread_init( &p_ur->receiver, umad_receiver, p_ur,
-                           "umad receiver" ))
-           goto Exit;
-
-       r = 0;  /* success */
+       if (pthread_create(&p_ur->tid, NULL, umad_receiver, p_ur) < 0)
+               return -1;
 
-Exit:
-       OSM_LOG_EXIT( p_vend->p_log );
-       return r;
+       return 0;
 }
 
+static void umad_receiver_stop(umad_receiver_t *p_ur)
+{
+       pthread_cancel(p_ur->tid);
+       pthread_join(p_ur->tid, NULL);
+       p_ur->tid = 0;
+       p_ur->p_vend = NULL;
+       p_ur->p_log = NULL;
+}
 /**********************************************************************
  **********************************************************************/
 ib_api_status_t
@@ -454,23 +449,11 @@ osm_vendor_init(
        p_vend->p_log = p_log;
        p_vend->timeout = timeout;
        p_vend->max_retries = OSM_DEFAULT_RETRY_COUNT;
-       cl_spinlock_construct( &p_vend->cb_lock );
-       cl_spinlock_construct( &p_vend->match_tbl_lock );
+       pthread_mutex_init(&p_vend->cb_mutex, NULL);
+       pthread_mutex_init(&p_vend->match_tbl_mutex, NULL);
        p_vend->umad_port_id = -1;
        p_vend->issmfd = -1;
 
-       if ((r = cl_spinlock_init( &p_vend->cb_lock ))) {
-               osm_log(p_vend->p_log, OSM_LOG_ERROR,
-                       "osm_vendor_init: ERR 5435: Error initializing cb 
spinlock\n");
-               goto Exit;
-       }
-
-       if ((r = cl_spinlock_init( &p_vend->match_tbl_lock ))) {
-               osm_log(p_vend->p_log, OSM_LOG_ERROR,
-                       "osm_vendor_init: ERR 5434: Error initializing match 
tbl spinlock\n");
-               goto Exit;
-       }
-       
        /*
         * Open our instance of UMAD.
         */
@@ -541,29 +524,14 @@ void
 osm_vendor_delete(
   IN osm_vendor_t** const pp_vend )
 {
-       umad_receiver_t *p_ur;
-       int agent_id;
-
-       if ((*pp_vend)->umad_port_id >= 0) {
-               /* unregister UMAD agents */
-               for (agent_id = 0; agent_id < UMAD_CA_MAX_AGENTS; agent_id++)
-                       if ( (*pp_vend)->agents[agent_id] )
-                               umad_unregister((*pp_vend)->umad_port_id,
-                                               agent_id );
-               umad_close_port((*pp_vend)->umad_port_id);
-               (*pp_vend)->umad_port_id = -1;
-       }
+       osm_vendor_close_port(*pp_vend);
 
        clear_madw( *pp_vend );
        /* make sure all ports are closed */
        umad_done();
 
-       /* umad receiver thread ? */
-       p_ur = (*pp_vend)->receiver;
-       if (p_ur)
-               cl_event_destroy( &p_ur->signal );
-       cl_spinlock_destroy( &(*pp_vend)->cb_lock );
-       cl_spinlock_destroy( &(*pp_vend)->match_tbl_lock );
+       pthread_mutex_destroy(&(*pp_vend)->cb_mutex);
+       pthread_mutex_destroy(&(*pp_vend)->match_tbl_mutex);
        free( *pp_vend );
        *pp_vend = NULL;
 }
@@ -780,7 +748,7 @@ osm_vendor_open_port(
                p_vend->umad_port_id = umad_port_id = -1;
                goto Exit;
        }
-       if (umad_receiver_init(p_vend) != 0) {
+       if (umad_receiver_start(p_vend) != 0) {
                osm_log( p_vend->p_log, OSM_LOG_ERROR,
                         "osm_vendor_open_port: ERR 5420: "
                         "umad_receiver_init failed\n" );
@@ -793,6 +761,27 @@ Exit:
        return umad_port_id;
 }
 
+static void osm_vendor_close_port(osm_vendor_t* const p_vend)
+{
+       umad_receiver_t *p_ur;
+       int i;
+
+       p_ur = p_vend->receiver;
+       p_vend->receiver = NULL;
+       if (p_ur) {
+               umad_receiver_stop(p_ur);
+               free(p_ur);
+       }
+
+       if (p_vend->umad_port_id >= 0) {
+               for (i = 0; i < UMAD_CA_MAX_AGENTS; i++)
+                       if (p_vend->agents[i])
+                               umad_unregister(p_vend->umad_port_id, i);
+               umad_close_port(p_vend->umad_port_id);
+               p_vend->umad_port_id = -1;
+       }
+}
+
 static int set_bit(int nr, void *method_mask)
 {
        int mask, retval;
@@ -985,10 +974,10 @@ osm_vendor_unbind(
 
        OSM_LOG_ENTER( p_vend->p_log, osm_vendor_unbind );
 
-       cl_spinlock_acquire( &p_vend->cb_lock );
+       pthread_mutex_lock(&p_vend->cb_mutex);
        p_bind->mad_recv_callback = __osm_vendor_recv_dummy_cb;
        p_bind->send_err_callback = __osm_vendor_send_err_dummy_cb;
-       cl_spinlock_release( &p_vend->cb_lock );
+       pthread_mutex_unlock(&p_vend->cb_mutex);
 
        OSM_LOG_EXIT( p_vend->p_log);
 }
@@ -1154,9 +1143,9 @@ Resp:
                        "Send p_madw = %p of size %d failed %d (%m)\n", 
                        p_madw, sent_mad_size, ret);
                p_madw->status = IB_ERROR;
-               cl_spinlock_acquire( &p_vend->cb_lock );
+               pthread_mutex_lock(&p_vend->cb_mutex);
                (*p_bind->send_err_callback)(p_bind->client_context, p_madw);   
/* cb frees madw */
-               cl_spinlock_release( &p_vend->cb_lock );
+               pthread_mutex_unlock(&p_vend->cb_mutex);
                goto Exit;
        }
 
diff --git a/osm/libvendor/osm_vendor_ibumad_sa.c 
b/osm/libvendor/osm_vendor_ibumad_sa.c
index e3978ef..a110e81 100644
--- a/osm/libvendor/osm_vendor_ibumad_sa.c
+++ b/osm/libvendor/osm_vendor_ibumad_sa.c
@@ -39,9 +39,10 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <sys/time.h>
 #include <vendor/osm_vendor_api.h>
 #include <vendor/osm_vendor_sa_api.h>
-#include <sys/time.h>
+#include <complib/cl_event.h>
 
 #define MAX_PORTS 64
 
-- 
1.5.0.1.40.gb40d


_______________________________________________
openib-general mailing list
openib-general@openib.org
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to