While looking over Sean's changes, I noticed what look like a few bugs
in the MAD thread usage.  There didn't seem to be any way for the MAD
thread to stop, and I think there are a few race conditions that could
lead to lost wakeups.  This patch tries to fix both of these
problems.

I haven't tested this because I didn't feel like messing with the
Makefile to get it to build in my environment.  (It would be good to
switch to a standard kbuild Makefile so that things like cross-compiling
and separate object directories work.)

Thanks,
  Roland

Index: infiniband/access/ib_mad_priv.h
===================================================================
--- infiniband/access/ib_mad_priv.h     (revision 899)
+++ infiniband/access/ib_mad_priv.h     (working copy)
@@ -131,11 +131,6 @@
        struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
 };
 
-struct ib_mad_thread_private {
-       wait_queue_head_t       wait;
-       atomic_t                completion_event;
-};
-
 struct ib_mad_port_private {
        struct list_head port_list;
        struct ib_device *device;
@@ -159,7 +154,7 @@
        u32 recv_wr_index[IB_MAD_QPS_CORE];
 
        struct task_struct *mad_thread;
-       struct ib_mad_thread_private mad_thread_private;
+       int thread_wake;
 };
 
 #endif /* __IB_MAD_PRIV_H__ */
Index: infiniband/access/ib_mad.c
===================================================================
--- infiniband/access/ib_mad.c  (revision 899)
+++ infiniband/access/ib_mad.c  (working copy)
@@ -892,6 +892,8 @@
        struct ib_wc wc;
        int err_status = 0;
        
+       ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
        while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
                printk(KERN_DEBUG "Completion opcode 0x%x WRID 0x%Lx\n", wc.opcode, wc.wr_id);
                if (wc.status != IB_WC_SUCCESS) {
@@ -928,11 +930,8 @@
                }
        }
 
-       if (err_status) {
+       if (err_status)
                ib_mad_port_restart(port_priv);
-       } else {
-               ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
-       }
 }
 
 /*
@@ -941,23 +940,22 @@
 static int ib_mad_thread(void *param)
 {
        struct ib_mad_port_private *port_priv = param;
-       struct ib_mad_thread_private *mad_thread_priv = &port_priv->mad_thread_private;
-       int ret;
 
-       while (1) {
-               while (!signal_pending(current)) {
-                       ret = wait_event_interruptible(mad_thread_priv->wait,
-                                                      atomic_read(&mad_thread_priv->completion_event) > 0);
-                       atomic_set(&mad_thread_priv->completion_event, 0);
-                       if (ret) {
-                               printk(KERN_ERR "ib_mad thread exiting\n");
-                               return 0;
-                       }
+       __set_current_state(TASK_RUNNING);
 
-                       ib_mad_completion_handler(port_priv);
+       do {
+               port_priv->thread_wake = 0;
+               wmb();
 
-               }
-       }
+               ib_mad_completion_handler(port_priv);
+
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (!port_priv->thread_wake)
+                       schedule();
+               __set_current_state(TASK_RUNNING);
+       } while (!kthread_should_stop());
+
+       return 0;
 }
 
 /*
@@ -965,11 +963,8 @@
  */
 static int ib_mad_thread_init(struct ib_mad_port_private *port_priv)
 {
-       struct ib_mad_thread_private *mad_thread_priv = &port_priv->mad_thread_private;
+       port_priv->thread_wake = 0;
 
-       atomic_set(&mad_thread_priv->completion_event, 0);
-       init_waitqueue_head(&mad_thread_priv->wait);
-
        port_priv->mad_thread = kthread_create(ib_mad_thread,
                                               port_priv,
                                               "ib_mad(%6s-%-2d)",
@@ -978,27 +973,18 @@
        if (IS_ERR(port_priv->mad_thread)) {
                printk(KERN_ERR "Couldn't start ib_mad thread for %s port %d\n",
                       port_priv->device->name, port_priv->port_num);
-               return 1;
+               return PTR_ERR(port_priv->mad_thread);
        }       
-       wake_up_process(port_priv->mad_thread);
        return 0;
 }
 
-/*
- * Stop the IB MAD thread
- */
-static void ib_mad_thread_stop(struct ib_mad_port_private *port_priv)
-{
-       kthread_stop(port_priv->mad_thread);    /* !!! */
-}
-
 static void ib_mad_thread_completion_handler(struct ib_cq *cq)
 {
        struct ib_mad_port_private *port_priv = cq->cq_context;
-       struct ib_mad_thread_private *mad_thread_priv = &port_priv->mad_thread_private;
 
-       atomic_inc(&mad_thread_priv->completion_event);
-       wake_up_interruptible(&mad_thread_priv->wait);
+       port_priv->thread_wake = 1;
+       wmb();
+       wake_up_process(port_priv->mad_thread);
 }
 
 static int ib_mad_post_receive_mad(struct ib_mad_port_private *port_priv,
@@ -1527,7 +1513,7 @@
        spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
 
        ib_mad_port_stop(port_priv);
-       ib_mad_thread_stop(port_priv);
+       kthread_stop(port_priv->mad_thread);
        ib_destroy_qp(port_priv->qp[1]);
        ib_destroy_qp(port_priv->qp[0]);
        ib_dereg_mr(port_priv->mr);
_______________________________________________
openib-general mailing list
[EMAIL PROTECTED]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to