Find attached a fix, that will maybe solve the problem.
Maybe, because i haven't time to check it and this work week (down here)
is ended.
I'll check the patch next week.
 
About the patch.
 
First of all, special thanks to Anatoly for the right pointing to 1435
patch.
 
To recall, 1435 patch has improved event notification mechanism for cq,
qp and srq objects.
I found one problem in the patch, which repeats itself for all three
objects and for both drivers: new event handlers get the old (and wrong)
context values.
The new (and right) context values are nor used. As a result, IBAL
callbacks are called with wrong handle parameter, which ends up with
crash.
 
 
Index: hw/mlx4/kernel/bus/inc/ib_verbs.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs.h (revision 1452)
+++ hw/mlx4/kernel/bus/inc/ib_verbs.h (working copy)
@@ -742,7 +742,6 @@
  void *             cq_context;
  int                cqe;
  atomic_t           usecnt; /* count number of work queues */
- struct ib_cq_ex    x;
 };
 
 struct ib_srq {
@@ -752,7 +751,6 @@
  void        (*event_handler)(ib_event_rec_t *);
  void         *srq_context;
  atomic_t  usecnt;
- struct ib_srq_ex        x;
 };
 
 struct ib_qp {
@@ -766,7 +764,6 @@
  void         *qp_context;
  u32   qp_num;
  enum ib_qp_type  qp_type;
- struct ib_qp_ex         x;
 };
 
 struct ib_mr {
Index: hw/mlx4/kernel/bus/inc/ib_verbs_ex.h
===================================================================
--- hw/mlx4/kernel/bus/inc/ib_verbs_ex.h (revision 1452)
+++ hw/mlx4/kernel/bus/inc/ib_verbs_ex.h (working copy)
@@ -73,24 +73,6 @@
  int     fw_if_open;
 };
 
-/* extension for ib_cq */
-struct ib_cq_ex 
-{
- void *    ctx;  /* IBAL CQ context */
-};
-
-/* extension for ib_qp */
-struct ib_qp_ex 
-{
- void *    ctx;  /* IBAL QP context */
-};
-
-/* extension for ib_srq */
-struct ib_srq_ex 
-{
- void *    ctx;  /* IBAL SRQ context */
-};
-
 /* extension for ib_event */
 struct ib_event_ex 
 {
Index: hw/mlx4/kernel/hca/cq.c
===================================================================
--- hw/mlx4/kernel/hca/cq.c (revision 1452)
+++ hw/mlx4/kernel/hca/cq.c (working copy)
@@ -89,7 +89,7 @@
  // allocate cq 
  p_ib_cq = ibv_create_cq(p_ibdev, 
   cq_comp_handler, event_handler,
-  p_hca, *p_size, p_uctx, p_umv_buf );
+  (void*)cq_context, *p_size, p_uctx, p_umv_buf );
  if (IS_ERR(p_ib_cq)) {
   err = PTR_ERR(p_ib_cq);
   HCA_PRINT (TRACE_LEVEL_ERROR ,HCA_DBG_CQ, ("ibv_create_cq failed
(%d)\n", err));
@@ -97,9 +97,6 @@
   goto err_create_cq;
  }
 
- // fill the object
- p_ib_cq->x.ctx = (void*)cq_context;
- 
  // return the result
  *p_size = p_ib_cq->cqe;
 
Index: hw/mlx4/kernel/hca/qp.c
===================================================================
--- hw/mlx4/kernel/hca/qp.c (revision 1452)
+++ hw/mlx4/kernel/hca/qp.c (working copy)
@@ -100,8 +100,6 @@
  struct ib_qp_init_attr qp_init_attr;
  struct ib_ucontext *p_uctx = NULL;
  struct ib_pd *p_ib_pd = (struct ib_pd *)h_pd;
- struct ib_device *p_ib_dev = p_ib_pd->device;
- mlnx_hca_t *p_hca = ibdev2hca(p_ib_dev);
  struct ibv_create_qp *p_req = NULL;
  
  HCA_ENTER(HCA_DBG_QP);
@@ -121,7 +119,7 @@
  // prepare the parameters
  RtlZeroMemory(&qp_init_attr, sizeof(qp_init_attr));
  qp_init_attr.event_handler = event_handler;
- qp_init_attr.qp_context = p_hca;
+ qp_init_attr.qp_context = (void*)qp_uctx;
  qp_init_attr.send_cq = (struct ib_cq *)p_create_attr->h_sq_cq;
  qp_init_attr.recv_cq = (struct ib_cq *)p_create_attr->h_rq_cq;
  qp_init_attr.srq = (struct ib_srq *)p_create_attr->h_srq;
@@ -153,9 +151,6 @@
   goto err_create_qp;
  }
 
- // fill the object
- p_ib_qp->x.ctx = (void*)qp_uctx;
-
  // Query QP to obtain requested attributes
  if (p_qp_attr) {
   status = mlnx_query_qp((ib_qp_handle_t)p_ib_qp, p_qp_attr,
p_umv_buf);
Index: hw/mlx4/kernel/hca/srq.c
===================================================================
--- hw/mlx4/kernel/hca/srq.c (revision 1452)
+++ hw/mlx4/kernel/hca/srq.c (working copy)
@@ -55,8 +55,6 @@
  struct ib_srq_init_attr srq_init_attr;
  struct ib_ucontext *p_uctx = NULL;
  struct ib_pd *p_ib_pd = (struct ib_pd *)h_pd;
- struct ib_device *p_ib_dev = p_ib_pd->device;
- mlnx_hca_t *p_hca = ibdev2hca(p_ib_dev);
 
  HCA_ENTER(HCA_DBG_SRQ);
 
@@ -75,7 +73,7 @@
  // prepare the parameters
  RtlZeroMemory(&srq_init_attr, sizeof(srq_init_attr));
  srq_init_attr.event_handler = event_handler;
- srq_init_attr.srq_context = p_hca;
+ srq_init_attr.srq_context = (void*)srq_context;
  srq_init_attr.attr.max_wr = p_srq_attr->max_wr;
  srq_init_attr.attr.max_sge = p_srq_attr->max_sge;
  srq_init_attr.attr.srq_limit = p_srq_attr->srq_limit;
@@ -88,7 +86,6 @@
   status = errno_to_iberr(err);
   goto err_create_srq;
  }
- p_ib_srq->x.ctx = (void*)srq_context;
 
  // return the result
  if (ph_srq) *ph_srq = (ib_srq_handle_t)p_ib_srq;
Index: hw/mthca/kernel/hca_verbs.c
===================================================================
--- hw/mthca/kernel/hca_verbs.c (revision 1452)
+++ hw/mthca/kernel/hca_verbs.c (working copy)
@@ -870,12 +870,10 @@
  int err;
  ib_api_status_t  status;
  struct ib_srq *ib_srq_p;
- struct mthca_srq *srq_p;
  struct ib_srq_init_attr srq_init_attr;
  struct ib_ucontext *p_context = NULL;
  struct ib_pd *ib_pd_p = (struct ib_pd *)h_pd;
  struct ib_device *ib_dev = ib_pd_p->device;
- mlnx_hob_t  *hob_p = HOB_FROM_IBDEV(ib_dev);
 
  HCA_ENTER(HCA_DBG_SRQ);
 
@@ -894,7 +892,7 @@
  // prepare the parameters
  RtlZeroMemory(&srq_init_attr, sizeof(srq_init_attr));
  srq_init_attr.event_handler = event_handler;
- srq_init_attr.srq_context = hob_p;
+ srq_init_attr.srq_context = (void*)srq_context;
  srq_init_attr.attr = *p_srq_attr;
 
  // allocate srq 
@@ -906,12 +904,8 @@
   goto err_create_srq;
  }
 
- // fill the object
- srq_p = (struct mthca_srq *)ib_srq_p;
- srq_p->srq_context = (void*)srq_context;
- 
  // return the result
- if (ph_srq) *ph_srq = (ib_srq_handle_t)srq_p;
+ if (ph_srq) *ph_srq = (ib_srq_handle_t)ib_srq_p;
 
  status = IB_SUCCESS;
  
@@ -1044,7 +1038,6 @@
  struct ib_ucontext *p_context = NULL;
  struct ib_pd *ib_pd_p = (struct ib_pd *)h_pd;
  struct ib_device *ib_dev = ib_pd_p->device;
- mlnx_hob_t  *hob_p = HOB_FROM_IBDEV(ib_dev);
  
  HCA_ENTER(HCA_DBG_QP);
 
@@ -1063,7 +1056,7 @@
  RtlZeroMemory(&qp_init_attr, sizeof(qp_init_attr));
  qp_init_attr.qp_type = p_create_attr->qp_type;
  qp_init_attr.event_handler = event_handler;
- qp_init_attr.qp_context = hob_p;
+ qp_init_attr.qp_context = (void*)qp_context;
  qp_init_attr.recv_cq = (struct ib_cq *)p_create_attr->h_rq_cq;
  qp_init_attr.send_cq = (struct ib_cq *)p_create_attr->h_sq_cq;
  qp_init_attr.srq = (struct ib_srq *)p_create_attr->h_srq;
@@ -1087,7 +1080,6 @@
 
  // fill the object
  qp_p = (struct mthca_qp *)ib_qp_p;
- qp_p->qp_context = (void*)qp_context;
  qp_p->qp_init_attr = qp_init_attr;
 
  // Query QP to obtain requested attributes
@@ -1401,7 +1393,6 @@
  int err;
  ib_api_status_t  status;
  struct ib_cq *ib_cq_p;
- struct mthca_cq *cq_p;
  mlnx_hob_t   *hob_p;
  struct ib_device *ib_dev;
  struct ib_ucontext *p_context;
@@ -1437,7 +1428,7 @@
  // allocate cq 
  ib_cq_p = ibv_create_cq(ib_dev, 
   cq_comp_handler, event_handler,
-  hob_p, *p_size, p_context, p_umv_buf );
+  (void*)cq_context, *p_size, p_context, p_umv_buf );
  if (IS_ERR(ib_cq_p)) {
   err = PTR_ERR(ib_cq_p);
   HCA_PRINT (TRACE_LEVEL_ERROR ,HCA_DBG_CQ, ("ibv_create_cq failed
(%d)\n", err));
@@ -1445,15 +1436,11 @@
   goto err_create_cq;
  }
 
- // fill the object
- cq_p = (struct mthca_cq *)ib_cq_p;
- cq_p->cq_context = (void*)cq_context;
- 
  // return the result
 // *p_size = *p_size; // return the same value
  *p_size = ib_cq_p->cqe;
 
- if (ph_cq) *ph_cq = (ib_cq_handle_t)cq_p;
+ if (ph_cq) *ph_cq = (ib_cq_handle_t)ib_cq_p;
 
  status = IB_SUCCESS;
  
Index: hw/mthca/kernel/mthca_cq.c
===================================================================
--- hw/mthca/kernel/mthca_cq.c (revision 1452)
+++ hw/mthca/kernel/mthca_cq.c (working copy)
@@ -237,7 +237,7 @@
    ++cq->arm_sn;
  }
 
- cq->ibcq.comp_handler(cq->cq_context);
+ cq->ibcq.comp_handler(cq->ibcq.cq_context);
 }
 
 void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
Index: hw/mthca/kernel/mthca_provider.h
===================================================================
--- hw/mthca/kernel/mthca_provider.h (revision 1452)
+++ hw/mthca/kernel/mthca_provider.h (working copy)
@@ -189,7 +189,6 @@
 
 struct mthca_cq {
  struct ib_cq           ibcq;
- void      *cq_context; // leo: for IBAL shim
  spinlock_t             lock;
  atomic_t               refcount;
  int                    cqn;
@@ -234,7 +233,6 @@
 
  wait_queue_head_t wait;
  KMUTEX   mutex;
- void    *srq_context; 
 };
 
 struct mthca_wq {
@@ -254,7 +252,6 @@
 
 struct mthca_qp {
  struct ib_qp           ibqp;
- void      *qp_context; // leo: for IBAL shim
  //TODO: added just because absense of ibv_query_qp
  // thereafter it may be worth to be replaced by struct ib_qp_attr
qp_attr;
  struct ib_qp_init_attr qp_init_attr; // leo: for query_qp

 
 
 


________________________________

        From: [EMAIL PROTECTED]
[mailto:[EMAIL PROTECTED] On Behalf Of Anatoly
Greenblatt
        Sent: Wednesday, July 30, 2008 2:35 PM
        To: [email protected]
        Subject: [ofw] ConnectX functionality is completely broken
        Importance: High
        
        

        Hi,

         

        I've compiled svn rev 1450 and tried to install on ws2k3/ws2k8
x64 with connect x HCA. The BSOD occurs during the installation. The
last revision we tested was 1421. I've recompiled few revisions and
found that changes between rev 1434 and 1435 cause the bugheck.

         

        Bottom line - connect x functionality is broken in revision
1435.

         

        Regards,

        Anatoly.

         

Attachment: event_notify.patch
Description: event_notify.patch

_______________________________________________
ofw mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw

Reply via email to