Segmentation Fault when accessing QPN of a RC Queue pair

2011-03-23 Thread Konstantin Boyanov

Hello list,

I have a very peculiar problem with a simple piece of code of mine. For the
last week I have been trying to get a simple RC-based program to run and
exchange a single message between two peers, but right now I am stuck
with a segmentation fault when accessing the QPN of one of the QPs. It is a
peculiar problem because I create two identical QPs (one on the sending
peer and one on the receiving peer), and when I access the fields of the
one queue pair structure I get no problems, but when I do this on the
other I get a segmentation fault.


I am trying to create a reliable connection between two QPs residing on
the same HCA and then send messages via the loopback mechanism.


I first open the HCA and init the corresponding context with it:


CODE
static void gpeIBopenDev(ib_thread_info *ibthr, int verbose){

   struct ibv_device **infband_dev_list;
   struct ibv_device_attr ibdev_attr;
   int ret;

   infband_dev_list = ibv_get_device_list(NULL);
   if(!infband_dev_list)
   perror(ibv_get_device_list);

   if(infband_dev_list[0] != NULL){
   ibthr-ibdev = infband_dev_list[0];
   }else
   printf(Error: No IB device found!\n);

   ibthr-ibctx = ibv_open_device(ibthr-ibdev);
   if(!ibthr-ibctx)
   perror(ibv_open_device);
}
/CODE


Then I allocate the memory buffers, create a protection domain to be
associated with the QPs, and at last create the memory regions which the
QPs will be using:



CODE
static void gpeIBinitMemory(ib_thread_info *ibthr, bench_args_t *barg){

 static long int pg_sz;
 pg_sz = sysconf(_SC_PAGESIZE);

 if (barg-conn_type_arg==CONN_TYPE_UD) {
   ibthr-buf = memalign(pg_sz, ( barg-tx_byte_sz_arg + 40 ) * 2);
   if (!ibthr-buf) {
 printf(Could not allocate buffer.\n);
 perror(memalign);
 exit(1);
   }
   memset(ibthr-buf, 0, ( barg-tx_byte_sz_arg + 40 ) * 2);
 } else {
   ibthr-buf = memalign(pg_sz, barg-tx_byte_sz_arg*2);
   if (!ibthr-buf) {
 printf(Could not allocate buffer.\n);
 perror(memalign);
 exit(1);
   }
   memset(ibthr-buf, 0, barg-tx_byte_sz_arg*2);
 }

 ibthr-ib_prot_domain =  ibv_alloc_pd(ibthr-ibctx);
 if(!ibthr-ib_prot_domain){
 perror(ibv_alloc_pd);
 exit(1);
 }

 if(barg-verbose == 1)
 printf(Initialize the inbound and outbound context buffers \n);

 ibthr-out_data = (char *)ibthr-buf + (barg-tx_byte_sz_arg-1);
 ibthr-in_data  = (char *)ibthr-buf + (barg-tx_byte_sz_arg-1)*2;

 if(barg-verbose == 1)
 printf(initialize memory region (MR)\n);

 if (barg-conn_type_arg==CONN_TYPE_UD) {
   ibthr-mr = ibv_reg_mr(ibthr-ib_prot_domain, ibthr-buf, 
(barg-tx_byte_sz_arg+40)*2,

 IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
 }else{
   ibthr-mr = ibv_reg_mr(ibthr-ib_prot_domain, ibthr-buf, 
(barg-tx_byte_sz_arg)*2,

 IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
 }
 if(!ibthr-mr)
   perror(ibv_reg_mr);
}
/CODE


After this has been done successfully, I create and initialize the
completion channels for this QP context's Queue Pair, and create the
uninitialized Queue Pairs themselves.



CODE
static void gpeIBcreateQP(ib_thread_info *ibthr, bench_args_t *barg){

   struct ibv_qp_init_attr qp_init_attr;
   memset(qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));

   // fill ibqp_ini_attr with needed values
   qp_init_attr.cap.max_send_sge = 0;
   qp_init_attr.cap.max_recv_sge = 0;
   qp_init_attr.cap.max_send_wr = barg-tx_dpth_arg;
   qp_init_attr.cap.max_recv_wr = barg-tx_dpth_arg;

   qp_init_attr.sq_sig_all = 0;

   switch (barg-conn_type_arg) {
 case CONN_TYPE_RC :
 qp_init_attr.qp_type = IBV_QPT_RC;
 break;
 case CONN_TYPE_UC :
 qp_init_attr.qp_type = IBV_QPT_UC;
 break;
 case CONN_TYPE_UD :
 qp_init_attr.qp_type = IBV_QPT_UD;
 break;
 default:
 printf(Unknown connection type %d \n,barg-conn_type_arg);
 exit(1);
   }

   // First create an uninitialized instance of the Queue Pairs
   if(barg-verbose == 1)
   printf( Initialize completion channel \n);

   ibthr-ibcompl_ch = ibv_create_comp_channel(ibthr-ibctx);
   if(!ibthr-ibcompl_ch)
 perror(ibv_create_comp_channel);

   if(barg-verbose == 1)
 printf( Create the Completiotion Queues (CQs) \n);

   ibthr-send_cq = ibv_create_cq(ibthr-ibctx, barg-tx_dpth_arg, 
NULL, ibthr-ibcompl_ch, 0);

   if(!ibthr-send_cq)
 perror(ibv_create_cq);

   qp_init_attr.send_cq = ibthr-send_cq;

   ibthr-recv_cq = ibv_create_cq(ibthr-ibctx, barg-tx_dpth_arg, 
NULL, ibthr-ibcompl_ch, 0);

   if(!ibthr-recv_cq )
 perror(ibv_create_cq);

   qp_init_attr.recv_cq = ibthr-recv_cq;


   ibthr-qp = ibv_create_qp(ibthr-ib_prot_domain, qp_init_attr);
   if (!ibthr-qp) {
   perror(ibv_create_qp);
   fprintf(stderr, Couldn't create QP, %p\n, ibthr-qp);
   exit(1);
   }

   

Re: [opensm] routing segfault + LMC 0 routing bug?

2011-03-23 Thread Hal Rosenstock
On Tue, Mar 22, 2011 at 9:23 PM, Albert Chu ch...@llnl.gov wrote:
 Hey Jim, Alex,

 Just hit a segfault on the main tree.  It appears patch

 commit 9ddcf3419eade13bdc0a54f93930c49fe67efd63
 Author: Jim Schutt jasc...@sandia.gov
 Date:   Fri Sep 3 10:43:12 2010 -0600

 opensm: Avoid havoc in minhop caused by torus-2QoS persistent use of
 osm_port_t:priv.

 segfaults opensm on one of our systems w/ updn routing and lmc > 0
 (would likely segfault dor, minhop, and maybe others too).  Our system
 has older switches that do not support enhanced port zero, thus do not
 support LMC > 0.  (I imagine setting lmc_esp0 to FALSE, results in the
 same behavior.)  Subsequently even if you set LMC > 0 in your opensm
 config file, there can be ports with LMC = 0 and LMC != 0 (e.g. from
 HCAs). Subsequently in alloc_ports_priv(), some ports will have priv set
 to NULL and some will not.  Because of assumptions in osm_switch.c about
 priv != NULL when lmc > 0, we hit a segfault.  The issue didn't exist
 before b/c we allocated p_port->priv non-NULL no matter what.

 The attached patch fixes the problem w/ updn.  I haven't looked through
 all of the 2Qos code thoroughly to figure out the consequences of this
 change, so I'm just considering this a starting point for discussion.

 In addition, with the possibility that SP0 ports will be LMC = 0, this
 code in osm_ucast_mgr.c ucast_mgr_process_tbl() does not look good.

 lids_per_port = 1  p_mgr-p_subn-opt.lmc;
 for (i = 0; i  lids_per_port; i++) {
     cl_qlist_t *list = p_mgr-port_order_list;
     cl_list_item_t *item;
     for (item = cl_qlist_head(list); item != cl_qlist_end(list);
          item = cl_qlist_next(item)) {
          osm_port_t *port = cl_item_obj(item, port, list_item);
          ucast_mgr_process_port(p_mgr, p_sw, port, i);
     }
 }

 It iterates over all ports with the configured LMC, not the LMC of the
 port?

Yes, base SP0 is always LMC 0 and either LMC of port or perhaps 0 when
base SP0 and configured LMC otherwise (assuming it's an endport) could
be used for such loops. There used to be cases that used the latter
approach. I'm not sure which is more appropriate now.

-- Hal

 I haven't thought about this too deeply or investigated deeply,
 so consider this another starting point for discussion.

 Al

 --
 Albert Chu
 ch...@llnl.gov
 Computer Scientist
 High Performance Systems Division
 Lawrence Livermore National Laboratory

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [opensm] routing segfault + LMC 0 routing bug?

2011-03-23 Thread Jim Schutt

Hi Al,

Albert Chu wrote:

Hey Jim, Alex,

Just hit a segfault on the main tree.  It appears patch 


commit 9ddcf3419eade13bdc0a54f93930c49fe67efd63
Author: Jim Schutt jasc...@sandia.gov
Date:   Fri Sep 3 10:43:12 2010 -0600

opensm: Avoid havoc in minhop caused by torus-2QoS persistent use of
osm_port_t:priv.

segfaults opensm on one of our systems w/ updn routing and lmc > 0
(would likely segfault dor, minhop, and maybe others too).  Our system
has older switches that do not support enhanced port zero, thus do not
support LMC > 0.  (I imagine setting lmc_esp0 to FALSE, results in the
same behavior.)  Subsequently even if you set LMC > 0 in your opensm
config file, there can be ports with LMC = 0 and LMC != 0 (e.g. from
HCAs). Subsequently in alloc_ports_priv(), some ports will have priv set
to NULL and some will not.  Because of assumptions in osm_switch.c about
priv != NULL when lmc > 0, we hit a segfault.  The issue didn't exist
before b/c we allocated p_port->priv non-NULL no matter what.


OK, I think I see.  But this segfault can only occur in
the case where LMC is configured > 0, right?

The issue is in osm_switch_recommend_path() when
routing_for_lmc is true, but p_port->priv is NULL, right?



The attached patch fixes the problem w/ updn.  I haven't looked through
all of the 2Qos code thoroughly to figure out the consequences of this
change, so I'm just considering this a starting point for discussion.


Torus-2QoS's use of port->priv is unique because it persists
between routing sweeps.  So if another routing engine runs
after torus-2QoS and uses port->priv without having ensured
that it set it itself, there will be trouble.  9ddcf3419ea
was fixing such an issue.

I can find only two calls of osm_switch_recommend_path(),
and both seem to do the right thing, so I think
your patch is OK.



In addition, with the possibility that SP0 ports will be LMC = 0, this
code in osm_ucast_mgr.c ucast_mgr_process_tbl() does not look good.

lids_per_port = 1  p_mgr-p_subn-opt.lmc;
for (i = 0; i  lids_per_port; i++) {
 cl_qlist_t *list = p_mgr-port_order_list;
 cl_list_item_t *item;
 for (item = cl_qlist_head(list); item != cl_qlist_end(list);
  item = cl_qlist_next(item)) {
  osm_port_t *port = cl_item_obj(item, port, list_item);
  ucast_mgr_process_port(p_mgr, p_sw, port, i);
 }
}

It iterates over all ports with the configured LMC, not the LMC of the
port?  I haven't thought about this too deeply or investigated deeply,
so consider this another starting point for discussion.


Hmm, looks like ucast_mgr_process_port() DTRT, though;
it ignores lids that aren't in the range configured on
the port?



Al





Subject:
[PATCH] fix segfault corner case w/ updn routing and LMC > 0
From:
Albert L.Chu ch...@llnl.gov
Date:
Tue, 22 Mar 2011 17:36:16 -0700


Signed-off-by: Albert L. Chu ch...@llnl.gov


Reviewed-by: Jim Schutt jasc...@sandia.gov

-- Jim


---
 opensm/osm_ucast_mgr.c |4 
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/opensm/osm_ucast_mgr.c b/opensm/osm_ucast_mgr.c
index 4019589..211d6e0 100644
--- a/opensm/osm_ucast_mgr.c
+++ b/opensm/osm_ucast_mgr.c
@@ -318,10 +318,6 @@ static void alloc_ports_priv(osm_ucast_mgr_t * mgr)
 item = cl_qmap_next(item)) {
port = (osm_port_t *) item;
lmc = ib_port_info_get_lmc(port-p_physp-port_info);
-   if (!lmc) {
-   port-priv = NULL;
-   continue;
-   }
r = malloc(sizeof(*r) + sizeof(r-guids[0]) * (1  lmc));
if (!r) {
OSM_LOG(mgr-p_log, OSM_LOG_ERROR, ERR 3A09: 


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch] mlx4: make mcg entry size a module parameter

2011-03-23 Thread Doug Ledford
We ran across a problem at a customer's site where the qp array in the
mcg entry was filling up and denying further qp attaches.  In
addition, the upcoming SRIOV support is expected to increase use of this
array as well for unicast address steering (or so stated the comments in
Yevgeny's original patch that statically increased the size of the
mcg entries).  But, since increasing the size of this to an
arbitrarily large number is just a waste of memory, and since we don't
know that 0x200 will be large enough for all use cases, make the option
a module parameter instead.  This has been tested at our customer's site
and solves their problem.

commit ff608ce370b49d2e5b614ff91f4e23b5deaac8a4
Author: Doug Ledford dledf...@redhat.com
Date:   Wed Mar 23 12:20:47 2011 -0400

mlx4: make the size of the mcg entry a module parameter

Testing showed that the default size of 0x100 could be overrun.
Bumping to 0x200 would fix the problem, but only until we hit
the cap again, and at the expense of making memory consumption in
all scenarios worse.  So, make the size of the mcg entry a module
parameter and let those people who need to bump the size do so.

Signed-off-by: Doug Ledford dledf...@redhat.com

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index c7a6213..322f0af 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -158,7 +158,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props-masked_atomic_cap   = IB_ATOMIC_HCA;
props-max_pkeys   = dev-dev-caps.pkey_table_len[1];
props-max_mcast_grp   = dev-dev-caps.num_mgms + 
dev-dev-caps.num_amgms;
-   props-max_mcast_qp_attach = dev-dev-caps.num_qp_per_mgm;
+   props-max_mcast_qp_attach = dev-dev-caps.num_qp_per_mcg;
props-max_total_mcast_qp_attach = props-max_mcast_qp_attach *
   props-max_mcast_grp;
props-max_map_per_fmr = (1  (32 - ilog2(dev-dev-caps.num_mpts))) - 
1;
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 5de1db8..800cb2d 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -434,8 +434,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap)
 dev_cap-reserved_mrws, dev_cap-reserved_mtts);
mlx4_dbg(dev, Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n,
 dev_cap-max_pds, dev_cap-reserved_pds, 
dev_cap-reserved_uars);
-   mlx4_dbg(dev, Max QP/MCG: %d, reserved MGMs: %d\n,
-dev_cap-max_pds, dev_cap-reserved_mgms);
+   mlx4_dbg(dev, Max QP/MCG: %d, Max MCGs: %d, reserved MGMs: %d\n,
+dev_cap-max_qp_per_mcg, dev_cap-max_mcgs, 
dev_cap-reserved_mgms);
mlx4_dbg(dev, Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n,
 dev_cap-max_cq_sz, dev_cap-max_qp_sz, dev_cap-max_srq_sz);
mlx4_dbg(dev, Local CA ACK delay: %d, max MTU: %d, port width cap: 
%d\n,
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 2765a3c..5306141 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -101,6 +101,10 @@ module_param_named(use_prio, use_prio, bool, 0444);
 MODULE_PARM_DESC(use_prio, Enable steering by VLAN priority on ETH ports 
  (0/1, default 0));
 
+static int log_mcg_size = 8;
+module_param_named(log_mcg_size, log_mcg_size, int, 0444);
+MODULE_PARM_DESC(log_mcg_size, Log2 size of MCG struct (8-11));
+
 static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG);
 module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
 MODULE_PARM_DESC(log_mtts_per_seg, Log2 number of MTT entries per segment 
(1-7));
@@ -203,7 +207,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap)
dev-caps.reserved_srqs  = dev_cap-reserved_srqs;
dev-caps.max_sq_desc_sz = dev_cap-max_sq_desc_sz;
dev-caps.max_rq_desc_sz = dev_cap-max_rq_desc_sz;
-   dev-caps.num_qp_per_mgm = MLX4_QP_PER_MGM;
+   dev-caps.max_qp_per_mcg = dev_cap-max_qp_per_mcg;
+   dev-caps.max_mcgs   = dev_cap-max_mcgs;
+   i = 0;
+   do {
+   dev-caps.mcg_entry_size = 1  (log_mcg_size - i++);
+   dev-caps.num_qp_per_mcg = 4 * (dev-caps.mcg_entry_size / 
16 - 2);
+   } while (dev-caps.num_qp_per_mcg  dev-caps.max_qp_per_mcg);
+
/*
 * Subtract 1 from the limit because we need to allocate a
 * spare CQE so the HCA HW can tell the difference between an
@@ -642,7 +653,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap,
 * and it's a lot easier than trying to track ref counts.
 */
err = mlx4_init_icm_table(dev, priv-mcg_table.table,
- init_hca-mc_base, MLX4_MGM_ENTRY_SIZE,
+ init_hca-mc_base, dev-caps.mcg_entry_size,

RE: Segmentation Fault when accessing QPN of a RC Queue pair

2011-03-23 Thread Hefty, Sean
 I have little to no idea why the one causes a SEGFAULT and the other
 not, after all I have created and initialized them in the same way. I am
 really getting frustrated by this, I spent the past 3 days looking for
 some cause but to no avail. If I omit the line where the SEGFAULT
 occurs I get another later when trying to get the QP to the RTS state,
 so something is definitely wrong, but right now I don't have the
 slightest clue what. So please, if someone has some idea what I might be
 doing wrong, share it with me. I am really starting to hate this code,
 as there is no comprehensible description as to what are the
 requirements to create a QP for the different transport types, and also
 no examples (at least I didn't find any).

Both libibverbs and librdmacm provide sample programs, and you could also look 
at some of the perftest samples.  The simplest of these for RC QPs is probably 
the rdma_server/rdma_client samples with the librdmacm, but these make use of 
newer APIs.  For UC QPs, you would need to look at libibverbs.  UD QPs are 
setup differently, but both libibverbs and librdmacm have samples.

There wasn't enough context provided in the code snippet for me to see the 
cause of the crash.

- Sean
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [opensm] routing segfault + LMC 0 routing bug?

2011-03-23 Thread Albert Chu
Hey Jim,

On Wed, 2011-03-23 at 09:01 -0700, Jim Schutt wrote:
 Hi Al,
 
 Albert Chu wrote:
  Hey Jim, Alex,
  
  Just hit a segfault on the main tree.  It appears patch 
  
  commit 9ddcf3419eade13bdc0a54f93930c49fe67efd63
  Author: Jim Schutt jasc...@sandia.gov
  Date:   Fri Sep 3 10:43:12 2010 -0600
  
  opensm: Avoid havoc in minhop caused by torus-2QoS persistent use of
  osm_port_t:priv.
  
  segfaults opensm on one of our systems w/ updn routing and lmc > 0
  (would likely segfault dor, minhop, and maybe others too).  Our system
  has older switches that do not support enhanced port zero, thus do not
  support LMC > 0.  (I imagine setting lmc_esp0 to FALSE, results in the
  same behavior.)  Subsequently even if you set LMC > 0 in your opensm
  config file, there can be ports with LMC = 0 and LMC != 0 (e.g. from
  HCAs). Subsequently in alloc_ports_priv(), some ports will have priv set
  to NULL and some will not.  Because of assumptions in osm_switch.c about
  priv != NULL when lmc > 0, we hit a segfault.  The issue didn't exist
  before b/c we allocated p_port->priv non-NULL no matter what.
 
 OK, I think I see.  But this segfault can only occur in
 the case where LMC is configured  0, right?

 The issue is in osm_switch_recommend_path() when
 routing_for_lmc is true, but p_port-priv is NULL, right?

Yup.

  
  The attached patch fixes the problem w/ updn.  I haven't looked through
  all of the 2Qos code thoroughly to figure out the consequences of this
  change, so I'm just considering this a starting point for discussion.
 
 Torus-2QoS's use of port-priv is unique because it persists
 between routing sweeps.  So if another routing engine runs
 after torus-2QoS and uses port-priv without having ensured
 that it set it itself, there will be trouble.  9ddcf3419ea
 was fixing such an issue.
 
 I can find only two calls of osm_switch_recommend_path(),
 and both seem to be to do the right thing, so I think
 your patch is OK.

Sounds good.  When reading over your comments about the 2Qos patches
that affected this area, I wasn't quite sure how you were dealing with
the p_port->priv, so I was unsure how my patch would affect things.

  
  In addition, with the possibility that SP0 ports will be LMC = 0, this
  code in osm_ucast_mgr.c ucast_mgr_process_tbl() does not look good.
  
  lids_per_port = 1  p_mgr-p_subn-opt.lmc;
  for (i = 0; i  lids_per_port; i++) {
   cl_qlist_t *list = p_mgr-port_order_list;
   cl_list_item_t *item;
   for (item = cl_qlist_head(list); item != cl_qlist_end(list);
item = cl_qlist_next(item)) {
osm_port_t *port = cl_item_obj(item, port, list_item);
ucast_mgr_process_port(p_mgr, p_sw, port, i);
   }
  }
  
  It iterates over all ports with the configured LMC, not the LMC of the
  port?  I haven't thought about this too deeply or investigated deeply,
  so consider this another starting point for discussion.
 
 Hmm, looks like ucast_mgr_process_port() DTRT, though;
 it ignores lids that aren't in the range configured on
 the port?

Ahh, I think you're right.  It does appear to do the right thing there.
I don't think it's a problem after all.

Al

  
  Al
  
  
  
  
  
  Subject:
  [PATCH] fix segfault corner case w/ updn routing and LMC  0
  From:
  Albert L.Chu ch...@llnl.gov
  Date:
  Tue, 22 Mar 2011 17:36:16 -0700
  
  
  Signed-off-by: Albert L. Chu ch...@llnl.gov
 
 Reviewed-by: Jim Schutt jasc...@sandia.gov
 
 -- Jim
 
  ---
   opensm/osm_ucast_mgr.c |4 
   1 files changed, 0 insertions(+), 4 deletions(-)
  
  diff --git a/opensm/osm_ucast_mgr.c b/opensm/osm_ucast_mgr.c
  index 4019589..211d6e0 100644
  --- a/opensm/osm_ucast_mgr.c
  +++ b/opensm/osm_ucast_mgr.c
  @@ -318,10 +318,6 @@ static void alloc_ports_priv(osm_ucast_mgr_t * mgr)
   item = cl_qmap_next(item)) {
  port = (osm_port_t *) item;
  lmc = ib_port_info_get_lmc(port-p_physp-port_info);
  -   if (!lmc) {
  -   port-priv = NULL;
  -   continue;
  -   }
  r = malloc(sizeof(*r) + sizeof(r-guids[0]) * (1  lmc));
  if (!r) {
  OSM_LOG(mgr-p_log, OSM_LOG_ERROR, ERR 3A09: 
 
-- 
Albert Chu
ch...@llnl.gov
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] please pull infiniband.git

2011-03-23 Thread Roland Dreier
Linus, please pull from

master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git for-linus

This tree is also available from kernel.org mirrors at:

git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git 
for-linus

This will get the second batch of 2.6.39 changes:
 - Dave Dillow's SRP initiator changes
 - Fix to uninitialized return value introduced by net tree merge
 - A couple of other minor cleanups

David Dillow (7):
  IB/srp: always avoid non-zero offsets into an FMR
  IB/srp: move IB CM setup completion into its own function
  IB/srp: allow sg_tablesize to be set for each target
  IB/srp: rework mapping engine to use multiple FMR entries
  IB/srp: add support for indirect tables that don't fit in SRP_CMD
  IB/srp: try to use larger FMR sizes to cover our mappings
  IB: Increase DMA max_segment_size on Mellanox hardware

Michael Heinz (1):
  IB/mad: Improve an error message so error code is included

Roland Dreier (3):
  RDMA/nes: Don't print success message at level KERN_ERR
  Merge branch 'external-indirect' of 
git://git.kernel.org/.../dad/srp-initiator into srp
  Merge branches 'misc', 'nes' and 'srp' into for-next

Sean Hefty (1):
  RDMA/addr: Fix return of uninitialized ret value

 drivers/infiniband/core/addr.c   |2 +-
 drivers/infiniband/core/agent.c  |3 +-
 drivers/infiniband/hw/mthca/mthca_main.c |3 +
 drivers/infiniband/hw/nes/nes.c  |2 +-
 drivers/infiniband/ulp/srp/ib_srp.c  |  725 --
 drivers/infiniband/ulp/srp/ib_srp.h  |   38 ++-
 6 files changed, 521 insertions(+), 252 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [opensm] routing segfault + LMC 0 routing bug?

2011-03-23 Thread Jim Schutt

Albert Chu wrote:

Hey Jim,

On Wed, 2011-03-23 at 09:01 -0700, Jim Schutt wrote:

Hi Al,




Torus-2QoS's use of port-priv is unique because it persists
between routing sweeps.  So if another routing engine runs
after torus-2QoS and uses port-priv without having ensured
that it set it itself, there will be trouble.  9ddcf3419ea
was fixing such an issue.

I can find only two calls of osm_switch_recommend_path(),
and both seem to be to do the right thing, so I think
your patch is OK.


Sounds good.  When reading over your comments about the 2Qos patches
that affected this area, I wasn't quite sure how you were dealing with
the p_port-priv, so I was unsure how my patch would affect things.



There are some comments at the top of osm_torus.c,
in the definitions of struct endpoint and struct t_switch,
that describe the rules for how torus-2QoS code can
safely use ->priv.

They may shed some extra light...

-- Jim

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT PULL] please pull infiniband.git

2011-03-23 Thread Roland Dreier
On Wed, Mar 23, 2011 at 10:47 AM, Roland Dreier rol...@kernel.org wrote:
 drivers/infiniband/core/addr.c           |    2 +-
  drivers/infiniband/core/agent.c          |    3 +-
  drivers/infiniband/hw/mthca/mthca_main.c |    3 +
  drivers/infiniband/hw/nes/nes.c          |    2 +-
  drivers/infiniband/ulp/srp/ib_srp.c      |  725 
 --
  drivers/infiniband/ulp/srp/ib_srp.h      |   38 ++-
  6 files changed, 521 insertions(+), 252 deletions(-)

Somehow I missed a hunk of

 IB: Increase DMA max_segment_size on Mellanox hardware

diffstat now should look like

 drivers/infiniband/core/addr.c   |2 +-
 drivers/infiniband/core/agent.c  |3 +-
 drivers/infiniband/hw/mthca/mthca_main.c |3 +
 drivers/infiniband/hw/nes/nes.c  |2 +-
 drivers/infiniband/ulp/srp/ib_srp.c  |  725 --
 drivers/infiniband/ulp/srp/ib_srp.h  |   38 ++-
 drivers/net/mlx4/main.c  |3 +
 7 files changed, 524 insertions(+), 252 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [opensm] RFC: new routing options (repost)

2011-03-23 Thread Albert Chu
Hi Alex,

As discussed in a private thread, here are the patches again, with some
tweaks.  Most notably, the tweak ensures that the remote_guid_sorting
option is independent of port_shifting, so users may enable either,
none, or both options at their discretion.

Al

On Thu, 2011-02-10 at 17:33 -0800, Albert Chu wrote:
 [This is a repost from Oct 2010 with rebased patches]
 
 We recently got a new cluster and I've been experimenting with some
 routing changes to improve the average bandwidth of the cluster.  They
 are attached as patches with description of the routing goals below.
 
 We're using mpiGraph (http://sourceforge.net/projects/mpigraph/) to
 measure min, peak, and average send/recv bandwidth across the cluster.
 What we found with the original updn routing was an average of around
 420 MB/s send bandwidth and 508 MB/s recv bandwidth.  The following two
 patches were able to get the average send bandwidth up to 1045 MB/s and
 recv bandwidth up to 1228 MB/s.
 
 I'm sure this is only round 1 of the patches and I'm looking for
 comments.  Many areas could be cleaned up w/ some rearchitecture, but I
 elected to implement the most non-invasive implementation first.  I'm
 also open to name changes on the options.
 
 1) Port Shifting
 
 This is similar to what was done with some of the LMC > 0 code.
 Congestion would occur due to alignment of routes w/ common traffic
 patterns.  However, we found that it was also necessary for LMC=0 and
 only for used-ports.  For example, let's say there are 4 ports (called A,
 B, C, D) and we are routing lids 1-9 through them.  Suppose only routing
 through A, B, and C will reach lids 1-9.
 
 The LFT would normally be:
 
 A: 1 4 7
 B: 2 5 8
 C: 3 6 9
 D:
 
 The Port Shifting option would make this:
 
 A: 1 6 8
 B: 2 4 9
 C: 3 5 7
 D:
 
 This option by itself improved the mpiGraph average send/recv bandwidth
 from 420 MB/s and 508 MB/s to 991 MB/s and 1172 MB/s.
 
 2) Remote Guid Sorting
 
 Most core/spine switches we've seen thus far have had line boards
 connected to spine boards in a consistent pattern.  However, we recently
 got some Qlogic switches that connect from line/leaf boards to spine
 boards in a (to the casual observer) random pattern.  I'm sure there was
 a good electrical/board reason for this design, but it does hurt routing
 b/c updn doesn't account for this.  Here's an output from iblinkinfo as
 an example.
 
 Switch 0x00066a00ec0029b8 ibcore1 L123:
  1801[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 254   19[  ] 
 ibsw55 ( )
  1802[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 253   19[  ] 
 ibsw56 ( )
  1803[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 258   19[  ] 
 ibsw57 ( )
  1804[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 257   19[  ] 
 ibsw58 ( )
  1805[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 256   19[  ] 
 ibsw59 ( )
  1806[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 255   19[  ] 
 ibsw60 ( )
  1807[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 261   19[  ] 
 ibsw61 ( )
  1808[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 262   19[  ] 
 ibsw62 ( )
  1809[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 260   19[  ] 
 ibsw63 ( )
  180   10[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 259   19[  ] 
 ibsw64 ( )
  180   11[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 284   19[  ] 
 ibsw65 ( )
  180   12[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 285   19[  ] 
 ibsw66 ( )
  180   13[  ] ==( 4X 10.0 Gbps Active/  LinkUp)==2227   19[  ] 
 ibsw67 ( )
  180   14[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 283   19[  ] 
 ibsw68 ( )
  180   15[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 267   19[  ] 
 ibsw69 ( )
  180   16[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 270   19[  ] 
 ibsw70 ( )
  180   17[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 269   19[  ] 
 ibsw71 ( )
  180   18[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 268   19[  ] 
 ibsw72 ( )
  180   19[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 222   17[  ] 
 ibcore1 S117B ( )
  180   20[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 209   19[  ] 
 ibcore1 S211B ( )
  180   21[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 218   21[  ] 
 ibcore1 S117A ( )
  180   22[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 192   23[  ] 
 ibcore1 S215B ( )
  180   23[  ] ==( 4X 10.0 Gbps Active/  LinkUp)==  85   15[  ] 
 ibcore1 S209A ( )
  180   24[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 182   13[  ] 
 ibcore1 S215A ( )
  180   25[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 200   11[  ] 
 ibcore1 S115B ( )
  180   26[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 129   25[  ] 
 ibcore1 S209B ( )
  180   27[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 213   27[  ] 
 ibcore1 S115A ( )
  180   28[  ] ==( 4X 10.0 Gbps Active/  LinkUp)== 197   29[  ] 
 

[PATCH] rdma/ucm: Fix race in ucma_create_id

2011-03-23 Thread Hefty, Sean
The following problem was reported by Roland Dreier based
on code inspection:

I do see what seems like an exploitable race in
ucma_create_id():

- one thread creates an id with an invalid userspace
  pointer (so the copy_to_user in ucma_create_id
  returns -EFAULT and calls rdma_destroy_id before
  idr_remove)
- another thread guesses the id that is going to be
  returned and calls ucma_destroy_id()

if the second thread hits the window where the cm_id is
destroyed but the ctx is still in the idr, it can
trigger a double free.

There is an issue here that the second thread can try to use
the new rdma_cm_id in another call after it has been destroyed.
(The problem isn't just restricted to the second thread calling
ucma_destroy_id.)  Fix this by holding the file->mut around
the entire creation process, so another thread cannot find the
id until it has been fully created.

Signed-off-by: Sean Hefty sean.he...@intel.com
---
 drivers/infiniband/core/ucma.c |   17 +
 1 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index ca12acf..fd6b980 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -384,31 +384,32 @@ static ssize_t ucma_create_id(struct ucma_file *file,
 
mutex_lock(file-mut);
ctx = ucma_alloc_ctx(file);
-   mutex_unlock(file-mut);
-   if (!ctx)
+   if (!ctx) {
+   mutex_unlock(file-mut);
return -ENOMEM;
+   }
 
ctx-uid = cmd.uid;
ctx-cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
if (IS_ERR(ctx-cm_id)) {
ret = PTR_ERR(ctx-cm_id);
-   goto err1;
+   goto err;
}
 
resp.id = ctx-id;
if (copy_to_user((void __user *)(unsigned long)cmd.response,
 resp, sizeof(resp))) {
ret = -EFAULT;
-   goto err2;
+   goto err;
}
+   mutex_unlock(file-mut);
return 0;
 
-err2:
-   rdma_destroy_id(ctx-cm_id);
-err1:
-   mutex_lock(mut);
+err:
idr_remove(ctx_idr, ctx-id);
mutex_unlock(mut);
+   if (!IS_ERR(ctx-cm_id))
+   rdma_destroy_id(ctx-cm_id);
kfree(ctx);
return ret;
 }


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] rdma/ucm: Fix race in ucma_create_id

2011-03-23 Thread Hefty, Sean
ignore this - there are 2 locks being used in ucma_create_id

I will resubmit


 diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
 index ca12acf..fd6b980 100644
 --- a/drivers/infiniband/core/ucma.c
 +++ b/drivers/infiniband/core/ucma.c
 @@ -384,31 +384,32 @@ static ssize_t ucma_create_id(struct ucma_file *file,
 
   mutex_lock(file-mut);
   ctx = ucma_alloc_ctx(file);
 - mutex_unlock(file-mut);
 - if (!ctx)
 + if (!ctx) {
 + mutex_unlock(file-mut);
   return -ENOMEM;
 + }
 
   ctx-uid = cmd.uid;
   ctx-cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps);
   if (IS_ERR(ctx-cm_id)) {
   ret = PTR_ERR(ctx-cm_id);
 - goto err1;
 + goto err;
   }
 
   resp.id = ctx-id;
   if (copy_to_user((void __user *)(unsigned long)cmd.response,
resp, sizeof(resp))) {
   ret = -EFAULT;
 - goto err2;
 + goto err;
   }
 + mutex_unlock(file-mut);
   return 0;
 
 -err2:
 - rdma_destroy_id(ctx-cm_id);
 -err1:
 - mutex_lock(mut);
 +err:
   idr_remove(ctx_idr, ctx-id);
   mutex_unlock(mut);
 + if (!IS_ERR(ctx-cm_id))
 + rdma_destroy_id(ctx-cm_id);
   kfree(ctx);
   return ret;
  }
 
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html