Segmentation Fault when accessing QPN of a RC Queue pair
Hello list, I have a very peculiar problem with a simple code of mine. In the last week I am trying to bring up a simple RC-based programm to run and exchange a single message between two peers, but right now I am stuck with a segmentation fault when accessing one of the QPs QPN. It is a peculiar problem because I create two identical QPs (one on the sending peer and one on the receiving peer), and when I access the fields of the one queue pair structure I got no problems, but when I do this on the other I get a segmentation fault. I am trying to create a reliable connection between two QPs residing on the same HCA and the send messages via the loopback mechanism. I first open the HCA and init the corresponding context with it: CODE static void gpeIBopenDev(ib_thread_info *ibthr, int verbose){ struct ibv_device **infband_dev_list; struct ibv_device_attr ibdev_attr; int ret; infband_dev_list = ibv_get_device_list(NULL); if(!infband_dev_list) perror(ibv_get_device_list); if(infband_dev_list[0] != NULL){ ibthr-ibdev = infband_dev_list[0]; }else printf(Error: No IB device found!\n); ibthr-ibctx = ibv_open_device(ibthr-ibdev); if(!ibthr-ibctx) perror(ibv_open_device); } /CODE Then I allocate memory buffers and create protection domains, create a protection domain to be associated with the QPs an at last create the memory regions which the QPs will be using: CODE static void gpeIBinitMemory(ib_thread_info *ibthr, bench_args_t *barg){ static long int pg_sz; pg_sz = sysconf(_SC_PAGESIZE); if (barg-conn_type_arg==CONN_TYPE_UD) { ibthr-buf = memalign(pg_sz, ( barg-tx_byte_sz_arg + 40 ) * 2); if (!ibthr-buf) { printf(Could not allocate buffer.\n); perror(memalign); exit(1); } memset(ibthr-buf, 0, ( barg-tx_byte_sz_arg + 40 ) * 2); } else { ibthr-buf = memalign(pg_sz, barg-tx_byte_sz_arg*2); if (!ibthr-buf) { printf(Could not allocate buffer.\n); perror(memalign); exit(1); } memset(ibthr-buf, 0, barg-tx_byte_sz_arg*2); } ibthr-ib_prot_domain = ibv_alloc_pd(ibthr-ibctx); 
if(!ibthr-ib_prot_domain){ perror(ibv_alloc_pd); exit(1); } if(barg-verbose == 1) printf(Initialize the inbound and outbound context buffers \n); ibthr-out_data = (char *)ibthr-buf + (barg-tx_byte_sz_arg-1); ibthr-in_data = (char *)ibthr-buf + (barg-tx_byte_sz_arg-1)*2; if(barg-verbose == 1) printf(initialize memory region (MR)\n); if (barg-conn_type_arg==CONN_TYPE_UD) { ibthr-mr = ibv_reg_mr(ibthr-ib_prot_domain, ibthr-buf, (barg-tx_byte_sz_arg+40)*2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); }else{ ibthr-mr = ibv_reg_mr(ibthr-ib_prot_domain, ibthr-buf, (barg-tx_byte_sz_arg)*2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); } if(!ibthr-mr) perror(ibv_reg_mr); } /CODE After this being successfully done, I create and initialize the completition channels for this QP context's Queue Pair, create the uninitialized Queue Pairs themselves. CODE static void gpeIBcreateQP(ib_thread_info *ibthr, bench_args_t *barg){ struct ibv_qp_init_attr qp_init_attr; memset(qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); // fill ibqp_ini_attr with needed values qp_init_attr.cap.max_send_sge = 0; qp_init_attr.cap.max_recv_sge = 0; qp_init_attr.cap.max_send_wr = barg-tx_dpth_arg; qp_init_attr.cap.max_recv_wr = barg-tx_dpth_arg; qp_init_attr.sq_sig_all = 0; switch (barg-conn_type_arg) { case CONN_TYPE_RC : qp_init_attr.qp_type = IBV_QPT_RC; break; case CONN_TYPE_UC : qp_init_attr.qp_type = IBV_QPT_UC; break; case CONN_TYPE_UD : qp_init_attr.qp_type = IBV_QPT_UD; break; default: printf(Unknown connection type %d \n,barg-conn_type_arg); exit(1); } // First create an uninitialized instance of the Queue Pairs if(barg-verbose == 1) printf( Initialize completion channel \n); ibthr-ibcompl_ch = ibv_create_comp_channel(ibthr-ibctx); if(!ibthr-ibcompl_ch) perror(ibv_create_comp_channel); if(barg-verbose == 1) printf( Create the Completiotion Queues (CQs) \n); ibthr-send_cq = ibv_create_cq(ibthr-ibctx, barg-tx_dpth_arg, NULL, ibthr-ibcompl_ch, 0); if(!ibthr-send_cq) 
perror(ibv_create_cq); qp_init_attr.send_cq = ibthr-send_cq; ibthr-recv_cq = ibv_create_cq(ibthr-ibctx, barg-tx_dpth_arg, NULL, ibthr-ibcompl_ch, 0); if(!ibthr-recv_cq ) perror(ibv_create_cq); qp_init_attr.recv_cq = ibthr-recv_cq; ibthr-qp = ibv_create_qp(ibthr-ib_prot_domain, qp_init_attr); if (!ibthr-qp) { perror(ibv_create_qp); fprintf(stderr, Couldn't create QP, %p\n, ibthr-qp); exit(1); }
Re: [opensm] routing segfault + LMC 0 routing bug?
On Tue, Mar 22, 2011 at 9:23 PM, Albert Chu ch...@llnl.gov wrote: Hey Jim, Alex, Just hit a segfault on the main tree. It appears patch commit 9ddcf3419eade13bdc0a54f93930c49fe67efd63 Author: Jim Schutt jasc...@sandia.gov Date: Fri Sep 3 10:43:12 2010 -0600 opensm: Avoid havoc in minhop caused by torus-2QoS persistent use of osm_port_t:priv. segfaults opensm on one of our systems w/ updn routing and lmc 0 (would likely segfault dor, minhop, and maybe others too). Our system has older switches that do not support enhanced port zero, thus do not support LMC 0. (I imagine setting lmc_esp0 to FALSE, results in the same behavior.) Subsequently even if you set LMC 0 in your opensm config file, there can be ports with LMC = 0 and LMC != 0 (e.g. from HCAs). Subsequently in alloc_ports_priv(), some ports will have priv set to NULL and some will not. Because of assumptions in osm_switch.c about priv != NULL when lmc 0, we hit a segfault. The issue didn't exist before b/c we allocated p_port-priv non-NULL no matter what. The attached patch fixes the problem w/ updn. I haven't looked through all of the 2Qos code thoroughly to figure out the consequences of this change, so I'm just considering this a starting point for discussion. In addition, with the possibility that SP0 ports will be LMC = 0, this code in osm_ucast_mgr.c ucast_mgr_process_tbl() does not look good. lids_per_port = 1 p_mgr-p_subn-opt.lmc; for (i = 0; i lids_per_port; i++) { cl_qlist_t *list = p_mgr-port_order_list; cl_list_item_t *item; for (item = cl_qlist_head(list); item != cl_qlist_end(list); item = cl_qlist_next(item)) { osm_port_t *port = cl_item_obj(item, port, list_item); ucast_mgr_process_port(p_mgr, p_sw, port, i); } } It iterates over all ports with the configured LMC, not the LMC of the port? Yes, base SP0 is always LMC 0 and either LMC of port or perhaps 0 when base SP0 and configured LMC otherwise (assuming it's an endport) could be used for such loops. 
There used to be cases that used the latter approach. I'm not sure which is more appropriate now. -- Hal I haven't thought about this too deeply or investigated deeply, so consider this another starting point for discussion. Al -- Albert Chu ch...@llnl.gov Computer Scientist High Performance Systems Division Lawrence Livermore National Laboratory -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [opensm] routing segfault + LMC 0 routing bug?
Hi Al, Albert Chu wrote: Hey Jim, Alex, Just hit a segfault on the main tree. It appears patch commit 9ddcf3419eade13bdc0a54f93930c49fe67efd63 Author: Jim Schutt jasc...@sandia.gov Date: Fri Sep 3 10:43:12 2010 -0600 opensm: Avoid havoc in minhop caused by torus-2QoS persistent use of osm_port_t:priv. segfaults opensm on one of our systems w/ updn routing and lmc 0 (would likely segfault dor, minhop, and maybe others too). Our system has older switches that do not support enhanced port zero, thus do not support LMC 0. (I imagine setting lmc_esp0 to FALSE, results in the same behavior.) Subsequently even if you set LMC 0 in your opensm config file, there can be ports with LMC = 0 and LMC != 0 (e.g. from HCAs). Subsequently in alloc_ports_priv(), some ports will have priv set to NULL and some will not. Because of assumptions in osm_switch.c about priv != NULL when lmc 0, we hit a segfault. The issue didn't exist before b/c we allocated p_port-priv non-NULL no matter what. OK, I think I see. But this segfault can only occur in the case where LMC is configured 0, right? The issue is in osm_switch_recommend_path() when routing_for_lmc is true, but p_port-priv is NULL, right? The attached patch fixes the problem w/ updn. I haven't looked through all of the 2Qos code thoroughly to figure out the consequences of this change, so I'm just considering this a starting point for discussion. Torus-2QoS's use of port-priv is unique because it persists between routing sweeps. So if another routing engine runs after torus-2QoS and uses port-priv without having ensured that it set it itself, there will be trouble. 9ddcf3419ea was fixing such an issue. I can find only two calls of osm_switch_recommend_path(), and both seem to be to do the right thing, so I think your patch is OK. In addition, with the possibility that SP0 ports will be LMC = 0, this code in osm_ucast_mgr.c ucast_mgr_process_tbl() does not look good. 
lids_per_port = 1 p_mgr-p_subn-opt.lmc; for (i = 0; i lids_per_port; i++) { cl_qlist_t *list = p_mgr-port_order_list; cl_list_item_t *item; for (item = cl_qlist_head(list); item != cl_qlist_end(list); item = cl_qlist_next(item)) { osm_port_t *port = cl_item_obj(item, port, list_item); ucast_mgr_process_port(p_mgr, p_sw, port, i); } } It iterates over all ports with the configured LMC, not the LMC of the port? I haven't thought about this too deeply or investigated deeply, so consider this another starting point for discussion. Hmm, looks like ucast_mgr_process_port() DTRT, though; it ignores lids that aren't in the range configured on the port? Al Subject: [PATCH] fix segfault corner case w/ updn routing and LMC 0 From: Albert L.Chu ch...@llnl.gov Date: Tue, 22 Mar 2011 17:36:16 -0700 Signed-off-by: Albert L. Chu ch...@llnl.gov Reviewed-by: Jim Schutt jasc...@sandia.gov -- Jim --- opensm/osm_ucast_mgr.c |4 1 files changed, 0 insertions(+), 4 deletions(-) diff --git a/opensm/osm_ucast_mgr.c b/opensm/osm_ucast_mgr.c index 4019589..211d6e0 100644 --- a/opensm/osm_ucast_mgr.c +++ b/opensm/osm_ucast_mgr.c @@ -318,10 +318,6 @@ static void alloc_ports_priv(osm_ucast_mgr_t * mgr) item = cl_qmap_next(item)) { port = (osm_port_t *) item; lmc = ib_port_info_get_lmc(port-p_physp-port_info); - if (!lmc) { - port-priv = NULL; - continue; - } r = malloc(sizeof(*r) + sizeof(r-guids[0]) * (1 lmc)); if (!r) { OSM_LOG(mgr-p_log, OSM_LOG_ERROR, ERR 3A09: -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[Patch] mlx4: make mcg entry size a module parameter
We ran across a problem at a customer's site where the qp array in the mcg entry was filling up and denying further qp attaches. In addition, the upcoming SRIOV support is expected to increase use of this array as well for unicast address steering (or so the comments in Yevgeny's original patch that statically increased the size of the mcg entries stated). But, since increasing the size of this to an arbitrarily large number is just a waste of memory, and since we don't know that 0x200 will be large enough for all use cases, make the option a module parameter instead. This has been tested at our customer's site and solves their problem. commit ff608ce370b49d2e5b614ff91f4e23b5deaac8a4 Author: Doug Ledford dledf...@redhat.com Date: Wed Mar 23 12:20:47 2011 -0400 mlx4: make the size of the mcg entry a module parameter Testing showed that the default size of 0x100 could be overrun. Bumping to 0x200 would fix the problem, but only until we hit the cap again, and at the expense of making memory consumption in all scenarios worse. So, make the size of the mcg entry a module parameter and let those people who need to bump the size do so. 
Signed-off-by: Doug Ledford dledf...@redhat.com diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c7a6213..322f0af 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -158,7 +158,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props-masked_atomic_cap = IB_ATOMIC_HCA; props-max_pkeys = dev-dev-caps.pkey_table_len[1]; props-max_mcast_grp = dev-dev-caps.num_mgms + dev-dev-caps.num_amgms; - props-max_mcast_qp_attach = dev-dev-caps.num_qp_per_mgm; + props-max_mcast_qp_attach = dev-dev-caps.num_qp_per_mcg; props-max_total_mcast_qp_attach = props-max_mcast_qp_attach * props-max_mcast_grp; props-max_map_per_fmr = (1 (32 - ilog2(dev-dev-caps.num_mpts))) - 1; diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 5de1db8..800cb2d 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -434,8 +434,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap-reserved_mrws, dev_cap-reserved_mtts); mlx4_dbg(dev, Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n, dev_cap-max_pds, dev_cap-reserved_pds, dev_cap-reserved_uars); - mlx4_dbg(dev, Max QP/MCG: %d, reserved MGMs: %d\n, -dev_cap-max_pds, dev_cap-reserved_mgms); + mlx4_dbg(dev, Max QP/MCG: %d, Max MCGs: %d, reserved MGMs: %d\n, +dev_cap-max_qp_per_mcg, dev_cap-max_mcgs, dev_cap-reserved_mgms); mlx4_dbg(dev, Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n, dev_cap-max_cq_sz, dev_cap-max_qp_sz, dev_cap-max_srq_sz); mlx4_dbg(dev, Local CA ACK delay: %d, max MTU: %d, port width cap: %d\n, diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c index 2765a3c..5306141 100644 --- a/drivers/net/mlx4/main.c +++ b/drivers/net/mlx4/main.c @@ -101,6 +101,10 @@ module_param_named(use_prio, use_prio, bool, 0444); MODULE_PARM_DESC(use_prio, Enable steering by VLAN priority on ETH ports (0/1, default 0)); +static int log_mcg_size = 8; +module_param_named(log_mcg_size, log_mcg_size, int, 0444); 
+MODULE_PARM_DESC(log_mcg_size, Log2 size of MCG struct (8-11)); + static int log_mtts_per_seg = ilog2(MLX4_MTT_ENTRY_PER_SEG); module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444); MODULE_PARM_DESC(log_mtts_per_seg, Log2 number of MTT entries per segment (1-7)); @@ -203,7 +207,14 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev-caps.reserved_srqs = dev_cap-reserved_srqs; dev-caps.max_sq_desc_sz = dev_cap-max_sq_desc_sz; dev-caps.max_rq_desc_sz = dev_cap-max_rq_desc_sz; - dev-caps.num_qp_per_mgm = MLX4_QP_PER_MGM; + dev-caps.max_qp_per_mcg = dev_cap-max_qp_per_mcg; + dev-caps.max_mcgs = dev_cap-max_mcgs; + i = 0; + do { + dev-caps.mcg_entry_size = 1 (log_mcg_size - i++); + dev-caps.num_qp_per_mcg = 4 * (dev-caps.mcg_entry_size / 16 - 2); + } while (dev-caps.num_qp_per_mcg dev-caps.max_qp_per_mcg); + /* * Subtract 1 from the limit because we need to allocate a * spare CQE so the HCA HW can tell the difference between an @@ -642,7 +653,7 @@ static int mlx4_init_icm(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap, * and it's a lot easier than trying to track ref counts. */ err = mlx4_init_icm_table(dev, priv-mcg_table.table, - init_hca-mc_base, MLX4_MGM_ENTRY_SIZE, + init_hca-mc_base, dev-caps.mcg_entry_size,
RE: Segmentation Fault when accessing QPN of a RC Queue pair
I have little to no idea why the one causes a SEGFAULT and the other not, after all I have created and initialized them in the same way. I am really getting frustrated by this, I spent the past 3 days looking for some cause but to no avail. If I omit the line where the SEGFAULT occurs I get another later when trying to get the QP to the RTS state, so something is definitely wrong, but right now I don't have the slightest clue what. So please, if someone has some idea what I might be doing wrong, share it with me. I am really starting to hate this code, as there is no comprehensible description as to what are the requirements to create a QP for the different transport types, and also no examples (at least I didn't find any). Both libibverbs and librdmacm provide sample programs, and you could also look at some of the perftest samples. The simplest of these for RC QPs is probably the rdma_server/rdma_client samples with the librdmacm, but these make use of newer APIs. For UC QPs, you would need to look at libibverbs. UD QPs are setup differently, but both libibverbs and librdmacm have samples. There wasn't enough context provided in the code snippet for me to see the cause of the crash. - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [opensm] routing segfault + LMC 0 routing bug?
Hey Jim, On Wed, 2011-03-23 at 09:01 -0700, Jim Schutt wrote: Hi Al, Albert Chu wrote: Hey Jim, Alex, Just hit a segfault on the main tree. It appears patch commit 9ddcf3419eade13bdc0a54f93930c49fe67efd63 Author: Jim Schutt jasc...@sandia.gov Date: Fri Sep 3 10:43:12 2010 -0600 opensm: Avoid havoc in minhop caused by torus-2QoS persistent use of osm_port_t:priv. segfaults opensm on one of our systems w/ updn routing and lmc 0 (would likely segfault dor, minhop, and maybe others too). Our system has older switches that do not support enhanced port zero, thus do not support LMC 0. (I imagine setting lmc_esp0 to FALSE, results in the same behavior.) Subsequently even if you set LMC 0 in your opensm config file, there can be ports with LMC = 0 and LMC != 0 (e.g. from HCAs). Subsequently in alloc_ports_priv(), some ports will have priv set to NULL and some will not. Because of assumptions in osm_switch.c about priv != NULL when lmc 0, we hit a segfault. The issue didn't exist before b/c we allocated p_port-priv non-NULL no matter what. OK, I think I see. But this segfault can only occur in the case where LMC is configured 0, right? The issue is in osm_switch_recommend_path() when routing_for_lmc is true, but p_port-priv is NULL, right? Yup. The attached patch fixes the problem w/ updn. I haven't looked through all of the 2Qos code thoroughly to figure out the consequences of this change, so I'm just considering this a starting point for discussion. Torus-2QoS's use of port-priv is unique because it persists between routing sweeps. So if another routing engine runs after torus-2QoS and uses port-priv without having ensured that it set it itself, there will be trouble. 9ddcf3419ea was fixing such an issue. I can find only two calls of osm_switch_recommend_path(), and both seem to be to do the right thing, so I think your patch is OK. Sounds good. 
When reading over your comments about the 2Qos patches that affected this area, I wasn't quite sure how you were dealing with the p_port-priv, so I was unsure how my patch would affect things. In addition, with the possibility that SP0 ports will be LMC = 0, this code in osm_ucast_mgr.c ucast_mgr_process_tbl() does not look good. lids_per_port = 1 p_mgr-p_subn-opt.lmc; for (i = 0; i lids_per_port; i++) { cl_qlist_t *list = p_mgr-port_order_list; cl_list_item_t *item; for (item = cl_qlist_head(list); item != cl_qlist_end(list); item = cl_qlist_next(item)) { osm_port_t *port = cl_item_obj(item, port, list_item); ucast_mgr_process_port(p_mgr, p_sw, port, i); } } It iterates over all ports with the configured LMC, not the LMC of the port? I haven't thought about this too deeply or investigated deeply, so consider this another starting point for discussion. Hmm, looks like ucast_mgr_process_port() DTRT, though; it ignores lids that aren't in the range configured on the port? Ahh, I think you're right. It does appear to do the right thing there. I don't think it's a problem afterall. Al Al Subject: [PATCH] fix segfault corner case w/ updn routing and LMC 0 From: Albert L.Chu ch...@llnl.gov Date: Tue, 22 Mar 2011 17:36:16 -0700 Signed-off-by: Albert L. 
Chu ch...@llnl.gov Reviewed-by: Jim Schutt jasc...@sandia.gov -- Jim --- opensm/osm_ucast_mgr.c |4 1 files changed, 0 insertions(+), 4 deletions(-) diff --git a/opensm/osm_ucast_mgr.c b/opensm/osm_ucast_mgr.c index 4019589..211d6e0 100644 --- a/opensm/osm_ucast_mgr.c +++ b/opensm/osm_ucast_mgr.c @@ -318,10 +318,6 @@ static void alloc_ports_priv(osm_ucast_mgr_t * mgr) item = cl_qmap_next(item)) { port = (osm_port_t *) item; lmc = ib_port_info_get_lmc(port-p_physp-port_info); - if (!lmc) { - port-priv = NULL; - continue; - } r = malloc(sizeof(*r) + sizeof(r-guids[0]) * (1 lmc)); if (!r) { OSM_LOG(mgr-p_log, OSM_LOG_ERROR, ERR 3A09: -- Albert Chu ch...@llnl.gov Computer Scientist High Performance Systems Division Lawrence Livermore National Laboratory -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] please pull infiniband.git
Linus, please pull from master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git for-linus This tree is also available from kernel.org mirrors at: git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git for-linus This will get the second batch of 2.6.39 changes: - Dave Dillow's SRP initiator changes - Fix to uninitialized return value introduced by net tree merge - A couple of other minor cleanups David Dillow (7): IB/srp: always avoid non-zero offsets into an FMR IB/srp: move IB CM setup completion into its own function IB/srp: allow sg_tablesize to be set for each target IB/srp: rework mapping engine to use multiple FMR entries IB/srp: add support for indirect tables that don't fit in SRP_CMD IB/srp: try to use larger FMR sizes to cover our mappings IB: Increase DMA max_segment_size on Mellanox hardware Michael Heinz (1): IB/mad: Improve an error message so error code is included Roland Dreier (3): RDMA/nes: Don't print success message at level KERN_ERR Merge branch 'external-indirect' of git://git.kernel.org/.../dad/srp-initiator into srp Merge branches 'misc', 'nes' and 'srp' into for-next Sean Hefty (1): RDMA/addr: Fix return of uninitialized ret value drivers/infiniband/core/addr.c |2 +- drivers/infiniband/core/agent.c |3 +- drivers/infiniband/hw/mthca/mthca_main.c |3 + drivers/infiniband/hw/nes/nes.c |2 +- drivers/infiniband/ulp/srp/ib_srp.c | 725 -- drivers/infiniband/ulp/srp/ib_srp.h | 38 ++- 6 files changed, 521 insertions(+), 252 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [opensm] routing segfault + LMC 0 routing bug?
Albert Chu wrote: Hey Jim, On Wed, 2011-03-23 at 09:01 -0700, Jim Schutt wrote: Hi Al, Torus-2QoS's use of port-priv is unique because it persists between routing sweeps. So if another routing engine runs after torus-2QoS and uses port-priv without having ensured that it set it itself, there will be trouble. 9ddcf3419ea was fixing such an issue. I can find only two calls of osm_switch_recommend_path(), and both seem to be to do the right thing, so I think your patch is OK. Sounds good. When reading over your comments about the 2Qos patches that affected this area, I wasn't quite sure how you were dealing with the p_port-priv, so I was unsure how my patch would affect things. There's some comments at the top of osm_torus.c, in the definitions of struct endpoint and struct t_switch, that describe the rules for how torus-2QoS code can safely use -priv. They may shed some extra light... -- Jim -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [GIT PULL] please pull infiniband.git
On Wed, Mar 23, 2011 at 10:47 AM, Roland Dreier rol...@kernel.org wrote: drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/agent.c | 3 +- drivers/infiniband/hw/mthca/mthca_main.c | 3 + drivers/infiniband/hw/nes/nes.c | 2 +- drivers/infiniband/ulp/srp/ib_srp.c | 725 -- drivers/infiniband/ulp/srp/ib_srp.h | 38 ++- 6 files changed, 521 insertions(+), 252 deletions(-) Somehow I missed a hunk of IB: Increase DMA max_segment_size on Mellanox hardware diffstat now should look like drivers/infiniband/core/addr.c |2 +- drivers/infiniband/core/agent.c |3 +- drivers/infiniband/hw/mthca/mthca_main.c |3 + drivers/infiniband/hw/nes/nes.c |2 +- drivers/infiniband/ulp/srp/ib_srp.c | 725 -- drivers/infiniband/ulp/srp/ib_srp.h | 38 ++- drivers/net/mlx4/main.c |3 + 7 files changed, 524 insertions(+), 252 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [opensm] RFC: new routing options (repost)
Hi Alex, As discussed in a private thread, here are the patches again, with some tweaks. Most notably, the tweak ensures that the remote_guid_sorting option is independent of port_shifting, so users may enable either, none, or both options at their discretion. Al On Thu, 2011-02-10 at 17:33 -0800, Albert Chu wrote: [This is a repost from Oct 2010 with rebased patches] We recently got a new cluster and I've been experimenting with some routing changes to improve the average bandwidth of the cluster. They are attached as patches with description of the routing goals below. We're using mpiGraph (http://sourceforge.net/projects/mpigraph/) to measure min, peak, and average send/recv bandwidth across the cluster. What we found with the original updn routing was an average of around 420 MB/s send bandwidth and 508 MB/s recv bandwidth. The following two patches were able to get the average send bandwidth up to 1045 MB/s and recv bandwidth up to 1228 MB/s. I'm sure this is only round 1 of the patches and I'm looking for comments. Many areas could be cleaned up w/ some rearchitecture, but I elected to implement the most non-invasive implementation first. I'm also open to name changes on the options. 1) Port Shifting This is similar to what was done with some of the LMC 0 code. Congestion would occur due to alignment of routes w/ common traffic patterns. However, we found that it was also necessary for LMC=0 and only for used-ports. For example, lets say there are 4 ports (called A, B, C, D) and we are routing lids 1-9 through them. Suppose only routing through A, B, and C will reach lids 1-9. The LFT would normally be: A: 1 4 7 B: 2 5 8 C: 3 6 9 D: The Port Shifting option would make this: A: 1 6 8 B: 2 4 9 C: 3 5 7 D: This option by itself improved the mpiGraph average send/recv bandwidth from 420 MB/s and 508 MB/s to to 991 MB/s and 1172 MB/s. 
2) Remote Guid Sorting Most core/spine switches we've seen thus far have had line boards connected to spine boards in a consistent pattern. However, we recently got some Qlogic switches that connect from line/leaf boards to spine boards in a (to the casual observer) random pattern. I'm sure there was a good electrical/board reason for this design, but it does hurt routing b/c updn doesn't account for this. Here's an output from iblinkinfo as an example. Switch 0x00066a00ec0029b8 ibcore1 L123: 1801[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 254 19[ ] ibsw55 ( ) 1802[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 253 19[ ] ibsw56 ( ) 1803[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 258 19[ ] ibsw57 ( ) 1804[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 257 19[ ] ibsw58 ( ) 1805[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 256 19[ ] ibsw59 ( ) 1806[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 255 19[ ] ibsw60 ( ) 1807[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 261 19[ ] ibsw61 ( ) 1808[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 262 19[ ] ibsw62 ( ) 1809[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 260 19[ ] ibsw63 ( ) 180 10[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 259 19[ ] ibsw64 ( ) 180 11[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 284 19[ ] ibsw65 ( ) 180 12[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 285 19[ ] ibsw66 ( ) 180 13[ ] ==( 4X 10.0 Gbps Active/ LinkUp)==2227 19[ ] ibsw67 ( ) 180 14[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 283 19[ ] ibsw68 ( ) 180 15[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 267 19[ ] ibsw69 ( ) 180 16[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 270 19[ ] ibsw70 ( ) 180 17[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 269 19[ ] ibsw71 ( ) 180 18[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 268 19[ ] ibsw72 ( ) 180 19[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 222 17[ ] ibcore1 S117B ( ) 180 20[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 209 19[ ] ibcore1 S211B ( ) 180 21[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 218 21[ ] ibcore1 S117A ( ) 180 22[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 192 23[ ] ibcore1 S215B ( ) 180 23[ ] ==( 4X 10.0 Gbps Active/ 
LinkUp)== 85 15[ ] ibcore1 S209A ( ) 180 24[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 182 13[ ] ibcore1 S215A ( ) 180 25[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 200 11[ ] ibcore1 S115B ( ) 180 26[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 129 25[ ] ibcore1 S209B ( ) 180 27[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 213 27[ ] ibcore1 S115A ( ) 180 28[ ] ==( 4X 10.0 Gbps Active/ LinkUp)== 197 29[ ]
[PATCH] rdma/ucm: Fix race in ucma_create_id
The following problem was reported by Roland Dreier based on code inspection: I do see what seems like an exploitable race in ucma_create_id(): - one thread create an id with an invalid userspace pointer (so the copy_to_user in ucma_create_id returns -EFAULT and calls rdma_destroy_id before idr_remove) - another thread guess the id that is going to be returned and call ucma_destroy_id() if the second thread hits the window where the cm_id is destroyed but the ctx is still in the idr, it can trigger a double free. There is an issue here that the second thread can try to use the new rdma_cm_id in another call after it has been destroyed. (The problem isn't just restricted to the second thread calling ucma_destroy_id.) Fix this by holding the file-mux around the entire creation process, so another thread cannot find the id until it has been fully created. Signed-off-by: Sean Hefty sean.he...@intel.com --- drivers/infiniband/core/ucma.c | 17 + 1 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ca12acf..fd6b980 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -384,31 +384,32 @@ static ssize_t ucma_create_id(struct ucma_file *file, mutex_lock(file-mut); ctx = ucma_alloc_ctx(file); - mutex_unlock(file-mut); - if (!ctx) + if (!ctx) { + mutex_unlock(file-mut); return -ENOMEM; + } ctx-uid = cmd.uid; ctx-cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps); if (IS_ERR(ctx-cm_id)) { ret = PTR_ERR(ctx-cm_id); - goto err1; + goto err; } resp.id = ctx-id; if (copy_to_user((void __user *)(unsigned long)cmd.response, resp, sizeof(resp))) { ret = -EFAULT; - goto err2; + goto err; } + mutex_unlock(file-mut); return 0; -err2: - rdma_destroy_id(ctx-cm_id); -err1: - mutex_lock(mut); +err: idr_remove(ctx_idr, ctx-id); mutex_unlock(mut); + if (!IS_ERR(ctx-cm_id)) + rdma_destroy_id(ctx-cm_id); kfree(ctx); return ret; } -- To unsubscribe from this list: send the line 
unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] rdma/ucm: Fix race in ucma_create_id
ignore this - there are 2 locks being used in ucma_create_id I will resubmit diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index ca12acf..fd6b980 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -384,31 +384,32 @@ static ssize_t ucma_create_id(struct ucma_file *file, mutex_lock(file-mut); ctx = ucma_alloc_ctx(file); - mutex_unlock(file-mut); - if (!ctx) + if (!ctx) { + mutex_unlock(file-mut); return -ENOMEM; + } ctx-uid = cmd.uid; ctx-cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps); if (IS_ERR(ctx-cm_id)) { ret = PTR_ERR(ctx-cm_id); - goto err1; + goto err; } resp.id = ctx-id; if (copy_to_user((void __user *)(unsigned long)cmd.response, resp, sizeof(resp))) { ret = -EFAULT; - goto err2; + goto err; } + mutex_unlock(file-mut); return 0; -err2: - rdma_destroy_id(ctx-cm_id); -err1: - mutex_lock(mut); +err: idr_remove(ctx_idr, ctx-id); mutex_unlock(mut); + if (!IS_ERR(ctx-cm_id)) + rdma_destroy_id(ctx-cm_id); kfree(ctx); return ret; } -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html