Re: [ewg] bug 1918 - openmpi broken due to rdma-cm changes
On 2/7/2010 6:39 PM, Steve Wise wrote: If ofed-1.5.1 is based on 2.6.33 then it will get this patch automatically (assuming it goes upstream and makes 2.6.33). Or we can pull it in as a kernel_patches/fixes/ patch. OFED 1.5.1 is not based on 2.6.33, but on 2.6.30, so we need the patch under fixes. Steve - can you prepare such a patch? Tziporet -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices
On 2/8/2010 8:02 AM, Sean Hefty wrote: Since iWarp devices are not guaranteed to support loopback connections, prevent rdma_bind_addr from associating the loopback address with an iWarp device. Signed-off-by: Sean Hefty sean.he...@intel.com Steve, have you tested this patch? When it is accepted into the kernel, can you prepare a patch for OFED 1.5.1 under fixes? Thanks, Tziporet -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 02/9] ib/iser: new recv buffer posting logic
Currently, the recv buffer posting logic is based on the transactional nature of iser which allows for posting a buffer before sending a PDU. Change this to post only when the number of outstanding recv buffers is below a water mark and in a batched manner, thus simplifying and optimizing the data path. Use a pre-allocated ring of recv buffers instead of allocating from kmem cache. A special treatment is given to the login response buffer whose size must be 8K unlike the size of buffers used for any other purpose which is 128 bytes. Signed-off-by: Or Gerlitz ogerl...@voltaire.com --- drivers/infiniband/ulp/iser/iscsi_iser.c |2 drivers/infiniband/ulp/iser/iscsi_iser.h | 40 +++- drivers/infiniband/ulp/iser/iser_initiator.c | 235 +-- drivers/infiniband/ulp/iser/iser_verbs.c | 134 +-- 4 files changed, 235 insertions(+), 176 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iscsi_iser.h +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -102,9 +102,9 @@ #define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * * SCSI_TMFUNC(2), LOGOUT(1) */ -#define ISER_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ - ISER_MAX_RX_MISC_PDUS+ \ - ISER_MAX_TX_MISC_PDUS) +#define ISER_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISER_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX 2) /* the max TX (send) WR supported by the iSER QP is defined by * * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * @@ -132,6 +132,12 @@ struct iser_hdr { __be64 read_va; } __attribute__((packed)); +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) + +#define ISER_RECV_DATA_SEG_LEN 128 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) /* Length of an object name string */ #define ISER_OBJECT_NAME_SIZE 64 @@ -212,7 
+218,6 @@ struct iser_dto { }; enum iser_desc_type { - ISCSI_RX, ISCSI_TX_CONTROL , ISCSI_TX_SCSI_COMMAND, ISCSI_TX_DATAOUT @@ -228,6 +233,17 @@ struct iser_desc { struct iser_dto dto; }; +#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ + sizeof(u64) + sizeof(struct ib_sge))) +struct iser_rx_desc { + struct iser_hdr iser_header; + struct iscsi_hdr iscsi_header; + char data[ISER_RECV_DATA_SEG_LEN]; + u64 dma_addr; + struct ib_sgerx_sg; + char pad[ISER_RX_PAD_SIZE]; +} __attribute__((packed)); + struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; @@ -256,6 +272,12 @@ struct iser_conn { struct iser_page_vec *page_vec; /* represents SG to fmr maps* * maps serialized as tx is*/ struct list_head conn_list; /* entry in ig conn list */ + + char *login_buf; + u64 login_dma; + unsigned int rx_desc_head; + struct iser_rx_desc *rx_descs; + struct ib_recv_wrrx_wr[ISER_MIN_POSTED_RX]; }; struct iscsi_iser_conn { @@ -319,8 +341,9 @@ void iser_conn_put(struct iser_conn *ib_ void iser_conn_terminate(struct iser_conn *ib_conn); -void iser_rcv_completion(struct iser_desc *desc, -unsigned longdto_xfer_len); +void iser_rcv_completion(struct iser_rx_desc *desc, +unsigned longdto_xfer_len, + struct iser_conn *ib_conn); void iser_snd_completion(struct iser_desc *desc); @@ -332,6 +355,8 @@ void iser_dto_buffs_release(struct iser_ int iser_regd_buff_release(struct iser_regd_buf *regd_buf); +void iser_free_rx_descriptors(struct iser_conn *ib_conn); + void iser_reg_single(struct iser_device *device, struct iser_regd_buf*regd_buf, enum dma_data_direction direction); @@ -353,7 +378,8 @@ int iser_reg_page_vec(struct iser_conn void iser_unreg_mem(struct iser_mem_reg *mem_reg); -int iser_post_recv(struct iser_desc *rx_desc); +int iser_post_recvl(struct iser_conn *ib_conn); +int iser_post_recvm(struct iser_conn *ib_conn, int count); int iser_post_send(struct iser_desc *tx_desc); int iser_conn_state_comp(struct iser_conn *ib_conn, Index:
[PATCH V2 03/9] ib/iser: remove atomic counter for posted recv buffers
With both the posting and reaping of recv buffers being in the completion path, their outstanding number counter need not be atomic. Signed-off-by: Or Gerlitz ogerl...@voltaire.com --- drivers/infiniband/ulp/iser/iscsi_iser.h |2 +- drivers/infiniband/ulp/iser/iser_initiator.c |6 +++--- drivers/infiniband/ulp/iser/iser_verbs.c | 16 3 files changed, 12 insertions(+), 12 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iscsi_iser.h +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -266,7 +266,7 @@ struct iser_conn { struct ib_fmr_pool *fmr_pool; /* pool of IB FMRs */ int disc_evt_flag; /* disconn event delivered */ wait_queue_head_twait; /* waitq for conn/disconn */ - atomic_t post_recv_buf_count; /* posted rx count */ + int post_recv_buf_count; /* posted rx count */ atomic_t post_send_buf_count; /* posted tx count */ char name[ISER_OBJECT_NAME_SIZE]; struct iser_page_vec *page_vec; /* represents SG to fmr maps* Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c @@ -268,7 +268,7 @@ int iser_conn_set_full_featured_mode(str /* Check that there is no posted recv or send buffers left - */ /* they must be consumed during the login phase */ - BUG_ON(atomic_read(iser_conn-ib_conn-post_recv_buf_count) != 0); + BUG_ON(iser_conn-ib_conn-post_recv_buf_count != 0); BUG_ON(atomic_read(iser_conn-ib_conn-post_send_buf_count) != 0); if (iser_alloc_rx_descriptors(iser_conn-ib_conn)) @@ -569,12 +569,12 @@ void iser_rcv_completion(struct iser_rx_ * task eliminates the need to worry on tasks which are completed in * * parallel to the execution of iser_conn_term. 
So the code that waits * * for the posted rx bufs refcount to become zero handles everything */ - atomic_dec(conn-ib_conn-post_recv_buf_count); + conn-ib_conn-post_recv_buf_count--; if (rx_dma == ib_conn-login_dma) return; - outstanding = atomic_read(ib_conn-post_recv_buf_count); + outstanding = ib_conn-post_recv_buf_count; if (outstanding + ISER_MIN_POSTED_RX = ISER_QP_MAX_RECV_DTOS) { count = min(ISER_QP_MAX_RECV_DTOS - outstanding, ISER_MIN_POSTED_RX); Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_verbs.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c @@ -453,7 +453,7 @@ static void iser_disconnected_handler(st ISCSI_ERR_CONN_FAILED); /* Complete the termination process if no posts are pending */ - if ((atomic_read(ib_conn-post_recv_buf_count) == 0) + if (ib_conn-post_recv_buf_count == 0 (atomic_read(ib_conn-post_send_buf_count) == 0)) { ib_conn-state = ISER_CONN_DOWN; wake_up_interruptible(ib_conn-wait); @@ -500,7 +500,7 @@ void iser_conn_init(struct iser_conn *ib { ib_conn-state = ISER_CONN_INIT; init_waitqueue_head(ib_conn-wait); - atomic_set(ib_conn-post_recv_buf_count, 0); + ib_conn-post_recv_buf_count = 0; atomic_set(ib_conn-post_send_buf_count, 0); atomic_set(ib_conn-refcount, 1); INIT_LIST_HEAD(ib_conn-conn_list); @@ -651,11 +651,11 @@ int iser_post_recvl(struct iser_conn *ib rx_wr.num_sge = 1; rx_wr.next= NULL; - atomic_inc(ib_conn-post_recv_buf_count); + ib_conn-post_recv_buf_count++; ib_ret = ib_post_recv(ib_conn-qp, rx_wr, rx_wr_failed); if (ib_ret) { iser_err(ib_post_recv failed ret=%d\n, ib_ret); - atomic_dec(ib_conn-post_recv_buf_count); + ib_conn-post_recv_buf_count--; } return ib_ret; } @@ -679,11 +679,11 @@ int iser_post_recvm(struct iser_conn *ib rx_wr--; rx_wr-next = NULL; /* mark end of work requests list */ - atomic_add(count, ib_conn-post_recv_buf_count); + ib_conn-post_recv_buf_count += count; ib_ret = ib_post_recv(ib_conn-qp, ib_conn-rx_wr, 
rx_wr_failed); if (ib_ret) { iser_err(ib_post_recv failed ret=%d\n, ib_ret); - atomic_sub(count, ib_conn-post_recv_buf_count); +
[PATCH V2 04/9] ib/iser: use different CQ for send completions
Use a different CQ for send completions, where send completions are being polled by the interrupt driven recv completion handler. As such, interrupts aren't used for the send CQ. Signed-off-by: Or Gerlitz ogerl...@voltaire.com --- drivers/infiniband/ulp/iser/iscsi_iser.h |3 drivers/infiniband/ulp/iser/iser_verbs.c | 110 --- 2 files changed, 76 insertions(+), 37 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iscsi_iser.h +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -247,7 +247,8 @@ struct iser_rx_desc { struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; - struct ib_cq *cq; + struct ib_cq *rx_cq; + struct ib_cq *tx_cq; struct ib_mr *mr; struct tasklet_structcq_tasklet; struct list_head ig_list; /* entry in ig devices list */ Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_verbs.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c @@ -37,9 +37,8 @@ #include iscsi_iser.h #define ISCSI_ISER_MAX_CONN8 -#define ISER_MAX_CQ_LEN((ISER_QP_MAX_RECV_DTOS + \ - ISER_QP_MAX_REQ_DTOS) * \ -ISCSI_ISER_MAX_CONN) +#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) static void iser_cq_tasklet_fn(unsigned long data); static void iser_cq_callback(struct ib_cq *cq, void *cq_context); @@ -67,15 +66,23 @@ static int iser_create_device_ib_res(str if (IS_ERR(device-pd)) goto pd_err; - device-cq = ib_create_cq(device-ib_device, + device-rx_cq = ib_create_cq(device-ib_device, iser_cq_callback, iser_cq_event_callback, (void *)device, - ISER_MAX_CQ_LEN, 0); - if (IS_ERR(device-cq)) - goto cq_err; + ISER_MAX_RX_CQ_LEN, 0); + if (IS_ERR(device-rx_cq)) + goto rx_cq_err; - if (ib_req_notify_cq(device-cq, IB_CQ_NEXT_COMP)) + device-tx_cq = ib_create_cq(device-ib_device, + NULL, 
iser_cq_event_callback, + (void *)device, + ISER_MAX_TX_CQ_LEN, 0); + + if (IS_ERR(device-tx_cq)) + goto tx_cq_err; + + if (ib_req_notify_cq(device-rx_cq, IB_CQ_NEXT_COMP)) goto cq_arm_err; tasklet_init(device-cq_tasklet, @@ -93,8 +100,10 @@ static int iser_create_device_ib_res(str dma_mr_err: tasklet_kill(device-cq_tasklet); cq_arm_err: - ib_destroy_cq(device-cq); -cq_err: + ib_destroy_cq(device-tx_cq); +tx_cq_err: + ib_destroy_cq(device-rx_cq); +rx_cq_err: ib_dealloc_pd(device-pd); pd_err: iser_err(failed to allocate an IB resource\n); @@ -112,11 +121,13 @@ static void iser_free_device_ib_res(stru tasklet_kill(device-cq_tasklet); (void)ib_dereg_mr(device-mr); - (void)ib_destroy_cq(device-cq); + (void)ib_destroy_cq(device-tx_cq); + (void)ib_destroy_cq(device-rx_cq); (void)ib_dealloc_pd(device-pd); device-mr = NULL; - device-cq = NULL; + device-tx_cq = NULL; + device-rx_cq = NULL; device-pd = NULL; } @@ -179,8 +190,8 @@ static int iser_create_ib_conn_res(struc init_attr.event_handler = iser_qp_event_callback; init_attr.qp_context= (void *)ib_conn; - init_attr.send_cq = device-cq; - init_attr.recv_cq = device-cq; + init_attr.send_cq = device-tx_cq; + init_attr.recv_cq = device-rx_cq; init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN; @@ -772,18 +783,8 @@ int iser_post_send(struct iser_desc *tx_ static void iser_handle_comp_error(struct iser_desc *desc, struct iser_conn *ib_conn) { - struct iser_rx_desc *rx = (struct iser_rx_desc *)desc; - struct iser_rx_desc *rx_first = ib_conn-rx_descs; - struct iser_rx_desc *rx_last = rx_first + (ISER_QP_MAX_RECV_DTOS - 1); - - if ((char *)desc == ib_conn-login_buf || - (rx_first = rx rx = rx_last)) - ib_conn-post_recv_buf_count--; -else { /* type is TX control/command/dataout
[PATCH V2 05/9] ib/iser: simplify send flow/descriptors
Simplify and shrink the logic/code used for the send descriptors. Changes include removal of struct iser_dto which is unnecessary abstraction, use struct iser_regd_buf only for handling SCSI commands, use dma_sync instead of dma_map/unmap, etc. Signed-off-by: Or Gerlitz ogerl...@voltaire.com --- changes from V1: - remove white spaces that slipped in drivers/infiniband/ulp/iser/iscsi_iser.c | 34 - drivers/infiniband/ulp/iser/iscsi_iser.h | 48 +-- drivers/infiniband/ulp/iser/iser_initiator.c | 178 +-- drivers/infiniband/ulp/iser/iser_memory.c| 60 - drivers/infiniband/ulp/iser/iser_verbs.c | 75 +-- 5 files changed, 116 insertions(+), 279 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c @@ -39,26 +39,6 @@ #include iscsi_iser.h - -/* iser_dto_add_regd_buff - increments the reference count for * - * the registered buffer adds it to the DTO object */ -static void iser_dto_add_regd_buff(struct iser_dto *dto, - struct iser_regd_buf *regd_buf, - unsigned long use_offset, - unsigned long use_size) -{ - int add_idx; - - atomic_inc(regd_buf-ref_count); - - add_idx = dto-regd_vector_len; - dto-regd[add_idx] = regd_buf; - dto-used_sz[add_idx] = use_size; - dto-offset[add_idx] = use_offset; - - dto-regd_vector_len++; -} - /* Register user buffer memory and initialize passive rdma * dto descriptor. 
Total data size is stored in * iser_task-data[ISER_DIR_IN].data_len @@ -119,9 +99,9 @@ iser_prepare_write_cmd(struct iscsi_task struct iscsi_iser_task *iser_task = task-dd_data; struct iser_regd_buf *regd_buf; int err; - struct iser_dto *send_dto = iser_task-desc.dto; struct iser_hdr *hdr = iser_task-desc.iser_header; struct iser_data_buf *buf_out = iser_task-data[ISER_DIR_OUT]; + struct ib_sge *tx_dsg = iser_task-desc.tx_sg[1]; err = iser_dma_map_task_data(iser_task, buf_out, @@ -160,37 +140,36 @@ iser_prepare_write_cmd(struct iscsi_task if (imm_sz 0) { iser_dbg(Cmd itt:%d, WRITE, adding imm.data sz: %d\n, task-itt, imm_sz); - iser_dto_add_regd_buff(send_dto, - regd_buf, - 0, - imm_sz); + tx_dsg-addr = regd_buf-reg.va; + tx_dsg-length = imm_sz; + tx_dsg-lkey = regd_buf-reg.lkey; + iser_task-desc.num_sge = 2; } return 0; } /* creates a new tx descriptor and adds header regd buffer */ -static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn, - struct iser_desc *tx_desc) +static void iser_create_send_desc(struct iser_conn *ib_conn, + struct iser_tx_desc *tx_desc) { - struct iser_regd_buf *regd_hdr = tx_desc-hdr_regd_buf; - struct iser_dto *send_dto = tx_desc-dto; + struct iser_device *device = ib_conn-device; - memset(regd_hdr, 0, sizeof(struct iser_regd_buf)); - regd_hdr-device = iser_conn-ib_conn-device; - regd_hdr-virt_addr = tx_desc; /* == tx_desc-iser_header */ - regd_hdr-data_size = ISER_HEADERS_LEN; - - send_dto-ib_conn = iser_conn-ib_conn; - send_dto-notify_enable = 1; - send_dto-regd_vector_len = 0; + ib_dma_sync_single_for_cpu(device-ib_device, + tx_desc-dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); memset(tx_desc-iser_header, 0, sizeof(struct iser_hdr)); tx_desc-iser_header.flags = ISER_VER; - iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0); + tx_desc-num_sge = 1; + + if (tx_desc-tx_sg[0].lkey != device-mr-lkey) { + tx_desc-tx_sg[0].lkey = device-mr-lkey; + iser_dbg(sdesc %p lkey mismatch, fixing\n, tx_desc); + } } + int 
iser_alloc_rx_descriptors(struct iser_conn *ib_conn) { int i, j; @@ -303,12 +282,12 @@ int iser_send_command(struct iscsi_conn { struct iscsi_iser_conn *iser_conn = conn-dd_data; struct iscsi_iser_task *iser_task = task-dd_data; - struct iser_dto *send_dto = NULL; unsigned long edtl; int err; struct iser_data_buf *data_buf; struct iscsi_cmd *hdr = (struct iscsi_cmd *)task-hdr; struct scsi_cmnd *sc = task-sc; + struct iser_tx_desc *tx_desc = iser_task-desc; if (!iser_conn_state_comp(iser_conn-ib_conn, ISER_CONN_UP)) {
[PATCH V2 06/9] ib/iser: use atomic allocations
Two minor flows in iser's data path still use allocations; move them to be atomic as a preparation step towards moving to use libiscsi passthrough mode. Signed-off-by: Or Gerlitz ogerl...@voltaire.com --- drivers/infiniband/ulp/iser/iser_initiator.c |2 +- drivers/infiniband/ulp/iser/iser_memory.c|4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c @@ -373,7 +373,7 @@ int iser_send_data_out(struct iscsi_conn iser_dbg(%s itt %d dseg_len %d offset %d\n, __func__,(int)itt,(int)data_seg_len,(int)buf_offset); - tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_NOIO); + tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); if (tx_desc == NULL) { iser_err(Failed to alloc desc for post dataout\n); return -ENOMEM; Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_memory.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_memory.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_memory.c @@ -53,10 +53,10 @@ static int iser_start_rdma_unaligned_sg( unsigned long cmd_data_len = data-data_len; if (cmd_data_len ISER_KMALLOC_THRESHOLD) - mem = (void *)__get_free_pages(GFP_NOIO, + mem = (void *)__get_free_pages(GFP_ATOMIC, ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); else - mem = kmalloc(cmd_data_len, GFP_NOIO); + mem = kmalloc(cmd_data_len, GFP_ATOMIC); if (mem == NULL) { iser_err(Failed to allocate mem size %d %d for copying sglist\n, -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V2 09/9] remove redundant locking from iser scsi command response flow
currently iser recv completion flow takes the session lock twice. optimize it to avoid the first one by letting iser_task_rdma_finalize() be called only from the cleanup_task callback invoked by iscsi_free_task, thus reducing the contention on the session lock between the scsi command submission to the scsi command completion flows. Signed-off-by: Or Gerlitz ogerl...@voltaire.com Reviewed-by: Mike Christie micha...@cs.wisc.edu --- drivers/infiniband/ulp/iser/iser_initiator.c | 25 - 1 file changed, 25 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c @@ -440,10 +440,7 @@ void iser_rcv_completion(struct iser_rx_ struct iser_conn *ib_conn) { struct iscsi_iser_conn *conn = ib_conn-iser_conn; - struct iscsi_task *task; - struct iscsi_iser_task *iser_task; struct iscsi_hdr *hdr; - unsigned char opcode; u64 rx_dma; int rx_buflen, outstanding, count, err; @@ -464,28 +461,6 @@ void iser_rcv_completion(struct iser_rx_ iser_dbg(op 0x%x itt 0x%x dlen %d\n, hdr-opcode, hdr-itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); - opcode = hdr-opcode ISCSI_OPCODE_MASK; - - if (opcode == ISCSI_OP_SCSI_CMD_RSP) { - spin_lock(conn-iscsi_conn-session-lock); - task = iscsi_itt_to_ctask(conn-iscsi_conn, hdr-itt); - if (task) - __iscsi_get_task(task); - spin_unlock(conn-iscsi_conn-session-lock); - - if (!task) - iser_err(itt can't be matched to task!!! 
-conn %p opcode %d itt %d\n, -conn-iscsi_conn, opcode, hdr-itt); - else { - iser_task = task-dd_data; - iser_dbg(itt %d task %p\n,hdr-itt, task); - iser_task-status = ISER_TASK_STATUS_COMPLETED; - iser_task_rdma_finalize(iser_task); - iscsi_put_task(task); - } - } - iscsi_iser_recv(conn-iscsi_conn, hdr, rx_desc-data, rx_xfer_len - ISER_HEADERS_LEN); -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RESEND V2 09/9] ib/iser: remove redundant locking from iser scsi command response flow
currently iser recv completion flow takes the session lock twice. optimize it to avoid the first one by letting iser_task_rdma_finalize() be called only from the cleanup_task callback invoked by iscsi_free_task, thus reducing the contention on the session lock between the scsi command submission to the scsi command completion flows. Signed-off-by: Or Gerlitz ogerl...@voltaire.com Reviewed-by: Mike Christie micha...@cs.wisc.edu --- resending with a fixed subject line which contains the ib/iser: prefix drivers/infiniband/ulp/iser/iser_initiator.c | 25 - 1 file changed, 25 deletions(-) Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c === --- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c +++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c @@ -440,10 +440,7 @@ void iser_rcv_completion(struct iser_rx_ struct iser_conn *ib_conn) { struct iscsi_iser_conn *conn = ib_conn-iser_conn; - struct iscsi_task *task; - struct iscsi_iser_task *iser_task; struct iscsi_hdr *hdr; - unsigned char opcode; u64 rx_dma; int rx_buflen, outstanding, count, err; @@ -464,28 +461,6 @@ void iser_rcv_completion(struct iser_rx_ iser_dbg(op 0x%x itt 0x%x dlen %d\n, hdr-opcode, hdr-itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); - opcode = hdr-opcode ISCSI_OPCODE_MASK; - - if (opcode == ISCSI_OP_SCSI_CMD_RSP) { - spin_lock(conn-iscsi_conn-session-lock); - task = iscsi_itt_to_ctask(conn-iscsi_conn, hdr-itt); - if (task) - __iscsi_get_task(task); - spin_unlock(conn-iscsi_conn-session-lock); - - if (!task) - iser_err(itt can't be matched to task!!! 
-conn %p opcode %d itt %d\n, -conn-iscsi_conn, opcode, hdr-itt); - else { - iser_task = task-dd_data; - iser_dbg(itt %d task %p\n,hdr-itt, task); - iser_task-status = ISER_TASK_STATUS_COMPLETED; - iser_task_rdma_finalize(iser_task); - iscsi_put_task(task); - } - } - iscsi_iser_recv(conn-iscsi_conn, hdr, rx_desc-data, rx_xfer_len - ISER_HEADERS_LEN); -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps
On Sun, Feb 7, 2010 at 4:47 AM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Hal Rosenstock wrote: On Fri, Feb 5, 2010 at 9:18 AM, Eli Dorfman dorfman@gmail.com wrote: On Thu, Feb 4, 2010 at 10:52 PM, Hal Rosenstock hal.rosenst...@gmail.com wrote: On Thu, Feb 4, 2010 at 12:43 PM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Subject: [PATCH] Wrong handling of MC create and delete traps For these traps the GID in the data details is the MGID and not the source port gid. So the SM should check that subscriber port has the pkey of the MC group. There was also an error in comparing the subnet prefix and guid due to host/network order mismatch. Signed-off-by: Eli Dorfman e...@voltaire.com --- opensm/opensm/osm_inform.c | 151 --- 1 files changed, 98 insertions(+), 53 deletions(-) diff --git a/opensm/opensm/osm_inform.c b/opensm/opensm/osm_inform.c index 8108213..ae4fe71 100644 --- a/opensm/opensm/osm_inform.c +++ b/opensm/opensm/osm_inform.c @@ -341,6 +341,103 @@ Exit: return status; } +static int is_access_permitted( osm_infr_t *p_infr_rec, + osm_infr_match_ctxt_t *p_infr_match ) +{ + cl_list_t *p_infr_to_remove_list = p_infr_match-p_remove_infr_list; + ib_inform_info_t *p_ii = (p_infr_rec-inform_record.inform_info); + ib_mad_notice_attr_t *p_ntc = p_infr_match-p_ntc; + uint16_t trap_num = cl_ntoh16(p_ntc-g_or_v.generic.trap_num); + osm_subn_t *p_subn = p_infr_rec-sa-p_subn; + osm_log_t *p_log = p_infr_rec-sa-p_log; + char gid_str[INET6_ADDRSTRLEN]; + osm_mgrp_t *p_mgrp; + ib_gid_t source_gid; + osm_port_t *p_src_port; + osm_port_t *p_dest_port; + + /* In case of GID_IN(64) or GID_OUT(65) traps the source gid + comparison should be done on the trap source (saved as the gid in the + data details field). + For traps MC_CREATE(66) or MC_DELETE(67) the data details gid is + the MGID. We need to check whether subscriber has the pky of typo pkey + the MC group. Shouldn't this be the subscriber has a compatible pkey with MC group ? 
The MC group has a full member PKey and the members can be full or limited. I accept the correction. Doesn't this require a code change for handling trap cases 66-67 ? I think that you referred to the comment since the code is handling this properly (in my opinion). I was referring to both the comment and the code since a port with a compatible limited pkey should be able to receive the reports for MC groups. Sasha, can you please change this in the commit (only if there are not other remarks). Is that what you are asking Sasha to do (beyond the typos) ? I asked Sasha to fix only the typo in commit. BTW, there is no explicit reference in the IB spec for MC group create/delete trap (at least I didn't find it). Not sure what you mean by this. What didn't you find ? in the spec see o13-17.1.2 Yes, there appear to be some holes in the spec in terms of this and maybe more in this area (event forwarding) but the intent seems clear. -- Hal Thanks, Eli -- Hal + In all other cases the issuer gis is the trap source. typo ^^^ gid and this typo of course. Thanks, Eli -- Hal + */ + if (trap_num = 64 trap_num = 67 ) + /* The issuer of these traps is the SM so source_gid + is the gid saved on the data details */ + source_gid = p_ntc-data_details.ntc_64_67.gid; + else + source_gid = p_ntc-issuer_gid; + + p_dest_port = + cl_ptr_vector_get(p_subn-port_lid_tbl, + cl_ntoh16(p_infr_rec-report_addr.dest_lid)); + if (!p_dest_port) { + OSM_LOG(p_log, OSM_LOG_INFO, + Cannot find destination port with LID:%u\n, + cl_ntoh16(p_infr_rec-report_addr.dest_lid)); + goto Exit; + } + + switch (trap_num) { + case 66: + case 67: + p_mgrp = osm_get_mgrp_by_mgid(p_subn, source_gid); + if (!p_mgrp) { + OSM_LOG(p_log, OSM_LOG_INFO, + Cannot find MGID %s\n, + inet_ntop(AF_INET6, source_gid.raw, gid_str, sizeof gid_str)); + goto Exit; + } + + if (!osm_physp_has_pkey(p_log, + p_mgrp-mcmember_rec.pkey, + p_dest_port-p_physp)) { + OSM_LOG(p_log,
[PATCH] Add new device IDs for ConnectX VPI HCAs
Signed-off-by: Vladimir Sokolovsky v...@mellanox.co.il --- src/mlx4.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/src/mlx4.c b/src/mlx4.c index 1295c53..973df68 100644 --- a/src/mlx4.c +++ b/src/mlx4.c @@ -66,6 +66,8 @@ struct { HCA(MELLANOX, 0x6354), /* MT25408 Hermon QDR */ HCA(MELLANOX, 0x6732), /* MT25408 Hermon DDR PCIe gen2 */ HCA(MELLANOX, 0x673c), /* MT25408 Hermon QDR PCIe gen2 */ + HCA(MELLANOX, 0x6746), /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE Virt+ */ + HCA(MELLANOX, 0x6778), /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR / 10GigE Virt+ */ }; static struct ibv_context_ops mlx4_ctx_ops = { -- 1.5.4.3 -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] bug 1918 - openmpi broken due to rdma-cm changes
Tziporet Koren wrote: On 2/7/2010 6:39 PM, Steve Wise wrote: If ofed-1.5.1 is based on 2.6.33 then it will get this patch automatically (assuming it goes upstream and makes 2.6.33). Or we can pull it in as a kernel_patches/fixes/ patch. OFED 1.5.1 is not based on 2.6.33, but on 2.6.30, so we need the patch under fixes. Steve - can you prepare such a patch? Tziporet The reason I thought it was based on 2.6.33, is because I see 2.6.33 git tags in the ofed kernel tree. I misinterpreted what that meant. I can develop a patch, but it will disable _all_ 127.0.0.1 binds. Otherwise openmpi is still broken on IB. Steve. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices
This patch doesn't solve the openmpi/IB regression. So for OFED, IMO, we need a different patch... Tziporet Koren wrote: On 2/8/2010 8:02 AM, Sean Hefty wrote: Since iWarp devices are not guaranteed to support loopback connections, prevent rdma_bind_addr from associating the loopback address with an iWarp device. Signed-off-by: Sean Hefty sean.he...@intel.com Steve, have you tested this patch? When it is accepted into the kernel, can you prepare a patch for OFED 1.5.1 under fixes? Thanks, Tziporet -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps
Hal Rosenstock wrote: On Sun, Feb 7, 2010 at 4:47 AM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Hal Rosenstock wrote: On Fri, Feb 5, 2010 at 9:18 AM, Eli Dorfman dorfman@gmail.com wrote: On Thu, Feb 4, 2010 at 10:52 PM, Hal Rosenstock hal.rosenst...@gmail.com wrote: On Thu, Feb 4, 2010 at 12:43 PM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Subject: [PATCH] Wrong handling of MC create and delete traps For these traps the GID in the data details is the MGID and not the source port gid. So the SM should check that subscriber port has the pkey of the MC group. There was also an error in comparing the subnet prefix and guid due to host/network order mismatch. Signed-off-by: Eli Dorfman e...@voltaire.com --- opensm/opensm/osm_inform.c | 151 --- 1 files changed, 98 insertions(+), 53 deletions(-) diff --git a/opensm/opensm/osm_inform.c b/opensm/opensm/osm_inform.c index 8108213..ae4fe71 100644 --- a/opensm/opensm/osm_inform.c +++ b/opensm/opensm/osm_inform.c @@ -341,6 +341,103 @@ Exit: return status; } +static int is_access_permitted( osm_infr_t *p_infr_rec, + osm_infr_match_ctxt_t *p_infr_match ) +{ + cl_list_t *p_infr_to_remove_list = p_infr_match-p_remove_infr_list; + ib_inform_info_t *p_ii = (p_infr_rec-inform_record.inform_info); + ib_mad_notice_attr_t *p_ntc = p_infr_match-p_ntc; + uint16_t trap_num = cl_ntoh16(p_ntc-g_or_v.generic.trap_num); + osm_subn_t *p_subn = p_infr_rec-sa-p_subn; + osm_log_t *p_log = p_infr_rec-sa-p_log; + char gid_str[INET6_ADDRSTRLEN]; + osm_mgrp_t *p_mgrp; + ib_gid_t source_gid; + osm_port_t *p_src_port; + osm_port_t *p_dest_port; + + /* In case of GID_IN(64) or GID_OUT(65) traps the source gid + comparison should be done on the trap source (saved as the gid in the + data details field). + For traps MC_CREATE(66) or MC_DELETE(67) the data details gid is + the MGID. We need to check whether subscriber has the pky of typo pkey + the MC group. Shouldn't this be the subscriber has a compatible pkey with MC group ? 
The MC group has a full member PKey and the members can be full or limited. I accept the correction. Doesn't this require a code change for handling trap cases 66-67 ? I think that you referred to the comment since the code is handling this properly (in my opinion). I was referring to both the comment and the code since a port with a compatible limited pkey should be able to receive the reports for MC groups. I agree and I think that the code is handling this case properly. osm_physp_has_pkey() takes the 15 lower MGID pkey bits and checks whether it is the physp pkey table. Eli Sasha, can you please change this in the commit (only if there are not other remarks). Is that what you are asking Sasha to do (beyond the typos) ? I asked Sasha to fix only the typo in commit. BTW, there is no explicit reference in the IB spec for MC group create/delete trap (at least I didn't find it). Not sure what you mean by this. What didn't you find ? in the spec see o13-17.1.2 Yes, there appear to be some holes in the spec in terms of this and maybe more in this area (event forwarding) but the intent seems clear. -- Hal Thanks, Eli -- Hal + In all other cases the issuer gis is the trap source. typo ^^^ gid and this typo of course. 
Thanks, Eli -- Hal + */ + if (trap_num = 64 trap_num = 67 ) + /* The issuer of these traps is the SM so source_gid + is the gid saved on the data details */ + source_gid = p_ntc-data_details.ntc_64_67.gid; + else + source_gid = p_ntc-issuer_gid; + + p_dest_port = + cl_ptr_vector_get(p_subn-port_lid_tbl, + cl_ntoh16(p_infr_rec-report_addr.dest_lid)); + if (!p_dest_port) { + OSM_LOG(p_log, OSM_LOG_INFO, + Cannot find destination port with LID:%u\n, + cl_ntoh16(p_infr_rec-report_addr.dest_lid)); + goto Exit; + } + + switch (trap_num) { + case 66: + case 67: + p_mgrp = osm_get_mgrp_by_mgid(p_subn, source_gid); + if (!p_mgrp) { + OSM_LOG(p_log, OSM_LOG_INFO, + Cannot find MGID %s\n, + inet_ntop(AF_INET6, source_gid.raw, gid_str, sizeof gid_str)); + goto Exit; + } + + if
Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices
This patch doesn't solve the openmpi/IB regression. So for OFED, IMO, we need a different patch... If this doesn't solve the regression then we should have a different patch for upstream too. The goal for 2.6.33 should be to keep open mpi working, even if that requires us to go back to old breakage. -- Roland Dreier rola...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC Patch] net: reserve ports for applications using fixed port numbers
On Monday 08 February 2010 05:21:50 you wrote: Octavian Purdila wrote: On Friday 05 February 2010 06:45:38 you wrote: Again, using bitmap algorithm is not a problem and it's better, the problem is sysctl interface, how would you plan to interact with users via sysctl/proc if you use bitmap to handle this? I would like to hear more details about this. We could use something like positive values for setting and negative for reset (e.g. 3 would set the port in the bitmap and -3 would reset it). Hmm, then how do you output the info of those ports? Arrays of bitmaps? See the patch bellow (work in progress). BTW, while working on it I added some helpers, which we can use to rewrite the proc_doint/long stuff. I think it will help with readability and eliminates some code duplication as well. What do you guys think about that? --- linux_2.6.32/main/src/kernel/sysctl.c +++ linux_2.6.32/main/src/kernel/sysctl.c @@ -250,6 +250,11 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ #endif +static unsigned long test_bitmap[65535/sizeof(long)]; +static int proc_dobitmap(struct ctl_table *table, int write, +void __user *buf, size_t *lenp, loff_t *ppos); + + static struct ctl_table kern_table[] = { { .ctl_name = CTL_UNNUMBERED, @@ -1032,6 +1037,15 @@ .proc_handler = proc_dointvec, }, #endif + { + .ctl_name = CTL_UNNUMBERED, + .procname = bitmap_test, + .data = test_bitmap, + .maxlen = 65535, + .mode = 0644, + .proc_handler = proc_dobitmap, + }, + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2902,6 +2916,194 @@ return 0; } +static int proc_skip_wspace(char __user **buf, size_t *size) +{ + char c; + + while (*size) { + if (get_user(c, *buf)) + return -EFAULT; + if (!isspace(c)) + break; + *size--; *buf++; + } + + return 0; +} + +static inline int _proc_get_ulong(char __user **buf, size_t *size, + unsigned long *val, bool *neg) +{ +#define TMPBUFLEN 21 + int len = *size; + char *p, 
tmp[TMPBUFLEN]; + + if (len TMPBUFLEN-1) + len = TMPBUFLEN-1; + + if (copy_from_user(tmp, *buf, len)) + return -EFAULT; + + tmp[len] = 0; + p = tmp; + if (*p == '-' *size 1) { + *neg = 1; + p++; + } + if (*p '0' || *p '9') + return -EINVAL; + + *val = simple_strtoul(p, p, 0); + + len = p - tmp; + if ((len *size) *p !isspace(*p)) + return -EINVAL; + + *buf += len; *size -= len; + + return 0; +#undef TMPBUFLEN +} + +static int proc_get_long(char __user **buf, size_t *size, long *val) +{ + int err; + bool neg; + unsigned long uval; + + err = _proc_get_ulong(buf, size, uval, neg); + if (err) + return err; + + if (neg) + *val = -uval; + else + *val = uval; + + return 0; +} + +static int proc_get_ulong(char __user **buf, size_t *size, unsigned long *val) +{ + int err; + bool neg; + + err = _proc_get_ulong(buf, size, val, neg); + if (err) + return err; + if (neg) + return -EINVAL; + + return 0; +} + +static int proc_put_ulong(char __user **buf, size_t *size, unsigned long val, + bool first) +{ +#define TMPBUFLEN 21 + int len; + char tmp[TMPBUFLEN], *p = tmp; + + if (!first) + *p++ = '\t'; + sprintf(p, %lu, val); + len = strlen(tmp); + if (len *size) + len = *size; + if (copy_to_user(*buf, tmp, len)) + return -EFAULT; + *size -= len; + *buf += len; + return 0; +#undef TMPBUFLEN +} + +static int proc_put_newline(char __user **buf, size_t *size) +{ + if (*size) { + if (put_user('\n', *buf)) + return -EFAULT; + *size--, *buf++; + } + return 0; +} + +static int proc_dobitmap(struct ctl_table *table, int write, +void __user *buf, size_t *lenp, loff_t *ppos) +{ + bool first = 1; + unsigned long *bitmap = (unsigned long *) table-data; + unsigned long bitmap_len = table-maxlen; + int left = *lenp, err = 0; + char __user *buffer = (char __user *) buf; + + if (!bitmap_len || !left || (*ppos !write)) { + *lenp = 0; + return 0; + } + + if (write) { + while (left) { + long val; + +
Re: Patch series from Dec 2009 which needs reviewing/applying
[PATCH 0/2] fix SRQ WQE buffer initialization in liblmx4 and in mlx4_ib http://www.spinics.net/lists/linux-rdma/msg01911.html This isn't a patch AFAICT. [PATCH 1/2] libmlx4: initialize SRQ scatter entries when creating an SRQ http://www.spinics.net/lists/linux-rdma/msg01912.html Just applied this. [PATCH 2/2] mlx4_ib: initialize SRQ scatter entries when creating an SRQ http://www.spinics.net/lists/linux-rdma/msg01910.html Has been upstream as 4c425588 for a few weeks. -- Roland Dreier rola...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/index.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps
On Mon, Feb 8, 2010 at 11:05 AM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Hal Rosenstock wrote: On Sun, Feb 7, 2010 at 4:47 AM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Hal Rosenstock wrote: On Fri, Feb 5, 2010 at 9:18 AM, Eli Dorfman dorfman@gmail.com wrote: On Thu, Feb 4, 2010 at 10:52 PM, Hal Rosenstock hal.rosenst...@gmail.com wrote: On Thu, Feb 4, 2010 at 12:43 PM, Eli Dorfman (Voltaire) dorfman@gmail.com wrote: Subject: [PATCH] Wrong handling of MC create and delete traps For these traps the GID in the data details is the MGID and not the source port gid. So the SM should check that subscriber port has the pkey of the MC group. There was also an error in comparing the subnet prefix and guid due to host/network order mismatch. Signed-off-by: Eli Dorfman e...@voltaire.com --- opensm/opensm/osm_inform.c | 151 --- 1 files changed, 98 insertions(+), 53 deletions(-) diff --git a/opensm/opensm/osm_inform.c b/opensm/opensm/osm_inform.c index 8108213..ae4fe71 100644 --- a/opensm/opensm/osm_inform.c +++ b/opensm/opensm/osm_inform.c @@ -341,6 +341,103 @@ Exit: return status; } +static int is_access_permitted( osm_infr_t *p_infr_rec, + osm_infr_match_ctxt_t *p_infr_match ) +{ + cl_list_t *p_infr_to_remove_list = p_infr_match-p_remove_infr_list; + ib_inform_info_t *p_ii = (p_infr_rec-inform_record.inform_info); + ib_mad_notice_attr_t *p_ntc = p_infr_match-p_ntc; + uint16_t trap_num = cl_ntoh16(p_ntc-g_or_v.generic.trap_num); + osm_subn_t *p_subn = p_infr_rec-sa-p_subn; + osm_log_t *p_log = p_infr_rec-sa-p_log; + char gid_str[INET6_ADDRSTRLEN]; + osm_mgrp_t *p_mgrp; + ib_gid_t source_gid; + osm_port_t *p_src_port; + osm_port_t *p_dest_port; + + /* In case of GID_IN(64) or GID_OUT(65) traps the source gid + comparison should be done on the trap source (saved as the gid in the + data details field). + For traps MC_CREATE(66) or MC_DELETE(67) the data details gid is + the MGID. We need to check whether subscriber has the pky of typo pkey + the MC group. 
Shouldn't this be the subscriber has a compatible pkey with MC group ? The MC group has a full member PKey and the members can be full or limited. I accept the correction. Doesn't this require a code change for handling trap cases 66-67 ? I think that you referred to the comment since the code is handling this properly (in my opinion). I was referring to both the comment and the code since a port with a compatible limited pkey should be able to receive the reports for MC groups. I agree and I think that the code is handling this case properly. osm_physp_has_pkey() takes the 15 lower MGID pkey bits and checks whether it is the physp pkey table. You're right; the code handles it. I missed the ib_pkey_get_base call there. -- Hal Eli Sasha, can you please change this in the commit (only if there are not other remarks). Is that what you are asking Sasha to do (beyond the typos) ? I asked Sasha to fix only the typo in commit. BTW, there is no explicit reference in the IB spec for MC group create/delete trap (at least I didn't find it). Not sure what you mean by this. What didn't you find ? in the spec see o13-17.1.2 Yes, there appear to be some holes in the spec in terms of this and maybe more in this area (event forwarding) but the intent seems clear. -- Hal Thanks, Eli -- Hal + In all other cases the issuer gis is the trap source. typo ^^^ gid and this typo of course. 
Thanks, Eli -- Hal + */ + if (trap_num = 64 trap_num = 67 ) + /* The issuer of these traps is the SM so source_gid + is the gid saved on the data details */ + source_gid = p_ntc-data_details.ntc_64_67.gid; + else + source_gid = p_ntc-issuer_gid; + + p_dest_port = + cl_ptr_vector_get(p_subn-port_lid_tbl, + cl_ntoh16(p_infr_rec-report_addr.dest_lid)); + if (!p_dest_port) { + OSM_LOG(p_log, OSM_LOG_INFO, + Cannot find destination port with LID:%u\n, + cl_ntoh16(p_infr_rec-report_addr.dest_lid)); + goto Exit; + } + + switch (trap_num) { + case 66: + case 67: + p_mgrp = osm_get_mgrp_by_mgid(p_subn, source_gid); + if (!p_mgrp) { + OSM_LOG(p_log, OSM_LOG_INFO, + Cannot find MGID %s\n, +
Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices
Jason Gunthorpe wrote: On Mon, Feb 08, 2010 at 08:52:10AM -0800, Roland Dreier wrote: This patch doesn't solve the openmpi/IB regression. So for OFED, IMO, we need a different patch... If this doesn't solve the regression then we should have a different patch for upstream too. The goal for 2.6.33 should be to keep open mpi working, even if that requires us to go back to old breakage. Steve, I thought you said earlier in the thread that the rdmacm OMPI method is not used that often with IB - and the other IB connect methods work fine. Maybe Jeff can chime in here, but he mentioned to me that Sandia Labs were using IB/rdmacm. This really is a bug in OMPI, how long do you think this new feature should remain outside the upstream kernel? Is someone going to commit to fixing OMPI soon if the patch is removed? IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback. But I believe Jeff asked at least that we pull it from 2.6.33 and let OMPI get its next release out with the OMPI fix. Then you can push it into 2.6.34 if we really want this feature. I will commit to get the fix in openmpi asap. Steve. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH RFC] Here is an untested patch that I think removes support for binding
to 127.0.0.1. Sean will this work? If we agree to do this for 2.6.33, then I'll build/test this and resubmit. rdma/cma: Disallow binding rdma endpoints to 127.0.0.1. Currently this functionality breaks openmpi. Once openmpi is fixed to correctly ignore 127.0.0.1 as a valid external rdma address, we can re-enable this functionality. Signed-off-by: Steve Wise sw...@opengridcomputing.com --- drivers/infiniband/core/cma.c | 16 ++-- 1 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index cc9b594..cd3d351 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -628,19 +628,9 @@ static inline int cma_zero_addr(struct sockaddr *addr) } } -static inline int cma_loopback_addr(struct sockaddr *addr) -{ - if (addr-sa_family == AF_INET) - return ipv4_is_loopback( - ((struct sockaddr_in *) addr)-sin_addr.s_addr); - else - return ipv6_addr_loopback( - ((struct sockaddr_in6 *) addr)-sin6_addr); -} - static inline int cma_any_addr(struct sockaddr *addr) { - return cma_zero_addr(addr) || cma_loopback_addr(addr); + return cma_zero_addr(addr); } static inline __be16 cma_port(struct sockaddr *addr) @@ -2115,9 +2105,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - if (cma_loopback_addr(addr)) { - ret = cma_bind_loopback(id_priv); - } else if (!cma_zero_addr(addr)) { + if (!cma_zero_addr(addr)) { ret = rdma_translate_ip(addr, id-route.addr.dev_addr); if (ret) goto err1; -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices
IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback. I disagree, but what does it matter? So, we add a 'software' loopback that uses 127.0.0.1. Openmpi still wouldn't work. I will commit to get the fix in openmpi asap. If we don't care if the fix is in the kernel or user space, then we could add a 'disable-loopback-support' build option to librdmacm, which can fail any attempt to bind to a loopback address. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices
Sean Hefty wrote: IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback. I disagree, but what does it matter? So, we add a 'software' loopback that uses 127.0.0.1. Openmpi still wouldn't work. I guess that's true. I will commit to get the fix in openmpi asap. If we don't care if the fix is in the kernel or user space, then we could add a 'disable-loopback-support' build option to librdmacm, which can fail any attempt to bind to a loopback address. I'd rather see it removed from 2.6.33 kernel before it ships, and then we fix openmpi, and then re-submit 127.0.0.1 support once openmpi publishes a release with its fix. See my other email that submits a potential commit to remove 127.0.0.1 support for 2.6.33. Steve. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 00/18] Increase maximum number of Infiniband HCAs per system
Hi Roland, Any thoughts on this patch series? Thanks, /ac * Alex Chiang achi...@hp.com: This is v2 of a patch series that increases the maximum number of IB HCAs supported per system. The original mail thread is here: http://lkml.org/lkml/2010/1/29/346 One note, I decided to copy/paste since factoring out the overflow code in the three drivers seemed like overkill. If so desired, I could factor those three separate functions into something provided by the core, but that seemed like more trouble than it was worth at the time. As before, I still don't have access to a giant system, so what I did to test was to stick 4 cards into a small system, and then modify the limits with debug patches similar to this: diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 7bf0a82..8581e64 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -102,7 +102,7 @@ struct ib_ucm_event { enum { IB_UCM_MAJOR = 231, IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = 32 + IB_UCM_MAX_DEVICES = 2 }; I tested all 3 drivers this way (uverbs, umad, ucm). I verified that we're not leaking device numbers on multiple modprobe/rmmod cycles, that there aren't any funny interactions when various combinations of the drivers are loaded. I did not test the rest of the OFED stack. I did write some trivial programs to open the devices in /dev and close them again. 
Here's an example of some of the testing: dl585g2:~ # modprobe ib_uverbs dl585g2:~ # modprobe ib_umad dl585g2:~ # modprobe ib_ucm dl585g2:~ # ls -l /dev/uverb* crw-rw 1 root root 231, 192 Feb 2 05:55 /dev/uverbs0 crw-rw 1 root root 231, 193 Feb 2 05:55 /dev/uverbs1 crw-rw 1 root root 249, 0 Feb 2 05:55 /dev/uverbs2 crw-rw 1 root root 249, 1 Feb 2 05:55 /dev/uverbs3 dl585g2:~ # ls -l /dev/umad* crw-rw 1 root root 231, 0 Feb 2 05:55 /dev/umad0 crw-rw 1 root root 231, 1 Feb 2 05:55 /dev/umad1 crw-rw 1 root root 231, 2 Feb 2 05:55 /dev/umad2 crw-rw 1 root root 231, 3 Feb 2 05:55 /dev/umad3 crw-rw 1 root root 248, 0 Feb 2 05:55 /dev/umad4 crw-rw 1 root root 248, 1 Feb 2 05:55 /dev/umad5 crw-rw 1 root root 248, 2 Feb 2 05:55 /dev/umad6 crw-rw 1 root root 248, 3 Feb 2 05:55 /dev/umad7 dl585g2:~ # ls -l /dev/issm* crw-rw 1 root root 231, 4 Feb 2 05:55 /dev/issm0 crw-rw 1 root root 231, 5 Feb 2 05:55 /dev/issm1 crw-rw 1 root root 231, 6 Feb 2 05:55 /dev/issm2 crw-rw 1 root root 231, 7 Feb 2 05:55 /dev/issm3 crw-rw 1 root root 248, 4 Feb 2 05:55 /dev/issm4 crw-rw 1 root root 248, 5 Feb 2 05:55 /dev/issm5 crw-rw 1 root root 248, 6 Feb 2 05:55 /dev/issm6 crw-rw 1 root root 248, 7 Feb 2 05:55 /dev/issm7 dl585g2:~ # ls -l /dev/ucm* crw-rw 1 root root 231, 224 Feb 2 05:55 /dev/ucm0 crw-rw 1 root root 231, 225 Feb 2 05:55 /dev/ucm1 crw-rw 1 root root 247, 0 Feb 2 05:55 /dev/ucm2 crw-rw 1 root root 247, 1 Feb 2 05:55 /dev/ucm3 Note that the major and minor numbers are behaving rather sanely. dl585g2:~ # rmmod ib_ucm dl585g2:~ # rmmod ib_uverbs dl585g2:~ # rmmod ib_umad Reset. dl585g2:~ # modprobe ib_ucm dl585g2:~ # ls -l /dev/ucm* crw-rw 1 root root 231, 224 Feb 2 05:57 /dev/ucm0 crw-rw 1 root root 231, 225 Feb 2 05:57 /dev/ucm1 crw-rw 1 root root 248, 0 Feb 2 05:57 /dev/ucm2 crw-rw 1 root root 248, 1 Feb 2 05:57 /dev/ucm3 See that /dev/ucm* devices now have a different major number compared to last time(248 vs 247), since we loaded that driver first. 
But wait, why is it 248 and not 249? Is there a leak somewhere? dl585g2:~ # ls -l /dev/uverb* crw-rw 1 root root 231, 192 Feb 2 05:57 /dev/uverbs0 crw-rw 1 root root 231, 193 Feb 2 05:57 /dev/uverbs1 crw-rw 1 root root 249, 0 Feb 2 05:57 /dev/uverbs2 crw-rw 1 root root 249, 1 Feb 2 05:57 /dev/uverbs3 dl585g2:~ # rmmod ib_uverbs ERROR: Module ib_uverbs is in use by ib_ucm Ah, ib_ucm is dependent on ib_uverbs, so when we modprobed ib_ucm, in reality ib_uverbs got loaded first. See how it has a higher major number. dl585g2:~ # rmmod ib_ucm dl585g2:~ # rmmod ib_uverbs dl585g2:~ # modprobe ib_umad dl585g2:~ # ls -l /dev/umad* crw-rw 1 root root 231, 0 Feb 2 05:58 /dev/umad0 crw-rw 1 root root 231, 1 Feb 2 05:58 /dev/umad1 crw-rw 1 root root 231, 2 Feb 2 05:58 /dev/umad2 crw-rw 1 root root 231, 3 Feb 2 05:58 /dev/umad3 crw-rw 1 root root 249, 0 Feb 2 05:58 /dev/umad4 crw-rw 1 root root 249, 1 Feb 2 05:58
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
Sorry -- I missed many of these mails today due to mail filtering (don't ask). FWIW: - I'm not opposed to adding LOOPBACK checks into OMPI to avoid this problem (I'm waiting for a patch, actually). I'm just saying that we're not going to get a release out immediately with this fix. Our next release was scheduled to be 1.4.2, and it is still at least several weeks away. So allowing this in 2.6.33 would be Bad because a) we know it breaks OMPI, and b) OMPI can't get a release out immediately to fix the issue. - There are customers who are using RDMA CM with IB (e.g., Sandia with their Mesh/IB routing stuff). - I see the following in rdma_bind_addr(3): - DESCRIPTION Associates a source address with an rdma_cm_id. The address may be wildcarded. If binding to a specific local address, the rdma_cm_id will also be bound to a local RDMA device. - What RDMA device is bound to when you use 127.0.0.1? I'm not 100% sure, but I think that this might be where we got the rationale that we didn't need additional LOOPBACK tests in OMPI... (if anyone else agrees with this interpretation, then it's at least one argument that allowing binding to LOOPBACK devices *is* a change in semantics, and therefore should be treated extremely carefully) On Feb 8, 2010, at 4:16 PM, Steve Wise wrote: Sean Hefty wrote: IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback. I disagree, but what does it matter? So, we add a 'software' loopback that uses 127.0.0.1. Openmpi still wouldn't work. I guess that's true. I will commit to get the fix in openmpi asap. If we don't care if the fix is in the kernel or user space, then we could add a 'disable-loopback-support' build option to librdmacm, which can fail any attempt to bind to a loopback address. I'd rather see it removed from 2.6.33 kernel before it ships, and then we fix openmpi, and then re-submit 127.0.0.1 support once openmpi publishes a release with its fix. 
See my other email that submits a potential commit to remove 127.0.0.1 support for 2.6.33. Steve. -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/ -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH resend] rdma/cma: Disallow binding rdma endpoints to 127.0.0.1.
Here is a not tested patch that I think removes support for binding to 127.0.0.1. Sean will this work? If we agree to do this for 2.6.33, then I'll build/test this and resubmit. rdma/cma: Disallow binding rdma endpoints to 127.0.0.1. Currently this functionality breaks openmpi. Once openmpi is fixed to correctly ignore 127.0.0.1 as a valid external rdma address, we can re-enable this functionality. Signed-off-by: Steve Wise sw...@opengridcomputing.com --- drivers/infiniband/core/cma.c | 16 ++-- 1 files changed, 2 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index cc9b594..cd3d351 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -628,19 +628,9 @@ static inline int cma_zero_addr(struct sockaddr *addr) } } -static inline int cma_loopback_addr(struct sockaddr *addr) -{ - if (addr-sa_family == AF_INET) - return ipv4_is_loopback( - ((struct sockaddr_in *) addr)-sin_addr.s_addr); - else - return ipv6_addr_loopback( - ((struct sockaddr_in6 *) addr)-sin6_addr); -} - static inline int cma_any_addr(struct sockaddr *addr) { - return cma_zero_addr(addr) || cma_loopback_addr(addr); + return cma_zero_addr(addr); } static inline __be16 cma_port(struct sockaddr *addr) @@ -2115,9 +2105,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - if (cma_loopback_addr(addr)) { - ret = cma_bind_loopback(id_priv); - } else if (!cma_zero_addr(addr)) { + if (!cma_zero_addr(addr)) { ret = rdma_translate_ip(addr, id-route.addr.dev_addr); if (ret) goto err1; -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
On Feb 8, 2010, at 5:09 PM, Jason Gunthorpe wrote: DESCRIPTION Associates a source address with an rdma_cm_id. The address may be wildcarded. If binding to a specific local address, the rdma_cm_id will also be bound to a local RDMA device. This statement is trying to say that if a source address is given then the rdma_cm_id will be bound to a device. Which device is bound to if you specify 127.0.0.1 as the source address? (which is what OMPI is doing) Is it possible to assign 127.0.0.1 to an RDMA device? -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/ -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
On Feb 8, 2010, at 5:13 PM, Sean Hefty wrote: Are you certain that rdma_bind_addr does NOT work with 127.0.0.1, and that this is now the problem? It does appear to work on OFED 1.4 and on 2.6.26 based on ucmatose. Is the problem really with rdma_bind_addr succeeding, or with rdma_connect, which now works, or rdma_bind_addr now assigning a device? On my OFED 1.4.1 RHEL4u6 systems, rdma_bind_addr() fails when attempting to bind to 127.0.0.1 per the email I sent Friday: http://www.spinics.net/lists/linux-rdma/msg02568.html I have not checked any other combinations; Steve was saying that he saw rdma_bind_addr() succeeding on his machines with OFED 1.5.1rcwhatever (I don't recall the OS he said he was using). -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/ -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ewg] rdma/cm: disallow loopback address for iwarp devices
On my OFED 1.4.1 RHEL4u6 systems, rdma_bind_addr() fails when attempting to bind to 127.0.0.1 per the email I sent Friday: http://www.spinics.net/lists/linux-rdma/msg02568.html This is what I see over IB on 2.6.26, with a couple extra prints added to cmatose: cst-lin1:/home/mshefty/librdmacm# examples/ucmatose -b 127.0.0.1 cmatose: starting server src addr 0x17f rdma_bind_addr: 0 so we're missing something else. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
Sean, can you try openmpi? It fails for me, and yet ucmatose succeeds. I don't understand the difference yet... Sean Hefty wrote: On my OFED 1.4.1 RHEL4u6 systems, rdma_bind_addr() fails when attempting to bind to 127.0.0.1 per the email I sent Friday: http://www.spinics.net/lists/linux-rdma/msg02568.html This is what I see over IB on 2.6.26, with a couple extra prints added to cmatose: cst-lin1:/home/mshefty/librdmacm# examples/ucmatose -b 127.0.0.1 cmatose: starting server src addr 0x17f rdma_bind_addr: 0 so we're missing something else. -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [ewg] rdma/cm: disallow loopback address for iwarp devices
Sean, can you try openmpi? It fails for me, and yet ucmatose succeeds. I don't understand the difference yet... I believe the issue is that rdma_bind_addr succeeds (returns 0), but no device is assigned to the rdma_cm_id (verbs field is NULL). This was a change from commit 6f8372b69c3198e06cecb1df2cb9682d0c55e657: The defined behavior of rdma_bind_addr is to associate an RDMA device with an rdma_cm_id, as long as the user specified a non- zero address. (ie they weren't just trying to reserve a port) Currently, if the loopback address is passed to rdma_bind_addr, no device is associated with the rdma_cm_id. Fix this. There are two places where rdma_bind_addr() is called in the openmpi source code (based on a tarball download of the trunk). One is btl_openib_iwarp.c: rc = rdma_bind_addr(cm_id, ipaddr); if (rc || !cm_id-verbs) { rc = OMPI_SUCCESS; goto out3; } The other is btl_openib_connect_rdmacm.c, but that deals with listening. I can't quickly determine if btl_openib_iwarp.c is usually used for IB or not. So, to fully keep the behavior of 2.6.32, rdma_bind_addr for 127.0.0.1 should succeed, but not assign a device. I think this was the change from commit ..c55e657 that changed the behavior: @@ -2089,7 +2096,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; - if (!cma_any_addr(addr)) { + if (cma_loopback_addr(addr)) { + ret = cma_bind_loopback(id_priv); + } else if (!cma_zero_addr(addr)) { ret = rdma_translate_ip(addr, id-route.addr.dev_addr); if (ret) goto err1; I'll see if reverting this gives the desired(?) behavior. - Sean -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
On Feb 8, 2010, at 6:48 PM, Sean Hefty wrote: rc = rdma_bind_addr(cm_id, ipaddr); if (rc || !cm_id->verbs) { rc = OMPI_SUCCESS; goto out3; } Ah, yes! Per the OMPI code you cited, I amended my printf's and see: [svbu-mpi.cisco.com:19315] FAILED to bind to 127.0.0.1: rc=0, verbs=(nil) So the rc from rdma_bind_addr was 0, but you're right that the verbs pointer was NULL, and we therefore rule that it was no good. The other is btl_openib_connect_rdmacm.c, but that deals with listening. I can't quickly determine if btl_openib_iwarp.c is usually used for IB or not. It is. So, to fully keep the behavior of 2.6.32, rdma_bind_addr for 127.0.0.1 should succeed, but not assign a device. I think this was the change from commit ..c55e657 that changed the behavior: @@ -2089,7 +2096,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND)) return -EINVAL; - if (!cma_any_addr(addr)) { + if (cma_loopback_addr(addr)) { + ret = cma_bind_loopback(id_priv); + } else if (!cma_zero_addr(addr)) { ret = rdma_translate_ip(addr, id->route.addr.dev_addr); if (ret) goto err1; I'll see if reverting this gives the desired(?) behavior. Thanks! -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/ -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
Jeff Squyres wrote: On Feb 8, 2010, at 7:30 PM, Pradeep Satyanarayana wrote: elm3b199:/usr/lib # /usr/mpi/gcc/openmpi-1.4.1/bin/mpirun -np 2 --bynode --mca btl_openib_cpc_include rdmacm ring -- mpirun was unable to launch the specified application as it could not find an executable: Executable: ring Node: elm3b199 while attempting to start process rank 0. -- elm3b199:/usr/lib # Is there an executable named ring either in your $PATH or in /usr/lib? Open MPI is telling you it can't find an executable named ring. Hi Jeff, No, there is none. I got this command from one of the mails in the thread. What should I use instead? Thanks Pradeep -- To unsubscribe from this list: send the line unsubscribe linux-rdma in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ewg] rdma/cm: disallow loopback address for iwarp devices
On Feb 8, 2010, at 7:50 PM, Pradeep Satyanarayana wrote: No, there is none. I got this command from one of the mails in the thread. What should I use instead? You need to compile and run an MPI program. ring is a typical test program that sends a message around in a ring. I think that OFED installs those test apps somewhere, but I don't recall where offhand. ring_c.c is attached. Compile it with: mpicc ring_c.c -o ring (you might need the full path to mpicc if it's not in your path?) A better mpirun command line would be: /usr/mpi/gcc/openmpi-1.4.1/bin/mpirun -np 2 --host HOSTNAME1,HOSTNAME2 \ --mca btl openib,sm,self --mca btl_openib_cpc_include rdmacm ring Put in your own HOSTNAME1 and HOSTNAME2 values. You'll also need to ensure that both Open MPI and ring are available on both nodes (preferably in the same filesystem locations on both nodes, for simplicity) and that you can ssh from one node to the other without being prompted for a password or passphrase. This will run a 2-process MPI job across the two nodes, passing a message between the two processes a few times before quitting. The various --mca parameters on this mpirun command line ensure that you are definitely using the OpenFabrics verbs support and forcing the use of RDMA CM. -- Jeff Squyres jsquy...@cisco.com For corporate legal information go to: http://www.cisco.com/web/about/doing_business/legal/cri/ ring_c.c Description: Binary data
Re: [PATCH] opensm: Add a name to IB subnet and include it in syslog messages
Hi Sasha, I have incorporated your feedback and sent you the modified patch with a new subject line [PATCH V2] opensm: Add option to specify prefix to syslog messages Regards, Benjamin Sasha Khapyorsky wrote: Hi Benjamin, On 14:16 Thu 21 Jan , Arputham Benjamin wrote: Added a text based name to an IB subnet to help user in identifying an IB subnet or understanding its function in a multi-fabric IB cluster. For example, in a dual-fabric (or dual-rail) IB cluster, one subnet could be named mpi and the other subnet could be named storage. Summary of changes: o Added the option 'subnet_name' to OpenSM command line and config file. o Enhanced OpenSM logging facility to include the subnet name in syslog messages. Looking at the usage below I can see that this is done as adding free text prefix to syslog prints. This is fine and seems like it could be useful for any purpose (not only different subnets) when syslog message mark is desired. Assuming so, wouldn't it be better to change subnet_name to something more generic, let's say log_prefix? 
Signed-off-by: Arputham Benjamin abenja...@sgi.com --- diff -rup a/include/opensm/osm_log.h b/include/opensm/osm_log.h --- a/include/opensm/osm_log.h 2010-01-18 21:32:12.195328129 -0800 +++ b/include/opensm/osm_log.h 2010-01-18 21:34:46.573932164 -0800 @@ -120,6 +120,7 @@ typedef struct osm_log { boolean_t accum_log_file; boolean_t daemon; char *log_file_name; +char *subnet_name; } osm_log_t; /*/ diff -rup a/include/opensm/osm_subnet.h b/include/opensm/osm_subnet.h --- a/include/opensm/osm_subnet.h2010-01-18 21:32:12.195328129 -0800 +++ b/include/opensm/osm_subnet.h2010-01-18 21:34:55.782087826 -0800 @@ -224,6 +224,7 @@ typedef struct osm_subn_opt { char *event_plugin_name; char *node_name_map_name; char *prefix_routes_file; +char *subnet_name; boolean_t consolidate_ipv6_snm_req; struct osm_subn_opt *file_opts; /* used for update */ uint8_t lash_start_vl; /* starting vl to use in lash */ diff -rup a/man/opensm.8.in b/man/opensm.8.in --- a/man/opensm.8.in2010-01-19 11:29:03.954832199 -0800 +++ b/man/opensm.8.in2010-01-21 10:56:58.901836423 -0800 @@ -49,6 +49,7 @@ opensm \- InfiniBand subnet manager and [\-\-perfmgr_sweep_time_s seconds] [\-\-prefix_routes_file path] [\-\-consolidate_ipv6_snm_req] +[\-\-subnet_name subnet name] [\-v(erbose)] [\-V] [\-D flags] [\-d(ebug) number] [\-h(elp)] [\-?] @@ -345,6 +346,9 @@ effect if --enable-perfmgr was specified Use shared MLID for IPv6 Solicited Node Multicast groups per MGID scope and P_Key. .TP +\fB\-\-subnet_name\fR subnet name +This option specifies the text based name of the subnet. +.TP \fB\-v\fR, \fB\-\-verbose\fR This option increases the log verbosity level. 
The -v option may be specified multiple times diff -rup a/opensm/main.c b/opensm/main.c --- a/opensm/main.c 2010-01-18 21:31:43.318842260 -0800 +++ b/opensm/main.c 2010-01-19 11:51:45.566967909 -0800 @@ -324,6 +324,8 @@ static void show_usage(void) printf(--consolidate_ipv6_snm_req\n Use shared MLID for IPv6 Solicited Node Multicast groups\n per MGID scope and P_Key.\n\n); +printf(--subnet_name subnet name\n + Text based name of the IB subnet.\n\n); printf(--verbose, -v\n This option increases the log verbosity level.\n The -v option may be specified multiple times\n @@ -607,6 +609,7 @@ int main(int argc, char *argv[]) {lash_start_vl, 1, NULL, 6}, {sm_sl, 1, NULL, 7}, {retries, 1, NULL, 8}, +{subnet_name, 1, NULL, 9}, {NULL, 0, NULL, 0} /* Required at the end of the array */ }; @@ -985,6 +988,11 @@ int main(int argc, char *argv[]) printf( Transaction retries = %u\n, opt.transaction_retries); break; +case 9: +SET_STR_OPT(opt.subnet_name, optarg); +printf(IB subnet name = %s\n, + opt.subnet_name); +break; case 'h': case '?': case ':': diff -rup a/opensm/osm_log.c b/opensm/osm_log.c --- a/opensm/osm_log.c 2010-01-18 21:31:43.318842260 -0800 +++ b/opensm/osm_log.c 2010-01-18 21:33:47.808939648 -0800 @@ -107,6 +107,7 @@ void osm_log(IN osm_log_t * p_log, IN os char buffer[LOG_ENTRY_SIZE_MAX]; va_list args; int ret; +uint8_t ind = 0; #ifdef __WIN__ SYSTEMTIME st; uint32_t pid = GetCurrentThreadId(); @@ -123,7 +124,14 @@ void osm_log(IN osm_log_t * p_log, IN os return; va_start(args, p_str); -vsprintf(buffer, p_str, args); +if(p_log-subnet_name) { +