Re: [ewg] bug 1918 - openmpi broken due to rdma-cm changes

2010-02-08 Thread Tziporet Koren

On 2/7/2010 6:39 PM, Steve Wise wrote:


If ofed-1.5.1 is based on 2.6.33 then it will get this patch
automatically (assuming it goes upstream and makes 2.6.33).  Or we can
pull it in as a kernel_patches/fixes/ patch.
   
OFED 1.5.1 is not based on 2.6.33, but on 2.6.30, so we need the patch 
under fixes.

Steve - can you prepare such a patch?

Tziporet



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Tziporet Koren

On 2/8/2010 8:02 AM, Sean Hefty wrote:

Since iWarp devices are not guaranteed to support loopback connections,
prevent rdma_bind_addr from associating the loopback address with
an iWarp device.

Signed-off-by: Sean Hefty sean.he...@intel.com
   


Steve
Have you tested this patch?
Once it is accepted into the kernel, can you prepare a patch for OFED 1.5.1 under fixes?

Thanks
Tziporet
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 02/9] ib/iser: new recv buffer posting logic

2010-02-08 Thread Or Gerlitz
Currently, the recv buffer posting logic is based on the transactional
nature of iser which allows for posting a buffer before sending a PDU.
Change this to post only when the number of outstanding recv buffers is
below a water mark and in a batched manner, thus simplifying and
optimizing the data path. Use a pre-allocated ring of recv buffers
instead of allocating from kmem cache. A special treatment is given
to the login response buffer whose size must be 8K unlike the size
of buffers used for any other purpose which is 128 bytes.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com

---
 drivers/infiniband/ulp/iser/iscsi_iser.c |2
 drivers/infiniband/ulp/iser/iscsi_iser.h |   40 +++-
 drivers/infiniband/ulp/iser/iser_initiator.c |  235 +--
 drivers/infiniband/ulp/iser/iser_verbs.c |  134 +--
 4 files changed, 235 insertions(+), 176 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -102,9 +102,9 @@
 #define ISER_MAX_TX_MISC_PDUS  6 /* NOOP_OUT(2), TEXT(1), *
   * SCSI_TMFUNC(2), LOGOUT(1) */

-#define ISER_QP_MAX_RECV_DTOS  (ISCSI_DEF_XMIT_CMDS_MAX + \
-   ISER_MAX_RX_MISC_PDUS+  \
-   ISER_MAX_TX_MISC_PDUS)
+#define ISER_QP_MAX_RECV_DTOS  (ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISER_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX  2)

 /* the max TX (send) WR supported by the iSER QP is defined by 
*
  * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   
*
@@ -132,6 +132,12 @@ struct iser_hdr {
__be64  read_va;
 } __attribute__((packed));

+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN 128
+#define ISER_RX_PAYLOAD_SIZE   (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)

 /* Length of an object name string */
 #define ISER_OBJECT_NAME_SIZE  64
@@ -212,7 +218,6 @@ struct iser_dto {
 };

 enum iser_desc_type {
-   ISCSI_RX,
ISCSI_TX_CONTROL ,
ISCSI_TX_SCSI_COMMAND,
ISCSI_TX_DATAOUT
@@ -228,6 +233,17 @@ struct iser_desc {
struct iser_dto  dto;
 };

+#define ISER_RX_PAD_SIZE   (256 - (ISER_RX_PAYLOAD_SIZE + \
+   sizeof(u64) + sizeof(struct ib_sge)))
+struct iser_rx_desc {
+   struct iser_hdr  iser_header;
+   struct iscsi_hdr iscsi_header;
+   char data[ISER_RECV_DATA_SEG_LEN];
+   u64  dma_addr;
+   struct ib_sgerx_sg;
+   char pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
 struct iser_device {
struct ib_device *ib_device;
struct ib_pd *pd;
@@ -256,6 +272,12 @@ struct iser_conn {
struct iser_page_vec *page_vec; /* represents SG to fmr 
maps*
 * maps serialized as tx 
is*/
struct list_head conn_list;   /* entry in ig conn list 
*/
+
+   char *login_buf;
+   u64  login_dma;
+   unsigned int rx_desc_head;
+   struct iser_rx_desc  *rx_descs;
+   struct ib_recv_wrrx_wr[ISER_MIN_POSTED_RX];
 };

 struct iscsi_iser_conn {
@@ -319,8 +341,9 @@ void iser_conn_put(struct iser_conn *ib_

 void iser_conn_terminate(struct iser_conn *ib_conn);

-void iser_rcv_completion(struct iser_desc *desc,
-unsigned longdto_xfer_len);
+void iser_rcv_completion(struct iser_rx_desc *desc,
+unsigned longdto_xfer_len,
+   struct iser_conn *ib_conn);

 void iser_snd_completion(struct iser_desc *desc);

@@ -332,6 +355,8 @@ void iser_dto_buffs_release(struct iser_

 int  iser_regd_buff_release(struct iser_regd_buf *regd_buf);

+void iser_free_rx_descriptors(struct iser_conn *ib_conn);
+
 void iser_reg_single(struct iser_device  *device,
 struct iser_regd_buf*regd_buf,
 enum dma_data_direction direction);
@@ -353,7 +378,8 @@ int  iser_reg_page_vec(struct iser_conn

 void iser_unreg_mem(struct iser_mem_reg *mem_reg);

-int  iser_post_recv(struct iser_desc *rx_desc);
+int  iser_post_recvl(struct iser_conn *ib_conn);
+int  iser_post_recvm(struct iser_conn *ib_conn, int count);
 int  iser_post_send(struct iser_desc *tx_desc);

 int iser_conn_state_comp(struct iser_conn *ib_conn,
Index: 

[PATCH V2 03/9] ib/iser: remove atomic counter for posted recv buffers

2010-02-08 Thread Or Gerlitz
With both the posting and reaping of recv buffers being in the
completion path, their outstanding number counter need not be atomic.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com

---
 drivers/infiniband/ulp/iser/iscsi_iser.h |2 +-
 drivers/infiniband/ulp/iser/iser_initiator.c |6 +++---
 drivers/infiniband/ulp/iser/iser_verbs.c |   16 
 3 files changed, 12 insertions(+), 12 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -266,7 +266,7 @@ struct iser_conn {
struct ib_fmr_pool   *fmr_pool; /* pool of IB FMRs 
*/
int  disc_evt_flag; /* disconn event delivered 
*/
wait_queue_head_twait;  /* waitq for conn/disconn  
*/
-   atomic_t post_recv_buf_count; /* posted rx count   
*/
+   int  post_recv_buf_count; /* posted rx count  */
atomic_t post_send_buf_count; /* posted tx count   
*/
char name[ISER_OBJECT_NAME_SIZE];
struct iser_page_vec *page_vec; /* represents SG to fmr 
maps*
Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -268,7 +268,7 @@ int iser_conn_set_full_featured_mode(str

/* Check that there is no posted recv or send buffers left - */
/* they must be consumed during the login phase */
-   BUG_ON(atomic_read(iser_conn-ib_conn-post_recv_buf_count) != 0);
+   BUG_ON(iser_conn-ib_conn-post_recv_buf_count != 0);
BUG_ON(atomic_read(iser_conn-ib_conn-post_send_buf_count) != 0);

if (iser_alloc_rx_descriptors(iser_conn-ib_conn))
@@ -569,12 +569,12 @@ void iser_rcv_completion(struct iser_rx_
 * task eliminates the need to worry on tasks which are completed in   *
 * parallel to the execution of iser_conn_term. So the code that waits *
 * for the posted rx bufs refcount to become zero handles everything   
*/
-   atomic_dec(conn-ib_conn-post_recv_buf_count);
+   conn-ib_conn-post_recv_buf_count--;

if (rx_dma == ib_conn-login_dma)
return;

-   outstanding = atomic_read(ib_conn-post_recv_buf_count);
+   outstanding = ib_conn-post_recv_buf_count;
if (outstanding + ISER_MIN_POSTED_RX = ISER_QP_MAX_RECV_DTOS) {
count = min(ISER_QP_MAX_RECV_DTOS - outstanding,
ISER_MIN_POSTED_RX);
Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_verbs.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -453,7 +453,7 @@ static void iser_disconnected_handler(st
   ISCSI_ERR_CONN_FAILED);

/* Complete the termination process if no posts are pending */
-   if ((atomic_read(ib_conn-post_recv_buf_count) == 0) 
+   if (ib_conn-post_recv_buf_count == 0 
(atomic_read(ib_conn-post_send_buf_count) == 0)) {
ib_conn-state = ISER_CONN_DOWN;
wake_up_interruptible(ib_conn-wait);
@@ -500,7 +500,7 @@ void iser_conn_init(struct iser_conn *ib
 {
ib_conn-state = ISER_CONN_INIT;
init_waitqueue_head(ib_conn-wait);
-   atomic_set(ib_conn-post_recv_buf_count, 0);
+   ib_conn-post_recv_buf_count = 0;
atomic_set(ib_conn-post_send_buf_count, 0);
atomic_set(ib_conn-refcount, 1);
INIT_LIST_HEAD(ib_conn-conn_list);
@@ -651,11 +651,11 @@ int iser_post_recvl(struct iser_conn *ib
rx_wr.num_sge = 1;
rx_wr.next= NULL;

-   atomic_inc(ib_conn-post_recv_buf_count);
+   ib_conn-post_recv_buf_count++;
ib_ret  = ib_post_recv(ib_conn-qp, rx_wr, rx_wr_failed);
if (ib_ret) {
iser_err(ib_post_recv failed ret=%d\n, ib_ret);
-   atomic_dec(ib_conn-post_recv_buf_count);
+   ib_conn-post_recv_buf_count--;
}
return ib_ret;
 }
@@ -679,11 +679,11 @@ int iser_post_recvm(struct iser_conn *ib
rx_wr--;
rx_wr-next = NULL; /* mark end of work requests list */

-   atomic_add(count, ib_conn-post_recv_buf_count);
+   ib_conn-post_recv_buf_count += count;
ib_ret  = ib_post_recv(ib_conn-qp, ib_conn-rx_wr, rx_wr_failed);
if (ib_ret) {
iser_err(ib_post_recv failed ret=%d\n, ib_ret);
-   atomic_sub(count, ib_conn-post_recv_buf_count);
+   

[PATCH V2 04/9] ib/iser: use different CQ for send completions

2010-02-08 Thread Or Gerlitz
Use a different CQ for send completions, where send completions are
being polled by the interrupt driven recv completion handler.
As such, interrupts aren't used for the send CQ.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com

---
 drivers/infiniband/ulp/iser/iscsi_iser.h |3
 drivers/infiniband/ulp/iser/iser_verbs.c |  110 ---
 2 files changed, 76 insertions(+), 37 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -247,7 +247,8 @@ struct iser_rx_desc {
 struct iser_device {
struct ib_device *ib_device;
struct ib_pd *pd;
-   struct ib_cq *cq;
+   struct ib_cq *rx_cq;
+   struct ib_cq *tx_cq;
struct ib_mr *mr;
struct tasklet_structcq_tasklet;
struct list_head ig_list; /* entry in ig devices list */
Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_verbs.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -37,9 +37,8 @@
 #include iscsi_iser.h

 #define ISCSI_ISER_MAX_CONN8
-#define ISER_MAX_CQ_LEN((ISER_QP_MAX_RECV_DTOS + \
-   ISER_QP_MAX_REQ_DTOS) *   \
-ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)

 static void iser_cq_tasklet_fn(unsigned long data);
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
@@ -67,15 +66,23 @@ static int iser_create_device_ib_res(str
if (IS_ERR(device-pd))
goto pd_err;

-   device-cq = ib_create_cq(device-ib_device,
+   device-rx_cq = ib_create_cq(device-ib_device,
  iser_cq_callback,
  iser_cq_event_callback,
  (void *)device,
- ISER_MAX_CQ_LEN, 0);
-   if (IS_ERR(device-cq))
-   goto cq_err;
+ ISER_MAX_RX_CQ_LEN, 0);
+   if (IS_ERR(device-rx_cq))
+   goto rx_cq_err;

-   if (ib_req_notify_cq(device-cq, IB_CQ_NEXT_COMP))
+   device-tx_cq = ib_create_cq(device-ib_device,
+ NULL, iser_cq_event_callback,
+ (void *)device,
+ ISER_MAX_TX_CQ_LEN, 0);
+
+   if (IS_ERR(device-tx_cq))
+   goto tx_cq_err;
+
+   if (ib_req_notify_cq(device-rx_cq, IB_CQ_NEXT_COMP))
goto cq_arm_err;

tasklet_init(device-cq_tasklet,
@@ -93,8 +100,10 @@ static int iser_create_device_ib_res(str
 dma_mr_err:
tasklet_kill(device-cq_tasklet);
 cq_arm_err:
-   ib_destroy_cq(device-cq);
-cq_err:
+   ib_destroy_cq(device-tx_cq);
+tx_cq_err:
+   ib_destroy_cq(device-rx_cq);
+rx_cq_err:
ib_dealloc_pd(device-pd);
 pd_err:
iser_err(failed to allocate an IB resource\n);
@@ -112,11 +121,13 @@ static void iser_free_device_ib_res(stru
tasklet_kill(device-cq_tasklet);

(void)ib_dereg_mr(device-mr);
-   (void)ib_destroy_cq(device-cq);
+   (void)ib_destroy_cq(device-tx_cq);
+   (void)ib_destroy_cq(device-rx_cq);
(void)ib_dealloc_pd(device-pd);

device-mr = NULL;
-   device-cq = NULL;
+   device-tx_cq = NULL;
+   device-rx_cq = NULL;
device-pd = NULL;
 }

@@ -179,8 +190,8 @@ static int iser_create_ib_conn_res(struc

init_attr.event_handler = iser_qp_event_callback;
init_attr.qp_context= (void *)ib_conn;
-   init_attr.send_cq   = device-cq;
-   init_attr.recv_cq   = device-cq;
+   init_attr.send_cq   = device-tx_cq;
+   init_attr.recv_cq   = device-rx_cq;
init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
init_attr.cap.max_send_sge = MAX_REGD_BUF_VECTOR_LEN;
@@ -772,18 +783,8 @@ int iser_post_send(struct iser_desc *tx_
 static void iser_handle_comp_error(struct iser_desc *desc,
struct iser_conn *ib_conn)
 {
-   struct iser_rx_desc *rx   = (struct iser_rx_desc *)desc;
-   struct iser_rx_desc *rx_first = ib_conn-rx_descs;
-   struct iser_rx_desc *rx_last  = rx_first + (ISER_QP_MAX_RECV_DTOS - 1);
-
-   if ((char *)desc == ib_conn-login_buf ||
-   (rx_first = rx  rx = rx_last))
-   ib_conn-post_recv_buf_count--;
-else { /* type is TX control/command/dataout 

[PATCH V2 05/9] ib/iser: simplify send flow/descriptors

2010-02-08 Thread Or Gerlitz
Simplify and shrink the logic/code used for the send descriptors.
Changes include removal of struct iser_dto which is unnecessary
abstraction, use struct iser_regd_buf only for handling SCSI commands,
use dma_sync instead of dma_map/unmap, etc.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com
---

changes from V1:

- remove white spaces that slipped in

 drivers/infiniband/ulp/iser/iscsi_iser.c |   34 -
 drivers/infiniband/ulp/iser/iscsi_iser.h |   48 +--
 drivers/infiniband/ulp/iser/iser_initiator.c |  178 +--
 drivers/infiniband/ulp/iser/iser_memory.c|   60 -
 drivers/infiniband/ulp/iser/iser_verbs.c |   75 +--
 5 files changed, 116 insertions(+), 279 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -39,26 +39,6 @@

 #include iscsi_iser.h

-
-/* iser_dto_add_regd_buff - increments the reference count for *
- * the registered buffer  adds it to the DTO object   */
-static void iser_dto_add_regd_buff(struct iser_dto *dto,
-  struct iser_regd_buf *regd_buf,
-  unsigned long use_offset,
-  unsigned long use_size)
-{
-   int add_idx;
-
-   atomic_inc(regd_buf-ref_count);
-
-   add_idx = dto-regd_vector_len;
-   dto-regd[add_idx] = regd_buf;
-   dto-used_sz[add_idx] = use_size;
-   dto-offset[add_idx] = use_offset;
-
-   dto-regd_vector_len++;
-}
-
 /* Register user buffer memory and initialize passive rdma
  *  dto descriptor. Total data size is stored in
  *  iser_task-data[ISER_DIR_IN].data_len
@@ -119,9 +99,9 @@ iser_prepare_write_cmd(struct iscsi_task
struct iscsi_iser_task *iser_task = task-dd_data;
struct iser_regd_buf *regd_buf;
int err;
-   struct iser_dto *send_dto = iser_task-desc.dto;
struct iser_hdr *hdr = iser_task-desc.iser_header;
struct iser_data_buf *buf_out = iser_task-data[ISER_DIR_OUT];
+   struct ib_sge *tx_dsg = iser_task-desc.tx_sg[1];

err = iser_dma_map_task_data(iser_task,
 buf_out,
@@ -160,37 +140,36 @@ iser_prepare_write_cmd(struct iscsi_task
if (imm_sz  0) {
iser_dbg(Cmd itt:%d, WRITE, adding imm.data sz: %d\n,
 task-itt, imm_sz);
-   iser_dto_add_regd_buff(send_dto,
-  regd_buf,
-  0,
-  imm_sz);
+   tx_dsg-addr   = regd_buf-reg.va;
+   tx_dsg-length = imm_sz;
+   tx_dsg-lkey   = regd_buf-reg.lkey;
+   iser_task-desc.num_sge = 2;
}

return 0;
 }

 /* creates a new tx descriptor and adds header regd buffer */
-static void iser_create_send_desc(struct iscsi_iser_conn *iser_conn,
- struct iser_desc   *tx_desc)
+static void iser_create_send_desc(struct iser_conn *ib_conn,
+ struct iser_tx_desc   *tx_desc)
 {
-   struct iser_regd_buf *regd_hdr = tx_desc-hdr_regd_buf;
-   struct iser_dto  *send_dto = tx_desc-dto;
+   struct iser_device *device = ib_conn-device;

-   memset(regd_hdr, 0, sizeof(struct iser_regd_buf));
-   regd_hdr-device  = iser_conn-ib_conn-device;
-   regd_hdr-virt_addr  = tx_desc; /* == tx_desc-iser_header */
-   regd_hdr-data_size  = ISER_HEADERS_LEN;
-
-   send_dto-ib_conn = iser_conn-ib_conn;
-   send_dto-notify_enable   = 1;
-   send_dto-regd_vector_len = 0;
+   ib_dma_sync_single_for_cpu(device-ib_device,
+   tx_desc-dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);

memset(tx_desc-iser_header, 0, sizeof(struct iser_hdr));
tx_desc-iser_header.flags = ISER_VER;

-   iser_dto_add_regd_buff(send_dto, regd_hdr, 0, 0);
+   tx_desc-num_sge = 1;
+
+   if (tx_desc-tx_sg[0].lkey != device-mr-lkey) {
+   tx_desc-tx_sg[0].lkey = device-mr-lkey;
+   iser_dbg(sdesc %p lkey mismatch, fixing\n, tx_desc);
+   }
 }

+
 int iser_alloc_rx_descriptors(struct iser_conn *ib_conn)
 {
int i, j;
@@ -303,12 +282,12 @@ int iser_send_command(struct iscsi_conn
 {
struct iscsi_iser_conn *iser_conn = conn-dd_data;
struct iscsi_iser_task *iser_task = task-dd_data;
-   struct iser_dto *send_dto = NULL;
unsigned long edtl;
int err;
struct iser_data_buf *data_buf;
struct iscsi_cmd *hdr =  (struct iscsi_cmd *)task-hdr;
struct scsi_cmnd *sc  =  task-sc;
+   struct iser_tx_desc *tx_desc = iser_task-desc;

if (!iser_conn_state_comp(iser_conn-ib_conn, ISER_CONN_UP)) {

[PATCH V2 06/9] ib/iser: use atomic allocations

2010-02-08 Thread Or Gerlitz
Two minor flows in iser's data path still use allocations;
move them to be atomic as a preparation step towards moving
to use libiscsi passthrough mode.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com

---
 drivers/infiniband/ulp/iser/iser_initiator.c |2 +-
 drivers/infiniband/ulp/iser/iser_memory.c|4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -373,7 +373,7 @@ int iser_send_data_out(struct iscsi_conn
iser_dbg(%s itt %d dseg_len %d offset %d\n,
 __func__,(int)itt,(int)data_seg_len,(int)buf_offset);

-   tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_NOIO);
+   tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
if (tx_desc == NULL) {
iser_err(Failed to alloc desc for post dataout\n);
return -ENOMEM;
Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_memory.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_memory.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_memory.c
@@ -53,10 +53,10 @@ static int iser_start_rdma_unaligned_sg(
unsigned long  cmd_data_len = data-data_len;

if (cmd_data_len  ISER_KMALLOC_THRESHOLD)
-   mem = (void *)__get_free_pages(GFP_NOIO,
+   mem = (void *)__get_free_pages(GFP_ATOMIC,
  ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT);
else
-   mem = kmalloc(cmd_data_len, GFP_NOIO);
+   mem = kmalloc(cmd_data_len, GFP_ATOMIC);

if (mem == NULL) {
iser_err(Failed to allocate mem size %d %d for copying 
sglist\n,
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2 09/9] remove redundant locking from iser scsi command response flow

2010-02-08 Thread Or Gerlitz
Currently, the iser recv completion flow takes the session lock twice.
Optimize it to avoid the first one by letting iser_task_rdma_finalize()
be called only from the cleanup_task callback invoked by iscsi_free_task,
thus reducing the contention on the session lock between the scsi
command submission and the scsi command completion flows.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com
Reviewed-by: Mike Christie micha...@cs.wisc.edu

---

 drivers/infiniband/ulp/iser/iser_initiator.c |   25 -
 1 file changed, 25 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -440,10 +440,7 @@ void iser_rcv_completion(struct iser_rx_
 struct iser_conn *ib_conn)
 {
struct iscsi_iser_conn *conn = ib_conn-iser_conn;
-   struct iscsi_task *task;
-   struct iscsi_iser_task *iser_task;
struct iscsi_hdr *hdr;
-   unsigned char opcode;
u64 rx_dma;
int rx_buflen, outstanding, count, err;

@@ -464,28 +461,6 @@ void iser_rcv_completion(struct iser_rx_
iser_dbg(op 0x%x itt 0x%x dlen %d\n, hdr-opcode,
hdr-itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));

-   opcode = hdr-opcode  ISCSI_OPCODE_MASK;
-
-   if (opcode == ISCSI_OP_SCSI_CMD_RSP) {
-   spin_lock(conn-iscsi_conn-session-lock);
-   task = iscsi_itt_to_ctask(conn-iscsi_conn, hdr-itt);
-   if (task)
-   __iscsi_get_task(task);
-   spin_unlock(conn-iscsi_conn-session-lock);
-
-   if (!task)
-   iser_err(itt can't be matched to task!!! 
-conn %p opcode %d itt %d\n,
-conn-iscsi_conn, opcode, hdr-itt);
-   else {
-   iser_task = task-dd_data;
-   iser_dbg(itt %d task %p\n,hdr-itt, task);
-   iser_task-status = ISER_TASK_STATUS_COMPLETED;
-   iser_task_rdma_finalize(iser_task);
-   iscsi_put_task(task);
-   }
-   }
-
iscsi_iser_recv(conn-iscsi_conn, hdr,
rx_desc-data, rx_xfer_len - ISER_HEADERS_LEN);

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RESEND V2 09/9] ib/iser: remove redundant locking from iser scsi command response flow

2010-02-08 Thread Or Gerlitz
Currently, the iser recv completion flow takes the session lock twice.
Optimize it to avoid the first one by letting iser_task_rdma_finalize()
be called only from the cleanup_task callback invoked by iscsi_free_task,
thus reducing the contention on the session lock between the scsi
command submission and the scsi command completion flows.

Signed-off-by: Or Gerlitz ogerl...@voltaire.com
Reviewed-by: Mike Christie micha...@cs.wisc.edu

---

resending with a fixed subject line which contains the ib/iser: prefix

 drivers/infiniband/ulp/iser/iser_initiator.c |   25 -
 1 file changed, 25 deletions(-)

Index: linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
===
--- linux-2.6.33-rc7.orig/drivers/infiniband/ulp/iser/iser_initiator.c
+++ linux-2.6.33-rc7/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -440,10 +440,7 @@ void iser_rcv_completion(struct iser_rx_
 struct iser_conn *ib_conn)
 {
struct iscsi_iser_conn *conn = ib_conn-iser_conn;
-   struct iscsi_task *task;
-   struct iscsi_iser_task *iser_task;
struct iscsi_hdr *hdr;
-   unsigned char opcode;
u64 rx_dma;
int rx_buflen, outstanding, count, err;

@@ -464,28 +461,6 @@ void iser_rcv_completion(struct iser_rx_
iser_dbg(op 0x%x itt 0x%x dlen %d\n, hdr-opcode,
hdr-itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));

-   opcode = hdr-opcode  ISCSI_OPCODE_MASK;
-
-   if (opcode == ISCSI_OP_SCSI_CMD_RSP) {
-   spin_lock(conn-iscsi_conn-session-lock);
-   task = iscsi_itt_to_ctask(conn-iscsi_conn, hdr-itt);
-   if (task)
-   __iscsi_get_task(task);
-   spin_unlock(conn-iscsi_conn-session-lock);
-
-   if (!task)
-   iser_err(itt can't be matched to task!!! 
-conn %p opcode %d itt %d\n,
-conn-iscsi_conn, opcode, hdr-itt);
-   else {
-   iser_task = task-dd_data;
-   iser_dbg(itt %d task %p\n,hdr-itt, task);
-   iser_task-status = ISER_TASK_STATUS_COMPLETED;
-   iser_task_rdma_finalize(iser_task);
-   iscsi_put_task(task);
-   }
-   }
-
iscsi_iser_recv(conn-iscsi_conn, hdr,
rx_desc-data, rx_xfer_len - ISER_HEADERS_LEN);

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps

2010-02-08 Thread Hal Rosenstock
On Sun, Feb 7, 2010 at 4:47 AM, Eli Dorfman (Voltaire)
dorfman@gmail.com wrote:
 Hal Rosenstock wrote:
 On Fri, Feb 5, 2010 at 9:18 AM, Eli Dorfman dorfman@gmail.com wrote:
 On Thu, Feb 4, 2010 at 10:52 PM, Hal Rosenstock
 hal.rosenst...@gmail.com wrote:
 On Thu, Feb 4, 2010 at 12:43 PM, Eli Dorfman (Voltaire)
 dorfman@gmail.com wrote:
 Subject: [PATCH] Wrong handling of MC create and delete traps

 For these traps the GID in the data details is the MGID and
 not the source port gid.
 So the SM should check that subscriber port has the pkey of the MC group.
 There was also an error in comparing the subnet prefix and guid due to
 host/network order mismatch.

 Signed-off-by: Eli Dorfman e...@voltaire.com
 ---
  opensm/opensm/osm_inform.c |  151 
 ---
  1 files changed, 98 insertions(+), 53 deletions(-)

 diff --git a/opensm/opensm/osm_inform.c b/opensm/opensm/osm_inform.c
 index 8108213..ae4fe71 100644
 --- a/opensm/opensm/osm_inform.c
 +++ b/opensm/opensm/osm_inform.c
 @@ -341,6 +341,103 @@ Exit:
        return status;
  }

 +static int is_access_permitted( osm_infr_t *p_infr_rec,
 +                               osm_infr_match_ctxt_t *p_infr_match )
 +{
 +       cl_list_t *p_infr_to_remove_list = 
 p_infr_match-p_remove_infr_list;
 +       ib_inform_info_t *p_ii = (p_infr_rec-inform_record.inform_info);
 +       ib_mad_notice_attr_t *p_ntc = p_infr_match-p_ntc;
 +       uint16_t trap_num = cl_ntoh16(p_ntc-g_or_v.generic.trap_num);
 +       osm_subn_t *p_subn = p_infr_rec-sa-p_subn;
 +       osm_log_t *p_log = p_infr_rec-sa-p_log;
 +       char gid_str[INET6_ADDRSTRLEN];
 +       osm_mgrp_t *p_mgrp;
 +       ib_gid_t source_gid;
 +       osm_port_t *p_src_port;
 +       osm_port_t *p_dest_port;
 +
 +       /* In case of GID_IN(64) or GID_OUT(65) traps the source gid
 +          comparison should be done on the trap source (saved as the gid 
 in the
 +          data details field).
 +          For traps MC_CREATE(66) or MC_DELETE(67) the data details gid 
 is
 +          the MGID. We need to check whether subscriber has the pky of

                   typo  

                           pkey


 +          the MC group.
 Shouldn't this be the subscriber has a compatible pkey with MC group ?
 The MC group has a full member PKey and the members can be full or
 limited.
 I accept the correction.

 Doesn't this require a code change for handling trap cases 66-67 ?

 I think that you referred to the comment since the code is handling this 
 properly (in my opinion).

I was referring to both the comment and the code since a port with a
compatible limited pkey should be able to receive the reports for MC
groups.



 Sasha, can you please change this in the commit (only if there are not
 other remarks).

 Is that what you are asking Sasha to do (beyond the typos) ?

 I asked Sasha to fix only the typo in commit.


 BTW, there is no explicit reference in the IB spec for MC group
 create/delete trap (at least I didn't find it).

 Not sure what you mean by this. What didn't you find ?

 in the spec see o13-17.1.2

Yes, there appear to be some holes in the spec in terms of this and
maybe more in this area (event forwarding) but the intent seems clear.

-- Hal

 Thanks,
 Eli

 -- Hal

 +          In all other cases the issuer gis is the trap source.
                                               typo  ^^^
                                                       gid

 and this typo of course.

 Thanks,
 Eli
 -- Hal

 +       */
 +       if (trap_num = 64  trap_num = 67 )
 +               /* The issuer of these traps is the SM so source_gid
 +                  is the gid saved on the data details */
 +               source_gid = p_ntc-data_details.ntc_64_67.gid;
 +       else
 +               source_gid = p_ntc-issuer_gid;
 +
 +       p_dest_port =
 +           cl_ptr_vector_get(p_subn-port_lid_tbl,
 +                             
 cl_ntoh16(p_infr_rec-report_addr.dest_lid));
 +       if (!p_dest_port) {
 +               OSM_LOG(p_log, OSM_LOG_INFO,
 +                       Cannot find destination port with LID:%u\n,
 +                       cl_ntoh16(p_infr_rec-report_addr.dest_lid));
 +               goto Exit;
 +       }
 +
 +       switch (trap_num) {
 +               case 66:
 +               case 67:
 +                       p_mgrp = osm_get_mgrp_by_mgid(p_subn, 
 source_gid);
 +                       if (!p_mgrp) {
 +                               OSM_LOG(p_log, OSM_LOG_INFO,
 +                                       Cannot find MGID %s\n,
 +                                       inet_ntop(AF_INET6, 
 source_gid.raw, gid_str, sizeof gid_str));
 +                               goto Exit;
 +                       }
 +
 +                       if (!osm_physp_has_pkey(p_log,
 +                                               p_mgrp-mcmember_rec.pkey,
 +                                               p_dest_port-p_physp)) {
 +                               OSM_LOG(p_log, 

[PATCH] Add new device IDs for ConnectX VPI HCAs

2010-02-08 Thread Vladimir Sokolovsky
Signed-off-by: Vladimir Sokolovsky v...@mellanox.co.il
---
 src/mlx4.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/src/mlx4.c b/src/mlx4.c
index 1295c53..973df68 100644
--- a/src/mlx4.c
+++ b/src/mlx4.c
@@ -66,6 +66,8 @@ struct {
HCA(MELLANOX, 0x6354),  /* MT25408 Hermon QDR */
HCA(MELLANOX, 0x6732),  /* MT25408 Hermon DDR PCIe gen2 */
HCA(MELLANOX, 0x673c),  /* MT25408 Hermon QDR PCIe gen2 */
+   HCA(MELLANOX, 0x6746),  /* MT26438 ConnectX VPI PCIe 2.0 5GT/s - IB QDR 
/ 10GigE Virt+ */
+   HCA(MELLANOX, 0x6778),  /* MT26488 ConnectX VPI PCIe 2.0 5GT/s - IB DDR 
/ 10GigE Virt+ */
 };
 
 static struct ibv_context_ops mlx4_ctx_ops = {
-- 
1.5.4.3

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] bug 1918 - openmpi broken due to rdma-cm changes

2010-02-08 Thread Steve Wise

Tziporet Koren wrote:

On 2/7/2010 6:39 PM, Steve Wise wrote:


If ofed-1.5.1 is based on 2.6.33 then it will get this patch
automatically (assuming it goes upstream and makes 2.6.33).  Or we can
pull it in as a kernel_patches/fixes/ patch.
   
OFED 1.5.1 is not based on 2.6.33, but on 2.6.30, so we need the patch 
under fixes.

Steve - can you prepare such a patch?

Tziporet




The reason I thought it was based on 2.6.33, is because I see 2.6.33 git 
tags in the ofed kernel tree.  I misinterpreted what that meant.


I can develop a patch, but it will disable _all_ 127.0.0.1 binds.  
Otherwise openmpi is still broken on IB.


Steve.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Steve Wise
This patch doesn't solve the openmpi/IB regression.   So for OFED, IMO, 
we need a different patch...



Tziporet Koren wrote:

On 2/8/2010 8:02 AM, Sean Hefty wrote:

Since iWarp devices are not guaranteed to support loopback connections,
prevent rdma_bind_addr from associating the loopback address with
an iWarp device.

Signed-off-by: Sean Heftysean.he...@intel.com
   


Steve
Have you tested this patch?
When accepted to kernel can you prepare a patch for OFED 1.5.1 under 
fixes


Thanks
Tziporet


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps

2010-02-08 Thread Eli Dorfman (Voltaire)
Hal Rosenstock wrote:
 On Sun, Feb 7, 2010 at 4:47 AM, Eli Dorfman (Voltaire)
 dorfman@gmail.com wrote:
 Hal Rosenstock wrote:
 On Fri, Feb 5, 2010 at 9:18 AM, Eli Dorfman dorfman@gmail.com wrote:
 On Thu, Feb 4, 2010 at 10:52 PM, Hal Rosenstock
 hal.rosenst...@gmail.com wrote:
 On Thu, Feb 4, 2010 at 12:43 PM, Eli Dorfman (Voltaire)
 dorfman@gmail.com wrote:
 Subject: [PATCH] Wrong handling of MC create and delete traps

 For these traps the GID in the data details is the MGID and
 not the source port gid.
 So the SM should check that subscriber port has the pkey of the MC group.
 There was also an error in comparing the subnet prefix and guid due to
 host/network order mismatch.

 Signed-off-by: Eli Dorfman e...@voltaire.com
 ---
  opensm/opensm/osm_inform.c |  151 
 ---
  1 files changed, 98 insertions(+), 53 deletions(-)

 diff --git a/opensm/opensm/osm_inform.c b/opensm/opensm/osm_inform.c
 index 8108213..ae4fe71 100644
 --- a/opensm/opensm/osm_inform.c
 +++ b/opensm/opensm/osm_inform.c
 @@ -341,6 +341,103 @@ Exit:
return status;
  }

 +static int is_access_permitted( osm_infr_t *p_infr_rec,
 +   osm_infr_match_ctxt_t *p_infr_match )
 +{
 +   cl_list_t *p_infr_to_remove_list = 
 p_infr_match-p_remove_infr_list;
 +   ib_inform_info_t *p_ii = 
 (p_infr_rec-inform_record.inform_info);
 +   ib_mad_notice_attr_t *p_ntc = p_infr_match-p_ntc;
 +   uint16_t trap_num = cl_ntoh16(p_ntc-g_or_v.generic.trap_num);
 +   osm_subn_t *p_subn = p_infr_rec-sa-p_subn;
 +   osm_log_t *p_log = p_infr_rec-sa-p_log;
 +   char gid_str[INET6_ADDRSTRLEN];
 +   osm_mgrp_t *p_mgrp;
 +   ib_gid_t source_gid;
 +   osm_port_t *p_src_port;
 +   osm_port_t *p_dest_port;
 +
 +   /* In case of GID_IN(64) or GID_OUT(65) traps the source gid
 +  comparison should be done on the trap source (saved as the 
 gid in the
 +  data details field).
 +  For traps MC_CREATE(66) or MC_DELETE(67) the data details gid 
 is
 +  the MGID. We need to check whether subscriber has the pky of
   typo  

   pkey


 +  the MC group.
 Shouldn't this be the subscriber has a compatible pkey with MC group ?
 The MC group has a full member PKey and the members can be full or
 limited.
 I accept the correction.
 Doesn't this require a code change for handling trap cases 66-67 ?
 I think that you referred to the comment since the code is handling this 
 properly (in my opinion).
 
 I was referring to both the comment and the code since a port with a
 compatible limited pkey should be able to receive the reports for MC
 groups.

I agree and I think that the code is handling this case properly.
osm_physp_has_pkey() takes the 15 lower MGID pkey bits and checks whether it is 
in the physp pkey table.

Eli
 
 Sasha, can you please change this in the commit (only if there are not
 other remarks).
 Is that what you are asking Sasha to do (beyond the typos) ?
 I asked Sasha to fix only the typo in commit.

 BTW, there is no explicit reference in the IB spec for MC group
 create/delete trap (at least I didn't find it).
 Not sure what you mean by this. What didn't you find ?
 in the spec see o13-17.1.2
 
 Yes, there appear to be some holes in the spec in terms of this and
 maybe more in this area (event forwarding) but the intent seems clear.
 
 -- Hal
 
 Thanks,
 Eli
 -- Hal

 +  In all other cases the issuer gis is the trap source.
   typo  ^^^
   gid

 and this typo of course.

 Thanks,
 Eli
 -- Hal

 +   */
 +   if (trap_num = 64  trap_num = 67 )
 +   /* The issuer of these traps is the SM so source_gid
 +  is the gid saved on the data details */
 +   source_gid = p_ntc-data_details.ntc_64_67.gid;
 +   else
 +   source_gid = p_ntc-issuer_gid;
 +
 +   p_dest_port =
 +   cl_ptr_vector_get(p_subn-port_lid_tbl,
 + 
 cl_ntoh16(p_infr_rec-report_addr.dest_lid));
 +   if (!p_dest_port) {
 +   OSM_LOG(p_log, OSM_LOG_INFO,
 +   Cannot find destination port with LID:%u\n,
 +   cl_ntoh16(p_infr_rec-report_addr.dest_lid));
 +   goto Exit;
 +   }
 +
 +   switch (trap_num) {
 +   case 66:
 +   case 67:
 +   p_mgrp = osm_get_mgrp_by_mgid(p_subn, 
 source_gid);
 +   if (!p_mgrp) {
 +   OSM_LOG(p_log, OSM_LOG_INFO,
 +   Cannot find MGID %s\n,
 +   inet_ntop(AF_INET6, 
 source_gid.raw, gid_str, sizeof gid_str));
 +   goto Exit;
 +   }
 +
 +   if 

Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Roland Dreier
  This patch doesn't solve the openmpi/IB regression.   So for OFED,
  IMO, we need a different patch...

If this doesn't solve the regression then we should have a different
patch for upstream too.  The goal for 2.6.33 should be to keep open mpi
working, even if that requires us to go back to old breakage.
-- 
Roland Dreier  rola...@cisco.com
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC Patch] net: reserve ports for applications using fixed port numbers

2010-02-08 Thread Octavian Purdila
On Monday 08 February 2010 05:21:50 you wrote:
 Octavian Purdila wrote:
  On Friday 05 February 2010 06:45:38 you wrote:
  Again, using bitmap algorithm is not a problem and it's better, the
  problem is sysctl interface, how would you plan to interact with users
  via sysctl/proc if you use bitmap to handle this? I would like to hear
  more details about this.
 
  We could use something like positive values for setting and negative for
  reset (e.g. 3 would set the port in the bitmap and -3 would reset it).
 
 Hmm, then how do you output the info of those ports? Arrays of bitmaps?
 

See the patch below (work in progress).

BTW, while working on it I added some helpers, which we can use to rewrite the 
proc_doint/long stuff. I think it will help with readability and eliminates 
some code duplication as well. What do you guys think about that?

--- linux_2.6.32/main/src/kernel/sysctl.c
+++ linux_2.6.32/main/src/kernel/sysctl.c
@@ -250,6 +250,11 @@
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;   /* 1 second */
 #endif
 
+static unsigned long test_bitmap[65535/sizeof(long)];
+static int proc_dobitmap(struct ctl_table *table, int write,
+void __user *buf, size_t *lenp, loff_t *ppos);
+
+
 static struct ctl_table kern_table[] = {
{
.ctl_name   = CTL_UNNUMBERED,
@@ -1032,6 +1037,15 @@
.proc_handler   = proc_dointvec,
},
 #endif
+   {
+   .ctl_name   = CTL_UNNUMBERED,
+   .procname   = bitmap_test,
+   .data   = test_bitmap,
+   .maxlen = 65535,
+   .mode   = 0644,
+   .proc_handler   = proc_dobitmap,
+   },
+   
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
@@ -2902,6 +2916,194 @@
return 0;
 }
 
+static int proc_skip_wspace(char __user **buf, size_t *size)
+{
+   char c;
+
+   while (*size) {
+   if (get_user(c, *buf))
+   return -EFAULT;
+   if (!isspace(c))
+   break;
+   *size--; *buf++;
+   }
+
+   return 0;
+}
+
+static inline int _proc_get_ulong(char __user **buf, size_t *size, 
+ unsigned long *val, bool *neg)
+{
+#define TMPBUFLEN 21
+   int len = *size;
+   char *p, tmp[TMPBUFLEN];
+
+   if (len  TMPBUFLEN-1)
+   len = TMPBUFLEN-1;
+
+   if (copy_from_user(tmp, *buf, len))
+   return -EFAULT;
+
+   tmp[len] = 0;
+   p = tmp;
+   if (*p == '-'  *size  1) {
+   *neg = 1;
+   p++;
+   }
+   if (*p  '0' || *p  '9')
+   return -EINVAL;
+
+   *val = simple_strtoul(p, p, 0);
+
+   len = p - tmp;
+   if ((len  *size)  *p  !isspace(*p))
+   return -EINVAL;
+
+   *buf += len; *size -= len;
+
+   return 0;
+#undef TMPBUFLEN
+}
+
+static int proc_get_long(char __user **buf, size_t *size, long *val)
+{
+   int err;
+   bool neg;
+   unsigned long uval;
+
+   err = _proc_get_ulong(buf, size, uval, neg);
+   if (err)
+   return err;
+
+   if (neg)
+   *val = -uval;
+   else
+   *val = uval;
+
+   return 0;
+}
+
+static int proc_get_ulong(char __user **buf, size_t *size, unsigned long *val)
+{
+   int err;
+   bool neg;
+
+   err = _proc_get_ulong(buf, size, val, neg);
+   if (err)
+   return err;
+   if (neg)
+   return -EINVAL;
+
+   return 0;
+}
+
+static int proc_put_ulong(char __user **buf, size_t *size, unsigned long val,
+ bool first)
+{
+#define TMPBUFLEN 21
+   int len;
+   char tmp[TMPBUFLEN], *p = tmp;
+
+   if (!first)
+   *p++ = '\t';
+   sprintf(p, %lu, val);
+   len = strlen(tmp);
+   if (len  *size)
+   len = *size;
+   if (copy_to_user(*buf, tmp, len))
+   return -EFAULT;
+   *size -= len;
+   *buf += len;
+   return 0;
+#undef TMPBUFLEN
+}
+
+static int proc_put_newline(char __user **buf, size_t *size)
+{
+   if (*size) {
+   if (put_user('\n', *buf))
+   return -EFAULT;
+   *size--, *buf++;
+   }
+   return 0;
+}
+
+static int proc_dobitmap(struct ctl_table *table, int write,
+void __user *buf, size_t *lenp, loff_t *ppos)
+{
+   bool first = 1;
+   unsigned long *bitmap = (unsigned long *) table-data;
+   unsigned long bitmap_len = table-maxlen;
+   int left = *lenp, err = 0;
+   char __user *buffer = (char __user *) buf;
+
+   if (!bitmap_len || !left || (*ppos  !write)) {
+   *lenp = 0;
+   return 0;
+   }
+
+   if (write) {
+   while (left) {
+   long val;
+   
+ 

Re: Patch series from Dec 2009 which needs reviewing/applying

2010-02-08 Thread Roland Dreier
  [PATCH 0/2] fix SRQ WQE buffer initialization in liblmx4 and in mlx4_ib
  http://www.spinics.net/lists/linux-rdma/msg01911.html

This isn't a patch AFAICT.

  [PATCH 1/2] libmlx4: initialize SRQ scatter entries when creating an SRQ
  http://www.spinics.net/lists/linux-rdma/msg01912.html

Just applied this.

  [PATCH 2/2] mlx4_ib: initialize SRQ scatter entries when creating an SRQ
  http://www.spinics.net/lists/linux-rdma/msg01910.html

Has been upstream as 4c425588 for a few weeks.
-- 
Roland Dreier  rola...@cisco.com
For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/index.html
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] opensm: bug in trap report for MC create(66) and delete(67) traps

2010-02-08 Thread Hal Rosenstock
On Mon, Feb 8, 2010 at 11:05 AM, Eli Dorfman (Voltaire)
dorfman@gmail.com wrote:
 Hal Rosenstock wrote:
 On Sun, Feb 7, 2010 at 4:47 AM, Eli Dorfman (Voltaire)
 dorfman@gmail.com wrote:
 Hal Rosenstock wrote:
 On Fri, Feb 5, 2010 at 9:18 AM, Eli Dorfman dorfman@gmail.com wrote:
 On Thu, Feb 4, 2010 at 10:52 PM, Hal Rosenstock
 hal.rosenst...@gmail.com wrote:
 On Thu, Feb 4, 2010 at 12:43 PM, Eli Dorfman (Voltaire)
 dorfman@gmail.com wrote:
 Subject: [PATCH] Wrong handling of MC create and delete traps

 For these traps the GID in the data details is the MGID and
 not the source port gid.
 So the SM should check that subscriber port has the pkey of the MC 
 group.
 There was also an error in comparing the subnet prefix and guid due to
 host/network order mismatch.

 Signed-off-by: Eli Dorfman e...@voltaire.com
 ---
  opensm/opensm/osm_inform.c |  151 
 ---
  1 files changed, 98 insertions(+), 53 deletions(-)

 diff --git a/opensm/opensm/osm_inform.c b/opensm/opensm/osm_inform.c
 index 8108213..ae4fe71 100644
 --- a/opensm/opensm/osm_inform.c
 +++ b/opensm/opensm/osm_inform.c
 @@ -341,6 +341,103 @@ Exit:
        return status;
  }

 +static int is_access_permitted( osm_infr_t *p_infr_rec,
 +                               osm_infr_match_ctxt_t *p_infr_match )
 +{
 +       cl_list_t *p_infr_to_remove_list = 
 p_infr_match-p_remove_infr_list;
 +       ib_inform_info_t *p_ii = 
 (p_infr_rec-inform_record.inform_info);
 +       ib_mad_notice_attr_t *p_ntc = p_infr_match-p_ntc;
 +       uint16_t trap_num = cl_ntoh16(p_ntc-g_or_v.generic.trap_num);
 +       osm_subn_t *p_subn = p_infr_rec-sa-p_subn;
 +       osm_log_t *p_log = p_infr_rec-sa-p_log;
 +       char gid_str[INET6_ADDRSTRLEN];
 +       osm_mgrp_t *p_mgrp;
 +       ib_gid_t source_gid;
 +       osm_port_t *p_src_port;
 +       osm_port_t *p_dest_port;
 +
 +       /* In case of GID_IN(64) or GID_OUT(65) traps the source gid
 +          comparison should be done on the trap source (saved as the 
 gid in the
 +          data details field).
 +          For traps MC_CREATE(66) or MC_DELETE(67) the data details 
 gid is
 +          the MGID. We need to check whether subscriber has the pky of
                   typo  

                           pkey


 +          the MC group.
 Shouldn't this be the subscriber has a compatible pkey with MC group ?
 The MC group has a full member PKey and the members can be full or
 limited.
 I accept the correction.
 Doesn't this require a code change for handling trap cases 66-67 ?
 I think that you referred to the comment since the code is handling this 
 properly (in my opinion).

 I was referring to both the comment and the code since a port with a
 compatible limited pkey should be able to receive the reports for MC
 groups.

 I agree and I think that the code is handling this case properly.
 osm_physp_has_pkey() takes the 15 lower MGID pkey bits and checks whether it 
 is in the physp pkey table.

You're right; the code handles it. I missed the ib_pkey_get_base call there.

-- Hal


 Eli

 Sasha, can you please change this in the commit (only if there are not
 other remarks).
 Is that what you are asking Sasha to do (beyond the typos) ?
 I asked Sasha to fix only the typo in commit.

 BTW, there is no explicit reference in the IB spec for MC group
 create/delete trap (at least I didn't find it).
 Not sure what you mean by this. What didn't you find ?
 in the spec see o13-17.1.2

 Yes, there appear to be some holes in the spec in terms of this and
 maybe more in this area (event forwarding) but the intent seems clear.

 -- Hal

 Thanks,
 Eli
 -- Hal

 +          In all other cases the issuer gis is the trap source.
                                               typo  ^^^
                                                       gid

 and this typo of course.

 Thanks,
 Eli
 -- Hal

 +       */
 +       if (trap_num = 64  trap_num = 67 )
 +               /* The issuer of these traps is the SM so source_gid
 +                  is the gid saved on the data details */
 +               source_gid = p_ntc-data_details.ntc_64_67.gid;
 +       else
 +               source_gid = p_ntc-issuer_gid;
 +
 +       p_dest_port =
 +           cl_ptr_vector_get(p_subn-port_lid_tbl,
 +                             
 cl_ntoh16(p_infr_rec-report_addr.dest_lid));
 +       if (!p_dest_port) {
 +               OSM_LOG(p_log, OSM_LOG_INFO,
 +                       Cannot find destination port with LID:%u\n,
 +                       cl_ntoh16(p_infr_rec-report_addr.dest_lid));
 +               goto Exit;
 +       }
 +
 +       switch (trap_num) {
 +               case 66:
 +               case 67:
 +                       p_mgrp = osm_get_mgrp_by_mgid(p_subn, 
 source_gid);
 +                       if (!p_mgrp) {
 +                               OSM_LOG(p_log, OSM_LOG_INFO,
 +                                       Cannot find MGID %s\n,
 +                                       

Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Steve Wise

Jason Gunthorpe wrote:

On Mon, Feb 08, 2010 at 08:52:10AM -0800, Roland Dreier wrote:
  

  This patch doesn't solve the openmpi/IB regression.   So for OFED,
  IMO, we need a different patch...

If this doesn't solve the regression then we should have a different
patch for upstream too.  The goal for 2.6.33 should be to keep open mpi
working, even if that requires us to go back to old breakage.



Steve, I thought you said earlier in the thread that the rdmacm OMPI
method is not used that often with IB - and the other IB connect
methods work fine.

  


Maybe Jeff can chime in here, but he mentioned to me that Sandia Labs 
were using IB/rdmacm.



This really is a bug in OMPI, how long do you think this new feature
should remain outside the upstream kernel? Is someone going to commit
to fixing OMPI soon if the patch is removed?

  
IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback. 

But I believe Jeff asked at least that we pull it from 2.6.33 and let 
OMPI get its next release out with the OMPI fix.  Then you can push it 
into 2.6.34 if we really want this feature.


I will commit to get the fix in openmpi asap.

Steve.


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC] Here is a not tested patch that I think removes support for binding

2010-02-08 Thread Steve Wise
to 127.0.0.1.  Sean will this work?

If we agree to do this for 2.6.33, then I'll build/test this and resubmit.



rdma/cma: Disallow binding rdma endpoints to 127.0.0.1.

Currently this functionality breaks openmpi.  Once openmpi is fixed to
correctly ignore 127.0.0.1 as a valid external rdma address, we can
re-enable this functionality.

Signed-off-by: Steve Wise sw...@opengridcomputing.com
---

 drivers/infiniband/core/cma.c |   16 ++--
 1 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc9b594..cd3d351 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -628,19 +628,9 @@ static inline int cma_zero_addr(struct sockaddr *addr)
}
 }
 
-static inline int cma_loopback_addr(struct sockaddr *addr)
-{
-   if (addr-sa_family == AF_INET)
-   return ipv4_is_loopback(
-   ((struct sockaddr_in *) addr)-sin_addr.s_addr);
-   else
-   return ipv6_addr_loopback(
-   ((struct sockaddr_in6 *) addr)-sin6_addr);
-}
-
 static inline int cma_any_addr(struct sockaddr *addr)
 {
-   return cma_zero_addr(addr) || cma_loopback_addr(addr);
+   return cma_zero_addr(addr);
 }
 
 static inline __be16 cma_port(struct sockaddr *addr)
@@ -2115,9 +2105,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr 
*addr)
if (ret)
goto err1;
 
-   if (cma_loopback_addr(addr)) {
-   ret = cma_bind_loopback(id_priv);
-   } else if (!cma_zero_addr(addr)) {
+   if (!cma_zero_addr(addr)) {
ret = rdma_translate_ip(addr, id-route.addr.dev_addr);
if (ret)
goto err1;

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Sean Hefty
IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback.

I disagree, but what does it matter?  So, we add a 'software' loopback that uses
127.0.0.1.  Openmpi still wouldn't work.

I will commit to get the fix in openmpi asap.

If we don't care if the fix is in the kernel or user space, then we could add
a 'disable-loopback-support' build option to librdmacm, which can fail any
attempt to bind to a loopback address.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] [PATCH] [for-2.6.33] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Steve Wise


Sean Hefty wrote:

IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback.



I disagree, but what does it matter?  So, we add a 'software' loopback that uses
127.0.0.1.  Openmpi still wouldn't work.

  


I guess that's true.


I will commit to get the fix in openmpi asap.



If we don't care if the fix is in the kernel or user space, then we could add
a 'disable-loopback-support' build option to librdmacm, which can fail any
attempt to bind to a loopback address.

  


I'd rather see it removed from 2.6.33 kernel before it ships, and then 
we fix openmpi, and then re-submit 127.0.0.1 support once openmpi 
publishes a release with its fix.  See my other email that submits a 
potential commit to remove 127.0.0.1 support for 2.6.33. 


Steve.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/18] Increase maximum number of Infiniband HCAs per system

2010-02-08 Thread Alex Chiang
Hi Roland,

Any thoughts on this patch series?

Thanks,
/ac

* Alex Chiang achi...@hp.com:
 This is v2 of a patch series that increases the maximum number of
 IB HCAs supported per system.
 
 The original mail thread is here:
   http://lkml.org/lkml/2010/1/29/346
 
 One note, I decided to copy/paste since factoring out the overflow
 code in the three drivers seemed like overkill. If so desired, I could
 factor those three separate functions into something provided by the
 core, but that seemed like more trouble than it was worth at the time.
 
 As before, I still don't have access to a giant system, so what I did
 to test was to stick 4 cards into a small system, and then modify the
 limits with debug patches similar to this:
 
 diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
 index 7bf0a82..8581e64 100644
 --- a/drivers/infiniband/core/ucm.c
 +++ b/drivers/infiniband/core/ucm.c
 @@ -102,7 +102,7 @@ struct ib_ucm_event {
  enum {
 IB_UCM_MAJOR = 231,
 IB_UCM_BASE_MINOR = 224,
 -   IB_UCM_MAX_DEVICES = 32
 +   IB_UCM_MAX_DEVICES = 2
  };
 
 
 I tested all 3 drivers this way (uverbs, umad, ucm). I verified that
 we're not leaking device numbers on multiple modprobe/rmmod cycles,
 that there aren't any funny interactions when various combinations of
 the drivers are loaded.
 
 I did not test the rest of the OFED stack. I did write some trivial
 programs to open the devices in /dev and close them again.
 
 Here's an example of some of the testing:
 
   dl585g2:~ # modprobe ib_uverbs
   dl585g2:~ # modprobe ib_umad
   dl585g2:~ # modprobe ib_ucm
   dl585g2:~ # ls -l /dev/uverb*
   crw-rw 1 root root 231, 192 Feb  2 05:55 /dev/uverbs0
   crw-rw 1 root root 231, 193 Feb  2 05:55 /dev/uverbs1
   crw-rw 1 root root 249,   0 Feb  2 05:55 /dev/uverbs2
   crw-rw 1 root root 249,   1 Feb  2 05:55 /dev/uverbs3
   dl585g2:~ # ls -l /dev/umad*
   crw-rw 1 root root 231, 0 Feb  2 05:55 /dev/umad0
   crw-rw 1 root root 231, 1 Feb  2 05:55 /dev/umad1
   crw-rw 1 root root 231, 2 Feb  2 05:55 /dev/umad2
   crw-rw 1 root root 231, 3 Feb  2 05:55 /dev/umad3
   crw-rw 1 root root 248, 0 Feb  2 05:55 /dev/umad4
   crw-rw 1 root root 248, 1 Feb  2 05:55 /dev/umad5
   crw-rw 1 root root 248, 2 Feb  2 05:55 /dev/umad6
   crw-rw 1 root root 248, 3 Feb  2 05:55 /dev/umad7
   dl585g2:~ # ls -l /dev/issm*
   crw-rw 1 root root 231, 4 Feb  2 05:55 /dev/issm0
   crw-rw 1 root root 231, 5 Feb  2 05:55 /dev/issm1
   crw-rw 1 root root 231, 6 Feb  2 05:55 /dev/issm2
   crw-rw 1 root root 231, 7 Feb  2 05:55 /dev/issm3
   crw-rw 1 root root 248, 4 Feb  2 05:55 /dev/issm4
   crw-rw 1 root root 248, 5 Feb  2 05:55 /dev/issm5
   crw-rw 1 root root 248, 6 Feb  2 05:55 /dev/issm6
   crw-rw 1 root root 248, 7 Feb  2 05:55 /dev/issm7
   dl585g2:~ # ls -l /dev/ucm*
   crw-rw 1 root root 231, 224 Feb  2 05:55 /dev/ucm0
   crw-rw 1 root root 231, 225 Feb  2 05:55 /dev/ucm1
   crw-rw 1 root root 247,   0 Feb  2 05:55 /dev/ucm2
   crw-rw 1 root root 247,   1 Feb  2 05:55 /dev/ucm3
 
 Note that the major and minor numbers are behaving rather sanely.
 
   dl585g2:~ # rmmod ib_ucm
   dl585g2:~ # rmmod ib_uverbs
   dl585g2:~ # rmmod ib_umad
 
 Reset.
 
   dl585g2:~ # modprobe ib_ucm
   dl585g2:~ # ls -l /dev/ucm*
   crw-rw 1 root root 231, 224 Feb  2 05:57 /dev/ucm0
   crw-rw 1 root root 231, 225 Feb  2 05:57 /dev/ucm1
   crw-rw 1 root root 248,   0 Feb  2 05:57 /dev/ucm2
   crw-rw 1 root root 248,   1 Feb  2 05:57 /dev/ucm3
 
 See that /dev/ucm* devices now have a different major number 
 compared to last time(248 vs 247), since we loaded that driver first.
 
 But wait, why is it 248 and not 249? Is there a leak somewhere?
 
   dl585g2:~ # ls -l /dev/uverb*
   crw-rw 1 root root 231, 192 Feb  2 05:57 /dev/uverbs0
   crw-rw 1 root root 231, 193 Feb  2 05:57 /dev/uverbs1
   crw-rw 1 root root 249,   0 Feb  2 05:57 /dev/uverbs2
   crw-rw 1 root root 249,   1 Feb  2 05:57 /dev/uverbs3
   dl585g2:~ # rmmod ib_uverbs
   ERROR: Module ib_uverbs is in use by ib_ucm
 
 Ah, ib_ucm is dependent on ib_uverbs, so when we modprobed ib_ucm,
 in reality ib_uverbs got loaded first. See how it has a higher
 major number.
 
   dl585g2:~ # rmmod ib_ucm
   dl585g2:~ # rmmod ib_uverbs
   dl585g2:~ # modprobe ib_umad
   dl585g2:~ # ls -l /dev/umad*
   crw-rw 1 root root 231, 0 Feb  2 05:58 /dev/umad0
   crw-rw 1 root root 231, 1 Feb  2 05:58 /dev/umad1
   crw-rw 1 root root 231, 2 Feb  2 05:58 /dev/umad2
   crw-rw 1 root root 231, 3 Feb  2 05:58 /dev/umad3
   crw-rw 1 root root 249, 0 Feb  2 05:58 /dev/umad4
   crw-rw 1 root root 249, 1 Feb  2 05:58 

Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Jeff Squyres
Sorry -- I missed many of these mails today due to mail filtering (don't ask).

FWIW:

- I'm not opposed to adding LOOPBACK checks into OMPI to avoid this problem 
(I'm waiting for a patch, actually).  I'm just saying that we're not going to 
get a release out immediately with this fix.  Our next release was scheduled to 
be 1.4.2, and it is still at least several weeks away.  So allowing this in 
2.6.33 would be Bad because a) we know it breaks OMPI, and b) OMPI can't get a 
release out immediately to fix the issue.

- There are customers who are using RDMA CM with IB (e.g., Sandia with their 
Mesh/IB routing stuff).

- I see the following in rdma_bind_addr(3):

-
DESCRIPTION
   Associates a source address with an rdma_cm_id.  The  address  may  be
   wildcarded.   If  binding  to a specific local address, the rdma_cm_id
   will also be bound to a local RDMA device.
-

What RDMA device is bound to when you use 127.0.0.1?  I'm not 100% sure, but I 
think that this might be where we got the rationale that we didn't need 
additional LOOPBACK tests in OMPI...  (if anyone else agrees with this 
interpretation, then it's at least one argument that allowing binding to 
LOOPBACK devices *is* a change in semantics, and therefore should be treated 
extremely carefully)


On Feb 8, 2010, at 4:16 PM, Steve Wise wrote:

 
 Sean Hefty wrote:
  IMO 127.0.0.1 should be for SW loopback, not HW RDMA loopback.
 
 
  I disagree, but what does it matter?  So, we add a 'software' loopback that 
  uses
  127.0.0.1.  Openmpi still wouldn't work.
 
   
 
 I guess that's true.
 
  I will commit to get the fix in openmpi asap.
 
 
  If we don't care if the fix is in the kernel or user space, then we could 
  add
  a 'disable-loopback-support' build option to librdmacm, which can fail any
  attempt to bind to a loopback address.
 
   
 
 I'd rather see it removed from 2.6.33 kernel before it ships, and then
 we fix openmpi, and then re-submit 127.0.0.1 support once openmpi
 publishes a release with its fix.  See my other email that submits a
 potential commit to remove 127.0.0.1 support for 2.6.33.
 
 Steve.
 


-- 
Jeff Squyres
jsquy...@cisco.com

For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH resend] rdma/cma: Disallow binding rdma endpoints to 127.0.0.1.

2010-02-08 Thread Steve Wise
Here is a not tested patch that I think removes support for binding
to 127.0.0.1.  Sean will this work?

If we agree to do this for 2.6.33, then I'll build/test this and resubmit.



rdma/cma: Disallow binding rdma endpoints to 127.0.0.1.

Currently this functionality breaks openmpi.  Once openmpi is fixed to
correctly ignore 127.0.0.1 as a valid external rdma address, we can
re-enable this functionality.

Signed-off-by: Steve Wise sw...@opengridcomputing.com
---

 drivers/infiniband/core/cma.c |   16 ++--
 1 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc9b594..cd3d351 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -628,19 +628,9 @@ static inline int cma_zero_addr(struct sockaddr *addr)
}
 }
 
-static inline int cma_loopback_addr(struct sockaddr *addr)
-{
-   if (addr-sa_family == AF_INET)
-   return ipv4_is_loopback(
-   ((struct sockaddr_in *) addr)-sin_addr.s_addr);
-   else
-   return ipv6_addr_loopback(
-   ((struct sockaddr_in6 *) addr)-sin6_addr);
-}
-
 static inline int cma_any_addr(struct sockaddr *addr)
 {
-   return cma_zero_addr(addr) || cma_loopback_addr(addr);
+   return cma_zero_addr(addr);
 }
 
 static inline __be16 cma_port(struct sockaddr *addr)
@@ -2115,9 +2105,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr 
*addr)
if (ret)
goto err1;
 
-   if (cma_loopback_addr(addr)) {
-   ret = cma_bind_loopback(id_priv);
-   } else if (!cma_zero_addr(addr)) {
+   if (!cma_zero_addr(addr)) {
ret = rdma_translate_ip(addr, id-route.addr.dev_addr);
if (ret)
goto err1;

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Jeff Squyres
On Feb 8, 2010, at 5:09 PM, Jason Gunthorpe wrote:

 DESCRIPTION
   Associates a source address with an rdma_cm_id.  The  address  may  be
   wildcarded.   If  binding  to a specific local address, the rdma_cm_id
   will also be bound to a local RDMA device.
 This statement is trying to say that if a source address is given then
 the rdma_cm_id will be bound to a device.

Which device is bound to if you specify 127.0.0.1 as the source address?  
(which is what OMPI is doing)  Is it possible to assign 127.0.0.1 to an RDMA 
device?

-- 
Jeff Squyres
jsquy...@cisco.com

For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Jeff Squyres
On Feb 8, 2010, at 5:13 PM, Sean Hefty wrote:

 Are you certain that rdma_bind_addr does NOT work with 127.0.0.1, and that 
 this
 is now the problem?
 
 It does appear to work on OFED 1.4 and on 2.6.26 based on ucmatose.  Is the
 problem really with rdma_bind_addr succeeding, or with rdma_connect, which now
 works, or rdma_bind_addr now assigning a device?

On my OFED 1.4.1 RHEL4u6 systems, rdma_bind_addr() fails when attempting to 
bind to 127.0.0.1 per the email I sent Friday:

http://www.spinics.net/lists/linux-rdma/msg02568.html

I have not checked any other combinations; Steve was saying that he saw 
rdma_bind_addr() succeeding on his machines with OFED 1.5.1rcwhatever (I don't 
recall the OS he said he was using).

-- 
Jeff Squyres
jsquy...@cisco.com

For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Sean Hefty
On my OFED 1.4.1 RHEL4u6 systems, rdma_bind_addr() fails when attempting to
bind to 127.0.0.1 per the email I sent Friday:

http://www.spinics.net/lists/linux-rdma/msg02568.html

This is what I see over IB on 2.6.26, with a couple extra prints added to
cmatose:

cst-lin1:/home/mshefty/librdmacm# examples/ucmatose -b 127.0.0.1
cmatose: starting server
src addr 0x17f
rdma_bind_addr: 0

so we're missing something else.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Steve Wise
Sean, can you try openmpi?  It fails for me, and yet ucmatose succeeds.  
I don't understand the difference yet...



Sean Hefty wrote:

On my OFED 1.4.1 RHEL4u6 systems, rdma_bind_addr() fails when attempting to
bind to 127.0.0.1 per the email I sent Friday:

   http://www.spinics.net/lists/linux-rdma/msg02568.html



This is what I see over IB on 2.6.26, with a couple extra prints added to
cmatose:

cst-lin1:/home/mshefty/librdmacm# examples/ucmatose -b 127.0.0.1
cmatose: starting server
src addr 0x17f
rdma_bind_addr: 0

so we're missing something else.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Sean Hefty
Sean, can you try openmpi?  It fails for me, and yet ucmatose succeeds.
I don't understand the difference yet...

I believe the issue is that rdma_bind_addr succeeds (returns 0), but no device
is assigned to the rdma_cm_id (verbs field is NULL).

This was a change from commit 6f8372b69c3198e06cecb1df2cb9682d0c55e657:

  The defined behavior of rdma_bind_addr is to associate an RDMA
  device with an rdma_cm_id, as long as the user specified a non-
  zero address.  (ie they weren't just trying to reserve a port)
  Currently, if the loopback address is passed to rdma_bind_addr,
  no device is associated with the rdma_cm_id.  Fix this.

There are two places where rdma_bind_addr() is called in the openmpi source code
(based on a tarball download of the trunk).  One is btl_openib_iwarp.c:

  rc = rdma_bind_addr(cm_id, ipaddr);
  if (rc || !cm_id->verbs) {
  rc = OMPI_SUCCESS;
  goto out3;
  }

The other is btl_openib_connect_rdmacm.c, but that deals with listening.  I
can't quickly determine if btl_openib_iwarp.c is usually used for IB or not.

So, to fully keep the behavior of 2.6.32, rdma_bind_addr for 127.0.0.1 should
succeed, but not assign a device.  I think this was the change from commit
..c55e657 that changed the behavior:

@@ -2089,7 +2096,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr
*addr)
if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
return -EINVAL;

-   if (!cma_any_addr(addr)) {
+   if (cma_loopback_addr(addr)) {
+   ret = cma_bind_loopback(id_priv);
+   } else if (!cma_zero_addr(addr)) {
ret = rdma_translate_ip(addr, id-route.addr.dev_addr);
if (ret)
goto err1;

I'll see if reverting this gives the desired(?) behavior.

- Sean

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Jeff Squyres
On Feb 8, 2010, at 6:48 PM, Sean Hefty wrote:

   rc = rdma_bind_addr(cm_id, ipaddr);
   if (rc || !cm_id->verbs) {
   rc = OMPI_SUCCESS;
   goto out3;
   }

Ah, yes!  Per the OMPI code you cited, I amended my printf's and see:

   [svbu-mpi.cisco.com:19315] FAILED to bind to 127.0.0.1: rc=0, verbs=(nil)

So the rc from rdma_bind_addr was 0, but you're right that the verbs 
pointer was NULL, and we therefore rule that it was no good.

 The other is btl_openib_connect_rdmacm.c, but that deals with listening.  I
 can't quickly determine if btl_openib_iwarp.c is usually used for IB or not.

It is.

 So, to fully keep the behavior of 2.6.32, rdma_bind_addr for 127.0.0.1 should
 succeed, but not assign a device.  I think this was the change from commit
 ..c55e657 that changed the behavior:
 
 @@ -2089,7 +2096,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct 
 sockaddr
 *addr)
 if (!cma_comp_exch(id_priv, CMA_IDLE, CMA_ADDR_BOUND))
 return -EINVAL;
 
 -   if (!cma_any_addr(addr)) {
 +   if (cma_loopback_addr(addr)) {
 +   ret = cma_bind_loopback(id_priv);
 +   } else if (!cma_zero_addr(addr)) {
 ret = rdma_translate_ip(addr, id-route.addr.dev_addr);
 if (ret)
 goto err1;
 
 I'll see if reverting this gives the desired(?) behavior.

Thanks!

-- 
Jeff Squyres
jsquy...@cisco.com

For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Pradeep Satyanarayana
Jeff Squyres wrote:
 On Feb 8, 2010, at 7:30 PM, Pradeep Satyanarayana wrote:
 
 elm3b199:/usr/lib # /usr/mpi/gcc/openmpi-1.4.1/bin/mpirun -np 2 --bynode 
 --mca btl_openib_cpc_include rdmacm ring
 --
 mpirun was unable to launch the specified application as it could not find 
 an executable:

 Executable: ring
 Node: elm3b199

 while attempting to start process rank 0.
 --
 elm3b199:/usr/lib #
 
 Is there an executable named ring either in your $PATH or in /usr/lib?
 
 Open MPI is telling you it can't find an executable named ring.

Hi Jeff,

No, there is none. I got this command from one of the mails in the thread. What 
should I use instead?

Thanks
Pradeep


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ewg] rdma/cm: disallow loopback address for iwarp devices

2010-02-08 Thread Jeff Squyres
On Feb 8, 2010, at 7:50 PM, Pradeep Satyanarayana wrote:

 No, there is none. I got this command from one of the mails in the thread. 
 What should I use instead?

You need to compile and run an MPI program.  ring is a typical test program 
that sends a message around in a ring.  I think that OFED installs those test 
apps somewhere, but I don't recall where offhand.

ring_c.c is attached.  Compile it with:

mpicc ring_c.c -o ring

(you might need the full path to mpicc if it's not in your path?)

A better mpirun command line would be:

/usr/mpi/gcc/openmpi-1.4.1/bin/mpirun -np 2 --host HOSTNAME1,HOSTNAME2 \
--mca btl openib,sm,self --mca btl_openib_cpc_include rdmacm ring

Put in your own HOSTNAME1 and HOSTNAME2 values.  You'll also need to ensure 
that both Open MPI and ring are available on both nodes (preferably in the 
same filesystem locations on both nodes, for simplicity) and that you can ssh 
from one node to the other without being prompted for a password or 
passphrase.

This will run a 2-process MPI job across the two nodes, passing a message 
between the two processes a few times before quitting.

The various --mca parameters on this mpirun command line ensure that you are 
definitely using the OpenFabrics verbs support and forcing the use of RDMA CM.

-- 
Jeff Squyres
jsquy...@cisco.com

For corporate legal information go to:
http://www.cisco.com/web/about/doing_business/legal/cri/


ring_c.c
Description: Binary data


Re: [PATCH] opensm: Add a name to IB subnet and include it in syslog messages

2010-02-08 Thread Arputham Benjamin
Hi Sasha,

I have incorporated your feedback and sent you the modified patch with a new
subject line [PATCH V2] opensm: Add option to specify prefix to syslog 
messages

Regards,
Benjamin

Sasha Khapyorsky wrote:
 Hi Benjamin,
 
 On 14:16 Thu 21 Jan , Arputham Benjamin wrote:
 Added a text based name to an IB subnet to help user in identifying
 an IB subnet or understanding its function in a multi-fabric IB cluster.
 For example, in a dual-fabric (or dual-rail) IB cluster, one subnet
 could be named mpi and the other subnet could be named storage.

 Summary of changes:
  o Added the option 'subnet_name' to OpenSM command line and config file.
  o Enhanced OpenSM logging facility to include the subnet name in
syslog messages.
 
 Looking at the usage below I can see that this is done as adding free
 text prefix to syslog prints. This is fine and seems could be useful
 for any purpose (not only different subnets) when syslog message mark
 is desired.
 
 Assuming so, wouldn't it be better to change subnet_name to something
 more generic, let say log_prefix?
 
 Signed-off-by: Arputham Benjamin abenja...@sgi.com
 ---
 diff -rup a/include/opensm/osm_log.h b/include/opensm/osm_log.h
 --- a/include/opensm/osm_log.h   2010-01-18 21:32:12.195328129 -0800
 +++ b/include/opensm/osm_log.h   2010-01-18 21:34:46.573932164 -0800
 @@ -120,6 +120,7 @@ typedef struct osm_log {
  boolean_t accum_log_file;
  boolean_t daemon;
  char *log_file_name;
 +char *subnet_name;
  } osm_log_t;
  /*/
  
 diff -rup a/include/opensm/osm_subnet.h b/include/opensm/osm_subnet.h
 --- a/include/opensm/osm_subnet.h2010-01-18 21:32:12.195328129 -0800
 +++ b/include/opensm/osm_subnet.h2010-01-18 21:34:55.782087826 -0800
 @@ -224,6 +224,7 @@ typedef struct osm_subn_opt {
  char *event_plugin_name;
  char *node_name_map_name;
  char *prefix_routes_file;
 +char *subnet_name;
  boolean_t consolidate_ipv6_snm_req;
  struct osm_subn_opt *file_opts; /* used for update */
  uint8_t lash_start_vl;  /* starting vl to use in lash */
 diff -rup a/man/opensm.8.in b/man/opensm.8.in
 --- a/man/opensm.8.in2010-01-19 11:29:03.954832199 -0800
 +++ b/man/opensm.8.in2010-01-21 10:56:58.901836423 -0800
 @@ -49,6 +49,7 @@ opensm \- InfiniBand subnet manager and 
  [\-\-perfmgr_sweep_time_s seconds]
  [\-\-prefix_routes_file path]
  [\-\-consolidate_ipv6_snm_req]
 +[\-\-subnet_name subnet name]
  [\-v(erbose)] [\-V] [\-D flags] [\-d(ebug) number]
  [\-h(elp)] [\-?]
  
 @@ -345,6 +346,9 @@ effect if --enable-perfmgr was specified
  Use shared MLID for IPv6 Solicited Node Multicast groups per MGID scope
  and P_Key.
  .TP
 +\fB\-\-subnet_name\fR subnet name
 +This option specifies the text based name of the subnet.
 +.TP
  \fB\-v\fR, \fB\-\-verbose\fR
  This option increases the log verbosity level.
  The -v option may be specified multiple times
 diff -rup a/opensm/main.c b/opensm/main.c
 --- a/opensm/main.c  2010-01-18 21:31:43.318842260 -0800
 +++ b/opensm/main.c  2010-01-19 11:51:45.566967909 -0800
 @@ -324,6 +324,8 @@ static void show_usage(void)
  printf(--consolidate_ipv6_snm_req\n
   Use shared MLID for IPv6 Solicited Node Multicast 
 groups\n
   per MGID scope and P_Key.\n\n);
 +printf(--subnet_name subnet name\n
 + Text based name of the IB subnet.\n\n);
  printf(--verbose, -v\n
   This option increases the log verbosity level.\n
   The -v option may be specified multiple times\n
 @@ -607,6 +609,7 @@ int main(int argc, char *argv[])
  {lash_start_vl, 1, NULL, 6},
  {sm_sl, 1, NULL, 7},
  {retries, 1, NULL, 8},
 +{subnet_name, 1, NULL, 9},
  {NULL, 0, NULL, 0}  /* Required at the end of the array */
  };
  
 @@ -985,6 +988,11 @@ int main(int argc, char *argv[])
  printf( Transaction retries = %u\n,
 opt.transaction_retries);
  break;
 +case 9:
 +SET_STR_OPT(opt.subnet_name, optarg);
 +printf(IB subnet name = %s\n,
 +   opt.subnet_name);
 +break;
  case 'h':
  case '?':
  case ':':
 diff -rup a/opensm/osm_log.c b/opensm/osm_log.c
 --- a/opensm/osm_log.c   2010-01-18 21:31:43.318842260 -0800
 +++ b/opensm/osm_log.c   2010-01-18 21:33:47.808939648 -0800
 @@ -107,6 +107,7 @@ void osm_log(IN osm_log_t * p_log, IN os
  char buffer[LOG_ENTRY_SIZE_MAX];
  va_list args;
  int ret;
 +uint8_t ind = 0;
  #ifdef __WIN__
  SYSTEMTIME st;
  uint32_t pid = GetCurrentThreadId();
 @@ -123,7 +124,14 @@ void osm_log(IN osm_log_t * p_log, IN os
  return;
  
  va_start(args, p_str);
 -vsprintf(buffer, p_str, args);
 +if(p_log-subnet_name) {
 +