[RFC-v3 7/9] iscsi-target: Refactor TX queue logic + export response PDU creation

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch refactors TX immediate + response queue handling to use
the new iscsit_transport API callers, and exports the necessary
traditional iscsi PDU response creation functions for iser-target
to utilize.

This includes:

- Add iscsit_build_datain_pdu() for DATAIN PDU init + convert
  iscsit_send_data_in()
- Add iscsit_build_logout_rsp() for LOGOUT_RSP PDU init + convert
  iscsit_send_logout()
- Add iscsit_build_nopin_rsp() for NOPIN_RSP PDU init + convert
  iscsit_send_nopin()
- Add iscsit_build_rsp_pdu() for SCSI_RSP PDU init + convert
  iscsit_send_response()
- Add iscsit_build_task_mgt_rsp for TM_RSP PDU init + convert
  iscsit_send_task_mgt_rsp()
- Refactor immediate queue state switch into iscsit_immediate_queue()
- Convert handle_immediate_queue() to use iscsit_transport caller
- Refactor response queue state switch into iscsit_response_queue()
- Convert handle_response_queue to use iscsit_transport caller
- Export iscsit_logout_post_handler(), iscsit_increment_maxcmdsn()
  and iscsit_tmr_post_handler() for external transport module usage

v3 changes:
- Add iscsit_build_reject for REJECT PDU init + convert
  iscsit_send_reject()

v2 changes:

- Add iscsit_queue_rsp() for iscsit_transport->iscsit_queue_data_in()
  and iscsit_transport->iscsit_queue_status()
- Update lio_queue_data_in() to use ->iscsit_queue_data_in()
- Update lio_queue_status() to use ->iscsit_queue_status()
- Use mutex_trylock() in iscsit_increment_maxcmdsn()

Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 drivers/target/iscsi/iscsi_target.c  |  661 ++
 drivers/target/iscsi/iscsi_target_configfs.c |7 +-
 drivers/target/iscsi/iscsi_target_device.c   |7 +-
 drivers/target/iscsi/iscsi_target_tmr.c  |1 +
 4 files changed, 374 insertions(+), 302 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c 
b/drivers/target/iscsi/iscsi_target.c
index 19d4e08..3948aa1 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -70,8 +70,7 @@ struct kmem_cache *lio_ooo_cache;
 struct kmem_cache *lio_r2t_cache;
 
 static int iscsit_handle_immediate_data(struct iscsi_cmd *,
-   unsigned char *buf, u32);
-static int iscsit_logout_post_handler(struct iscsi_cmd *, struct iscsi_conn *);
+   struct iscsi_scsi_req *, u32);
 
 struct iscsi_tiqn *iscsit_get_tiqn_for_login(unsigned char *buf)
 {
@@ -482,6 +481,15 @@ int iscsit_del_np(struct iscsi_np *np)
return 0;
 }
 
+static int iscsit_immediate_queue(struct iscsi_conn *, struct iscsi_cmd *, 
int);
+static int iscsit_response_queue(struct iscsi_conn *, struct iscsi_cmd *, int);
+
+static int iscsit_queue_rsp(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+   iscsit_add_cmd_to_response_queue(cmd, cmd-conn, cmd-i_state);
+   return 0;
+}
+
 static struct iscsit_transport iscsi_target_transport = {
.name   = iSCSI/TCP,
.transport_type = ISCSI_TCP,
@@ -493,6 +501,10 @@ static struct iscsit_transport iscsi_target_transport = {
.iscsit_get_login_rx= iscsit_get_login_rx,
.iscsit_put_login_tx= iscsit_put_login_tx,
.iscsit_get_dataout = iscsit_build_r2ts_for_cmd,
+   .iscsit_immediate_queue = iscsit_immediate_queue,
+   .iscsit_response_queue  = iscsit_response_queue,
+   .iscsit_queue_data_in   = iscsit_queue_rsp,
+   .iscsit_queue_status= iscsit_queue_rsp,
 };
 
 static int __init iscsi_target_init_module(void)
@@ -651,14 +663,6 @@ static int iscsit_add_reject(
iscsit_add_cmd_to_response_queue(cmd, conn, cmd-i_state);
 
ret = wait_for_completion_interruptible(cmd-reject_comp);
-   /*
-* Perform the kref_put now if se_cmd has been setup by
-* iscsit_setup_scsi_cmd()
-*/
-   if (cmd-se_cmd.se_tfo != NULL) {
-   pr_debug(iscsi reject: calling target_put_sess_cmd \n);
-   target_put_sess_cmd(conn-sess-se_sess, cmd-se_cmd);
-   }
if (ret != 0)
return -1;
 
@@ -2536,18 +2540,60 @@ static void iscsit_tx_thread_wait_for_tcp(struct 
iscsi_conn *conn)
}
 }
 
-static int iscsit_send_data_in(
-   struct iscsi_cmd *cmd,
-   struct iscsi_conn *conn)
+static void
+iscsit_build_datain_pdu(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
+   struct iscsi_datain *datain, struct iscsi_data_rsp *hdr,
+   bool set_statsn)
 {
-   int iov_ret = 0, set_statsn = 0;
-   u32 iov_count = 0, tx_size = 0;
+   hdr-opcode = ISCSI_OP_SCSI_DATA_IN;
+   hdr-flags  = datain-flags;
+   if (hdr-flags  ISCSI_FLAG_DATA_STATUS) {
+   if (cmd-se_cmd.se_cmd_flags  SCF_OVERFLOW_BIT) {
+   hdr-flags |= ISCSI_FLAG_DATA_OVERFLOW;
+   hdr-residual_count = 
cpu_to_be32(cmd-se_cmd.residual_count);
+

[RFC-v3 5/9] iscsi-target: Add per transport iscsi_cmd alloc/free

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch converts struct iscsi_cmd memory allocation + free to use
the ->iscsit_alloc_cmd() iscsit_transport API caller, and exports the
iscsit_allocate_cmd() symbol.

Also add iscsi_cmd->release_cmd() to be used separately from
iscsit_transport for connection/session shutdown.

v2 changes:

- Remove unnecessary checks in iscsit_alloc_cmd (asias)
- Drop iscsit_transport->iscsit_free_cmd() usage
- Drop iscsit_transport->iscsit_unmap_cmd() usage
- Add iscsi_cmd->release_cmd()
- Convert lio_release_cmd() to use iscsi_cmd->release_cmd()

Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 drivers/target/iscsi/iscsi_target.c  |1 +
 drivers/target/iscsi/iscsi_target_configfs.c |3 ++-
 drivers/target/iscsi/iscsi_target_core.h |1 +
 drivers/target/iscsi/iscsi_target_util.c |   25 +
 drivers/target/iscsi/iscsi_target_util.h |1 +
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c 
b/drivers/target/iscsi/iscsi_target.c
index 8203bf3..b01a10e 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -489,6 +489,7 @@ static struct iscsit_transport iscsi_target_transport = {
.iscsit_setup_np= iscsit_setup_np,
.iscsit_accept_np   = iscsit_accept_np,
.iscsit_free_np = iscsit_free_np,
+   .iscsit_alloc_cmd   = iscsit_alloc_cmd,
.iscsit_get_login_rx= iscsit_get_login_rx,
.iscsit_put_login_tx= iscsit_put_login_tx,
 };
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c 
b/drivers/target/iscsi/iscsi_target_configfs.c
index 78d75c8..c78b824 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -1700,7 +1700,8 @@ static void lio_release_cmd(struct se_cmd *se_cmd)
 {
struct iscsi_cmd *cmd = container_of(se_cmd, struct iscsi_cmd, se_cmd);
 
-   iscsit_release_cmd(cmd);
+   pr_debug(Entering lio_release_cmd for se_cmd: %p\n, se_cmd);
+   cmd-release_cmd(cmd);
 }
 
 /* End functions for target_core_fabric_ops */
diff --git a/drivers/target/iscsi/iscsi_target_core.h 
b/drivers/target/iscsi/iscsi_target_core.h
index 53400b0..60ec4b9 100644
--- a/drivers/target/iscsi/iscsi_target_core.h
+++ b/drivers/target/iscsi/iscsi_target_core.h
@@ -485,6 +485,7 @@ struct iscsi_cmd {
u32 first_data_sg_off;
u32 kmapped_nents;
sense_reason_t  sense_reason;
+   void (*release_cmd)(struct iscsi_cmd *);
 }  cacheline_aligned;
 
 struct iscsi_tmr_req {
diff --git a/drivers/target/iscsi/iscsi_target_util.c 
b/drivers/target/iscsi/iscsi_target_util.c
index 4cf1e7f..0b73f90 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -149,6 +149,18 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
spin_unlock_bh(cmd-r2t_lock);
 }
 
+struct iscsi_cmd *iscsit_alloc_cmd(struct iscsi_conn *conn, gfp_t gfp_mask)
+{
+   struct iscsi_cmd *cmd;
+
+   cmd = kmem_cache_zalloc(lio_cmd_cache, gfp_mask);
+   if (!cmd)
+   return NULL;
+
+   cmd-release_cmd = iscsit_release_cmd;
+   return cmd;
+}
+
 /*
  * May be called from software interrupt (timer) context for allocating
  * iSCSI NopINs.
@@ -157,13 +169,12 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn 
*conn, gfp_t gfp_mask)
 {
struct iscsi_cmd *cmd;
 
-   cmd = kmem_cache_zalloc(lio_cmd_cache, gfp_mask);
+   cmd = conn-conn_transport-iscsit_alloc_cmd(conn, gfp_mask);
if (!cmd) {
pr_err(Unable to allocate memory for struct iscsi_cmd.\n);
return NULL;
}
-
-   cmd-conn   = conn;
+   cmd-conn = conn;
INIT_LIST_HEAD(cmd-i_conn_node);
INIT_LIST_HEAD(cmd-datain_list);
INIT_LIST_HEAD(cmd-cmd_r2t_list);
@@ -176,6 +187,7 @@ struct iscsi_cmd *iscsit_allocate_cmd(struct iscsi_conn 
*conn, gfp_t gfp_mask)
 
return cmd;
 }
+EXPORT_SYMBOL(iscsit_allocate_cmd);
 
 struct iscsi_seq *iscsit_get_seq_holder_for_datain(
struct iscsi_cmd *cmd,
@@ -690,6 +702,11 @@ void iscsit_free_cmd(struct iscsi_cmd *cmd)
 */
switch (cmd-iscsi_opcode) {
case ISCSI_OP_SCSI_CMD:
+   if (cmd-data_direction == DMA_TO_DEVICE)
+   iscsit_stop_dataout_timer(cmd);
+   /*
+* Fallthrough
+*/
case ISCSI_OP_SCSI_TMFUNC:
transport_generic_free_cmd(cmd-se_cmd, 1);
break;
@@ -705,7 +722,7 @@ void iscsit_free_cmd(struct iscsi_cmd *cmd)
}
/* Fall-through */
default:
-   iscsit_release_cmd(cmd);
+   cmd-release_cmd(cmd);
break;
}
 }
diff --git a/drivers/target/iscsi/iscsi_target_util.h 

[RFC-v3 0/9] Add support for iSCSI Extensions for RDMA (ISER) target mode

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This series is the third RFC for iSCSI Extensions for RDMA (ISER) target
mode support planned for a future v3.10 merge.  This series refactors
existing traditional iscsi-target mode logic in order for external
ib_isert.ko module code to function with areas common to traditional
TCP socket based iSCSI and RDMA verbs based ISER operation.

This includes a basic iscsit_transport API that allows different transports
to reside under the existing iscsi-target configfs control plane, using a
pre-defined network portal attribute to enable a rdma_cm listener on top
of existing ipoib portals.

At this point the code is functional and pushing sustained RDMA_WRITE +
RDMA_READ traffic using open-iscsi on top of multiple iser network portals +
multiple IB HCA ports + multiple LUNs.  Thus far we're using Mellanox IB HCAs
for initial development, and will be verifying with RoCE capable NICs as well
in the near future.

This RFC-v3 code is available in git against v3.9-rc3 here:

  git://git.kernel.org/pub/scm/linux/kernel/git/nab/target-pending.git 
iser_target-rfc-v3

The changes included since RFC-v2 include:

  - Convert to use per isert_cq_desc->cq_[rx,tx]_work + drop tasklets (Or + nab)
  - Move IB_EVENT_QP_LAST_WQE_REACHED warn into correct
isert_qp_event_callback (Or)
  - Drop unnecessary IB_ACCESS_REMOTE_* access flag usage in
isert_create_device_ib_res (Or)
  - Add common isert_init_send_wr(), and convert isert_put_* calls (Or)
  - Move to verbs+core logic to single ib_isert.[c,h]  (Or + nab)
  - Add kmem_cache isert_cmd_cache usage for descriptor allocation (nab)
  - Move common ib_post_send() logic used by isert_put_*() to
isert_post_response() (nab)
  - Add isert_put_reject call in isert_response_queue() for posting
ISCSI_REJECT response. (nab)
  - Add ISTATE_SEND_REJECT checking in isert_do_control_comp. (nab)
  - Add extra target_put_sess_cmd call in iscsit_add_reject_from_cmd
after completion
  - Add iscsit_build_reject for REJECT PDU init + convert
iscsit_send_reject()
  - Add iscsi_post_login_start_timers FIXME for ISER

Many thanks again to Or Gerlitz from Mellanox for v2 review feedback!

With the per isert_cq_desc->cq_[rx,tx]_work cmwq dispatch now in place,
process context switch overhead is reduced to each cq callback, instead
of per each ib_poll_cq ib_wc descriptor as with tasklets in RFC-v2 code.
This allows RFC-v3 to reach within ~5% of single-lun NULLIO small block
IOPs performance parity vs. existing STGT iser code, at almost 1/10 the
number of total context switches.

As before, review patches are broken down into:

Patch #1 adds the export of target_get_sess_cmd to be used by iscsi-target

Patch #2 - #4 include iscsi-target API template, conversion of iscsi/tcp
login path to use API template, plus add iser RFC parameter keys.

Patch #5 - #6 allow external iscsi_cmd descriptor allocation / free, and
refactoring of RX side PDU request handling to allow incoming PDU logic
to be called by external ib_isert workqueue process context.

Patch #7 allows iscsi-target to use per transport API template immediate /
response callbacks in the per-connection TX thread completion path, and
refactoring of response PDU creation for export to external ib_isert code.

Patch #8 adds the pre-defined iser network portal attribute under the
existing iscsi-target configfs tree.

Patch #9 - #12 is the external ib_isert.ko module code separated into
individual commits for review.

Note that I'll be dropping #1 - #8 changes into target-pending/for-next
shortly, and planning to do the same with ib_isert code soon unless there
is an objection from Roland.  Please review..

Thank you!

--nab

Nicholas Bellinger (9):
  target: Add export of target_get_sess_cmd symbol
  iscsi-target: Add iscsit_transport API template
  iscsi-target: Initial traditional TCP conversion to iscsit_transport
  iscsi-target: Add iser-target parameter keys + setup during login
  iscsi-target: Add per transport iscsi_cmd alloc/free
  iscsi-target: Refactor RX PDU logic + export request PDU handling
  iscsi-target: Refactor TX queue logic + export response PDU creation
  iscsi-target: Add iser network portal attribute
  iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

 drivers/infiniband/Kconfig |1 +
 drivers/infiniband/Makefile|1 +
 drivers/infiniband/ulp/isert/Kconfig   |6 +
 drivers/infiniband/ulp/isert/Makefile  |2 +
 drivers/infiniband/ulp/isert/ib_isert.c| 2270 
 drivers/infiniband/ulp/isert/ib_isert.h|  142 ++
 drivers/infiniband/ulp/isert/isert_proto.h |   47 +
 drivers/target/iscsi/Makefile  |3 +-
 drivers/target/iscsi/iscsi_target.c| 1169 -
 drivers/target/iscsi/iscsi_target.h|3 +-
 drivers/target/iscsi/iscsi_target_configfs.c   |   94 +-
 drivers/target/iscsi/iscsi_target_core.h

[RFC-v3 6/9] iscsi-target: Refactor RX PDU logic + export request PDU handling

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch refactors existing traditional iscsi RX side PDU handling
to use iscsit_transport, and exports the necessary logic for external
transport modules.

This includes:

- Refactor iscsit_handle_scsi_cmd() into PDU setup / processing
- Add updated iscsit_handle_scsi_cmd() for traditional iscsi code
- Add iscsit_set_unsoliticed_dataout() wrapper
- Refactor iscsit_handle_data_out() into PDU check / processing
- Add updated iscsit_handle_data_out() for traditional iscsi code
- Add iscsit_handle_nop_out() + iscsit_handle_task_mgt_cmd() to
  accept pre-allocated struct iscsi_cmd
- Add iscsit_build_r2ts_for_cmd() caller for iscsi_target_transport
  to handle ISTATE_SEND_R2T for TX immediate queue
- Refactor main traditional iscsi iscsi_target_rx_thread() PDU switch
  into iscsi_target_rx_opcode() using iscsit_allocate_cmd()
- Turn iscsi_target_rx_thread() process context into NOP for
  ib_isert side work-queue.

v3 changes:
- Add extra target_put_sess_cmd call in iscsit_add_reject_from_cmd
  after completion

v2 changes:

- Disable iscsit_ack_from_expstatsn() usage for RDMAExtensions=Yes
- Disable iscsit_allocate_datain_req() usage for RDMAExtensions=Yes
- Add target_get_sess_cmd() reference counting to
  iscsit_setup_scsi_cmd()
- Add TFO->lio_check_stop_free() fabric API caller
- Add export of iscsit_stop_dataout_timer() symbol
- Add iscsit_build_r2ts_for_cmd() for iscsit_transport->iscsit_get_dataout()
- Convert existing usage of iscsit_build_r2ts_for_cmd() to
  ->iscsit_get_dataout()
- Drop RDMAExtensions=Yes specific check in iscsit_build_r2ts_for_cmd()
- Fix RDMAExtentions -> RDMAExtensions typo (andy)
- Pass correct dump_payload value into iscsit_get_immediate_data()
  for iscsit_handle_scsi_cmd()

Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 drivers/target/iscsi/iscsi_target.c  |  486 --
 drivers/target/iscsi/iscsi_target.h  |3 +-
 drivers/target/iscsi/iscsi_target_configfs.c |9 +-
 drivers/target/iscsi/iscsi_target_erl1.c |   13 +-
 drivers/target/iscsi/iscsi_target_login.c|3 +-
 drivers/target/iscsi/iscsi_target_nego.c |   15 -
 drivers/target/iscsi/iscsi_target_tmr.c  |3 +-
 drivers/target/iscsi/iscsi_target_util.c |1 +
 8 files changed, 331 insertions(+), 202 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c 
b/drivers/target/iscsi/iscsi_target.c
index b01a10e..19d4e08 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -492,6 +492,7 @@ static struct iscsit_transport iscsi_target_transport = {
.iscsit_alloc_cmd   = iscsit_alloc_cmd,
.iscsit_get_login_rx= iscsit_get_login_rx,
.iscsit_put_login_tx= iscsit_put_login_tx,
+   .iscsit_get_dataout = iscsit_build_r2ts_for_cmd,
 };
 
 static int __init iscsi_target_init_module(void)
@@ -650,6 +651,14 @@ static int iscsit_add_reject(
iscsit_add_cmd_to_response_queue(cmd, conn, cmd-i_state);
 
ret = wait_for_completion_interruptible(cmd-reject_comp);
+   /*
+* Perform the kref_put now if se_cmd has been setup by
+* iscsit_setup_scsi_cmd()
+*/
+   if (cmd-se_cmd.se_tfo != NULL) {
+   pr_debug(iscsi reject: calling target_put_sess_cmd \n);
+   target_put_sess_cmd(conn-sess-se_sess, cmd-se_cmd);
+   }
if (ret != 0)
return -1;
 
@@ -698,11 +707,20 @@ int iscsit_add_reject_from_cmd(
iscsit_add_cmd_to_response_queue(cmd, conn, cmd-i_state);
 
ret = wait_for_completion_interruptible(cmd-reject_comp);
+   /*
+* Perform the kref_put now if se_cmd has already been setup by
+* scsit_setup_scsi_cmd()
+*/
+   if (cmd-se_cmd.se_tfo != NULL) {
+   pr_debug(iscsi reject: calling target_put_sess_cmd \n);
+   target_put_sess_cmd(conn-sess-se_sess, cmd-se_cmd);
+   }
if (ret != 0)
return -1;
 
return (!fail_conn) ? 0 : -1;
 }
+EXPORT_SYMBOL(iscsit_add_reject_from_cmd);
 
 /*
  * Map some portion of the allocated scatterlist to an iovec, suitable for
@@ -761,6 +779,9 @@ static void iscsit_ack_from_expstatsn(struct iscsi_conn 
*conn, u32 exp_statsn)
 
conn-exp_statsn = exp_statsn;
 
+   if (conn-sess-sess_ops-RDMAExtensions)
+   return;
+
spin_lock_bh(conn-cmd_lock);
list_for_each_entry(cmd, conn-conn_cmd_list, i_conn_node) {
spin_lock(cmd-istate_lock);
@@ -793,12 +814,10 @@ static int iscsit_allocate_iovecs(struct iscsi_cmd *cmd)
return 0;
 }
 
-static int iscsit_handle_scsi_cmd(
-   struct iscsi_conn *conn,
-   unsigned char *buf)
+int iscsit_setup_scsi_cmd(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+ unsigned char *buf)
 {
-   int data_direction, payload_length, cmdsn_ret = 0, immed_ret;
-   struct iscsi_cmd *cmd = NULL;
+   int 

[RFC-v3 4/9] iscsi-target: Add iser-target parameter keys + setup during login

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch adds RDMAExtensions, InitiatorRecvDataSegmentLength and
TargetRecvDataSegmentLength parameters keys necessary for iser-target
login to occur.

This includes setting the necessary parameters during login path
code within iscsi_login_zero_tsih_s2(), and currently PAGE_SIZE
aligning the target's advertised MRDSL for immediate data and
unsolicited data-out incoming payloads.

v3 changes:
- Add iscsi_post_login_start_timers FIXME for ISER

v2 changes:

- Fix RDMAExtentions -> RDMAExtensions typo (andy)
- Drop unnecessary '== true' conditional checks for type bool

Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 drivers/target/iscsi/iscsi_target_core.h   |   10 +++
 drivers/target/iscsi/iscsi_target_login.c  |   77 
 drivers/target/iscsi/iscsi_target_parameters.c |   75 +--
 drivers/target/iscsi/iscsi_target_parameters.h |   16 +-
 4 files changed, 161 insertions(+), 17 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target_core.h 
b/drivers/target/iscsi/iscsi_target_core.h
index 2587677..53400b0 100644
--- a/drivers/target/iscsi/iscsi_target_core.h
+++ b/drivers/target/iscsi/iscsi_target_core.h
@@ -244,6 +244,11 @@ struct iscsi_conn_ops {
u8  IFMarker;   /* [0,1] == [No,Yes] */
u32 OFMarkInt;  /* [1..65535] */
u32 IFMarkInt;  /* [1..65535] */
+   /*
+* iSER specific connection parameters
+*/
+   u32 InitiatorRecvDataSegmentLength; /* [512..2**24-1] */
+   u32 TargetRecvDataSegmentLength;/* [512..2**24-1] */
 };
 
 struct iscsi_sess_ops {
@@ -265,6 +270,10 @@ struct iscsi_sess_ops {
u8  DataSequenceInOrder;/* [0,1] == [No,Yes] */
u8  ErrorRecoveryLevel; /* [0..2] */
u8  SessionType;/* [0,1] == [Normal,Discovery]*/
+   /*
+* iSER specific session parameters
+*/
+   u8  RDMAExtensions; /* [0,1] == [No,Yes] */
 };
 
 struct iscsi_queue_req {
@@ -284,6 +293,7 @@ struct iscsi_data_count {
 };
 
 struct iscsi_param_list {
+   booliser;
struct list_headparam_list;
struct list_headextra_response_list;
 };
diff --git a/drivers/target/iscsi/iscsi_target_login.c 
b/drivers/target/iscsi/iscsi_target_login.c
index 57f9dea..24a9e69 100644
--- a/drivers/target/iscsi/iscsi_target_login.c
+++ b/drivers/target/iscsi/iscsi_target_login.c
@@ -340,6 +340,7 @@ static int iscsi_login_zero_tsih_s2(
struct iscsi_node_attrib *na;
struct iscsi_session *sess = conn-sess;
unsigned char buf[32];
+   bool iser = false;
 
sess-tpg = conn-tpg;
 
@@ -361,7 +362,10 @@ static int iscsi_login_zero_tsih_s2(
return -1;
}
 
-   iscsi_set_keys_to_negotiate(0, conn-param_list);
+   if (conn-conn_transport-transport_type == ISCSI_INFINIBAND)
+   iser = true;
+
+   iscsi_set_keys_to_negotiate(conn-param_list, iser);
 
if (sess-sess_ops-SessionType)
return iscsi_set_keys_irrelevant_for_discovery(
@@ -399,6 +403,56 @@ static int iscsi_login_zero_tsih_s2(
 
if (iscsi_login_disable_FIM_keys(conn-param_list, conn)  0)
return -1;
+   /*
+* Set RDMAExtensions=Yes by default for iSER enabled network portals
+*/
+   if (iser) {
+   struct iscsi_param *param;
+   unsigned long mrdsl, off;
+   int rc;
+
+   sprintf(buf, RDMAExtensions=Yes);
+   if (iscsi_change_param_value(buf, conn-param_list, 0)  0) {
+   iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
+   ISCSI_LOGIN_STATUS_NO_RESOURCES);
+   return -1;
+   }
+   /*
+* Make MaxRecvDataSegmentLength PAGE_SIZE aligned for
+* Immediate Data + Unsolicitied Data-OUT if necessary..
+*/
+   param = iscsi_find_param_from_key(MaxRecvDataSegmentLength,
+ conn-param_list);
+   if (!param) {
+   iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
+   ISCSI_LOGIN_STATUS_NO_RESOURCES);
+   return -1;
+   }
+   rc = strict_strtoul(param-value, 0, mrdsl);
+   if (rc  0) {
+   iscsit_tx_login_rsp(conn, ISCSI_STATUS_CLS_TARGET_ERR,
+   ISCSI_LOGIN_STATUS_NO_RESOURCES);
+   return -1;
+   }
+   off = mrdsl % PAGE_SIZE;
+   if (!off)
+   return 0;
+
+   if (mrdsl  PAGE_SIZE)
+   mrdsl = PAGE_SIZE;
+   

[RFC-v3 3/9] iscsi-target: Initial traditional TCP conversion to iscsit_transport

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

This patch performs the initial conversion of existing traditional iscsi
to use iscsit_transport API callers.  This includes:

- iscsi-np cleanups for iscsit_transport_type
- Add iscsi-np transport calls w/ ->iscsit_setup_np() and ->iscsit_free_np()
- Convert login thread process context to use ->iscsit_accept_np() for
  connections with pre-allocated struct iscsi_conn
- Convert existing socket accept code to iscsit_accept_np()
- Convert login RX/TX callers to use ->iscsit_get_login_rx() and
  ->iscsit_put_login_tx() to exchange request/response PDUs
- Convert existing socket login RX/TX calls into iscsit_get_login_rx()
  and iscsit_put_login_tx()
- Change iscsit_close_connection() to invoke ->iscsit_free_conn() +
  iscsit_put_transport() calls.
- Add iscsit_register_transport() + iscsit_unregister_transport() calls
  to module init/exit

v2 changes:

- Update module init/exit to use register_transport() + unregister_transport()

Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 drivers/target/iscsi/iscsi_target.c|   35 ++-
 drivers/target/iscsi/iscsi_target_core.h   |   15 +-
 drivers/target/iscsi/iscsi_target_login.c  |  414 
 drivers/target/iscsi/iscsi_target_login.h  |6 +
 drivers/target/iscsi/iscsi_target_nego.c   |  167 ++-
 drivers/target/iscsi/iscsi_target_nego.h   |   11 +-
 drivers/target/iscsi/iscsi_target_parameters.c |   12 +-
 drivers/target/iscsi/iscsi_target_tpg.c|6 +-
 drivers/target/iscsi/iscsi_target_util.c   |   27 +--
 9 files changed, 378 insertions(+), 315 deletions(-)

diff --git a/drivers/target/iscsi/iscsi_target.c 
b/drivers/target/iscsi/iscsi_target.c
index 7ea246a..8203bf3 100644
--- a/drivers/target/iscsi/iscsi_target.c
+++ b/drivers/target/iscsi/iscsi_target.c
@@ -49,6 +49,8 @@
 #include iscsi_target_device.h
 #include iscsi_target_stat.h
 
+#include target/iscsi/iscsi_transport.h
+
 static LIST_HEAD(g_tiqn_list);
 static LIST_HEAD(g_np_list);
 static DEFINE_SPINLOCK(tiqn_lock);
@@ -401,8 +403,7 @@ struct iscsi_np *iscsit_add_np(
spin_unlock_bh(np_lock);
 
pr_debug(CORE[0] - Added Network Portal: %s:%hu on %s\n,
-   np-np_ip, np-np_port, (np-np_network_transport == ISCSI_TCP) 
?
-   TCP : SCTP);
+   np-np_ip, np-np_port, np-np_transport-name);
 
return np;
 }
@@ -441,11 +442,10 @@ int iscsit_reset_np_thread(
return 0;
 }
 
-static int iscsit_del_np_comm(struct iscsi_np *np)
+static void iscsit_free_np(struct iscsi_np *np)
 {
if (np-np_socket)
sock_release(np-np_socket);
-   return 0;
 }
 
 int iscsit_del_np(struct iscsi_np *np)
@@ -467,20 +467,32 @@ int iscsit_del_np(struct iscsi_np *np)
send_sig(SIGINT, np-np_thread, 1);
kthread_stop(np-np_thread);
}
-   iscsit_del_np_comm(np);
+
+   np-np_transport-iscsit_free_np(np);
 
spin_lock_bh(np_lock);
list_del(np-np_list);
spin_unlock_bh(np_lock);
 
pr_debug(CORE[0] - Removed Network Portal: %s:%hu on %s\n,
-   np-np_ip, np-np_port, (np-np_network_transport == ISCSI_TCP) 
?
-   TCP : SCTP);
+   np-np_ip, np-np_port, np-np_transport-name);
 
+   iscsit_put_transport(np-np_transport);
kfree(np);
return 0;
 }
 
+static struct iscsit_transport iscsi_target_transport = {
+   .name   = iSCSI/TCP,
+   .transport_type = ISCSI_TCP,
+   .owner  = NULL,
+   .iscsit_setup_np= iscsit_setup_np,
+   .iscsit_accept_np   = iscsit_accept_np,
+   .iscsit_free_np = iscsit_free_np,
+   .iscsit_get_login_rx= iscsit_get_login_rx,
+   .iscsit_put_login_tx= iscsit_put_login_tx,
+};
+
 static int __init iscsi_target_init_module(void)
 {
int ret = 0;
@@ -557,6 +569,8 @@ static int __init iscsi_target_init_module(void)
goto ooo_out;
}
 
+   iscsit_register_transport(iscsi_target_transport);
+
if (iscsit_load_discovery_tpg()  0)
goto r2t_out;
 
@@ -587,6 +601,7 @@ static void __exit iscsi_target_cleanup_module(void)
iscsi_deallocate_thread_sets();
iscsi_thread_set_free();
iscsit_release_discovery_tpg();
+   iscsit_unregister_transport(iscsi_target_transport);
kmem_cache_destroy(lio_cmd_cache);
kmem_cache_destroy(lio_qr_cache);
kmem_cache_destroy(lio_dr_cache);
@@ -4053,6 +4068,12 @@ int iscsit_close_connection(
 
if (conn-sock)
sock_release(conn-sock);
+
+   if (conn-conn_transport-iscsit_free_conn)
+   conn-conn_transport-iscsit_free_conn(conn);
+
+   iscsit_put_transport(conn-conn_transport);
+
conn-thread_set = NULL;
 
pr_debug(Moving to TARG_CONN_STATE_FREE.\n);
diff --git a/drivers/target/iscsi/iscsi_target_core.h 

[RFC-v3 1/9] target: Add export of target_get_sess_cmd symbol

2013-04-04 Thread Nicholas A. Bellinger
From: Nicholas Bellinger n...@linux-iscsi.org

Export target_get_sess_cmd() symbol so that it can be used by
iscsi-target.

Signed-off-by: Nicholas Bellinger n...@linux-iscsi.org
---
 drivers/target/target_core_transport.c |4 ++--
 include/target/target_core_fabric.h|2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_transport.c 
b/drivers/target/target_core_transport.c
index 493e9e5..f8388b4 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -65,7 +65,6 @@ static void transport_complete_task_attr(struct se_cmd *cmd);
 static void transport_handle_queue_full(struct se_cmd *cmd,
struct se_device *dev);
 static int transport_generic_get_mem(struct se_cmd *cmd);
-static int target_get_sess_cmd(struct se_session *, struct se_cmd *, bool);
 static void transport_put_cmd(struct se_cmd *cmd);
 static void target_complete_ok_work(struct work_struct *work);
 
@@ -2179,7 +2178,7 @@ EXPORT_SYMBOL(transport_generic_free_cmd);
  * @se_cmd:command descriptor to add
  * @ack_kref:  Signal that fabric will perform an ack target_put_sess_cmd()
  */
-static int target_get_sess_cmd(struct se_session *se_sess, struct se_cmd 
*se_cmd,
+int target_get_sess_cmd(struct se_session *se_sess, struct se_cmd *se_cmd,
   bool ack_kref)
 {
unsigned long flags;
@@ -2208,6 +2207,7 @@ out:
spin_unlock_irqrestore(se_sess-sess_cmd_lock, flags);
return ret;
 }
+EXPORT_SYMBOL(target_get_sess_cmd);
 
 static void target_release_cmd_kref(struct kref *kref)
 {
diff --git a/include/target/target_core_fabric.h 
b/include/target/target_core_fabric.h
index aaa1ee6..ba3471b 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -120,7 +120,7 @@ booltransport_wait_for_tasks(struct se_cmd *);
 inttransport_check_aborted_status(struct se_cmd *, int);
 inttransport_send_check_condition_and_sense(struct se_cmd *,
sense_reason_t, int);
-
+inttarget_get_sess_cmd(struct se_session *, struct se_cmd *, bool);
 inttarget_put_sess_cmd(struct se_session *, struct se_cmd *);
 void   target_sess_cmd_list_set_waiting(struct se_session *);
 void   target_wait_for_sess_cmds(struct se_session *, int);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Different signatures to ibv_open_xrcd - branch issue?

2013-04-04 Thread Dominique Martinet
Hi,

Hefty, Sean wrote on Tue, Mar 26, 2013 :
  I think I missed something here, can I use your branch xrc2 of
  libibverbs for testing purpose, or there are still things to merge there?
 
 The xrc2 branch is usable.  I'm about 10-15 minutes away from posting v5 of 
 those patches.

Sorry for hijacking this thread, but I was actually running into some
troubles with libibverbs-1.1.4 and a recent kernel, using
ibv_open_xrc_domain... Which is basically failing with an invalid
argument when writing to the uverb0 char device. 

So I guess the commands (IB_USER_VERBS_CMD_OPEN_XRCD vs
IB_USER_VERBS_CMD_OPEN_XRC_DOMAIN) changed since 2.6.32 and I would need
to use this libibverbs branch for now? (doesn't look like there is any
xrcd at all in 1.1.6 anyway)
What I don't get is that the abi_version did not change at all, if the
commands changed, shouldn't the kernel abi version be updated as well?
(None of the four abi_version I could find (ib_verbs, ib_mad, rdma_cm,
uverbs0 in the device) changed at all)

This looks like it's impossible to get the same library to fully work
for multiple kernel versions, while all the other basic features do work
quite well independently.



Regards,
-- 
Dominique Martinet
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] infiniband-diags/saquery.c: switchinfo support added

2013-04-04 Thread Husam Kahalah

Added support to filter switchInfoRecords by switch LID

Signed-off-by: Husam kahalah  hkaha...@asaltech.com

---
 src/saquery.c |   55 
+++

 1 file changed, 55 insertions(+)

diff --git a/src/saquery.c b/src/saquery.c
index 72a0fe9..832bec6 100644
--- a/src/saquery.c
+++ b/src/saquery.c
@@ -487,6 +487,41 @@ static void dump_service_record(void *data)
cl_ntoh64(p_sr-service_data64[1]));
 }

+static void dump_switch_info_record(void *data)
+{
+ib_switch_info_record_t *p_sir = data;
+
+printf(SwitchInfoRecord dump:\n
+   \t\tRID\n
+   \t\tlid.%u\n
+   \t\tSwitchInfo dump:\n
+   \t\tlin_cap.0x%X\n
+   \t\trand_cap0x%X\n
+   \t\tmcast_cap...0x%X\n
+   \t\tlin_top.0x%X\n
+   \t\tdef_port%u\n
+   \t\tdef_mcast_pri_port..%u\n
+   \t\tdef_mcast_not_port..%u\n
+   \t\tlife_state..%u\n
+   \t\tlids_per_port...0x%X\n
+   \t\tenforce_cap.0x%X\n
+   \t\tflags...%u\n
+   \t\tmcast_top...0x%X\n,
+   cl_ntoh16(p_sir-lid),
+   cl_ntoh16(p_sir-switch_info.lin_cap),
+   cl_ntoh16(p_sir-switch_info.rand_cap),
+   cl_ntoh16(p_sir-switch_info.mcast_cap),
+   cl_ntoh16(p_sir-switch_info.lin_top),
+   p_sir-switch_info.def_port,
+   p_sir-switch_info.def_mcast_pri_port,
+   p_sir-switch_info.def_mcast_not_port,
+   p_sir-switch_info.life_state,
+   cl_ntoh16(p_sir-switch_info.lids_per_port),
+   cl_ntoh16(p_sir-switch_info.enforce_cap),
+   p_sir-switch_info.flags,
+   cl_ntoh16(p_sir-switch_info.mcast_top));
+}
+
 static void dump_inform_info_record(void *data)
 {
 char gid_str[INET6_ADDRSTRLEN];
@@ -1157,6 +1192,24 @@ static int query_service_records(const struct 
query_cmd *q, struct sa_handle * h

 dump_service_record);
 }

+static int query_switchinfo_records(const struct query_cmd *q,
+ struct sa_handle * h, struct query_params *p,
+ int argc, char *argv[])
+{
+ib_switch_info_record_t swir;
+ib_net64_t comp_mask = 0;
+int lid = 0;
+
+if (argc  0)
+parse_lid_and_ports(h, argv[0], lid, NULL, NULL);
+
+memset(swir, 0, sizeof(swir));
+CHECK_AND_SET_VAL(lid, 16, 0, swir.lid, SWIR, LID);
+
+return get_and_dump_any_records(h, IB_SA_ATTR_SWITCHINFORECORD, 0, 
comp_mask,

+  swir, sizeof(swir), dump_switch_info_record);
+}
+
 static int query_inform_info_records(const struct query_cmd *q,
 struct sa_handle * h, struct query_params *p,
 int argc, char *argv[])
@@ -1349,6 +1402,8 @@ static const struct query_cmd query_cmds[] = {
  [[mlid]/[position]/[block]], query_mft_records},
 {GUIDInfoRecord, GIR, IB_SA_ATTR_GUIDINFORECORD,
  [[lid]/[block]], query_guidinfo_records},
+{SwitchInfoRecord, SWIR, IB_SA_ATTR_SWITCHINFORECORD,
+ [lid], query_switchinfo_records},
 {0}
 };

--
1.7.9.6

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 9/9] iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

2013-04-04 Thread Or Gerlitz

On 04/04/2013 10:24, Nicholas A. Bellinger wrote:

+
+void isert_cq_tx_callback(struct ib_cq *, void *);
+void isert_cq_rx_callback(struct ib_cq *, void *);
+void isert_free_rx_descriptors(struct isert_conn *);

Any reason not to have these as static functions (same for isert_cq_rx_work)?
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 9/9] iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

2013-04-04 Thread Or Gerlitz

On 04/04/2013 10:24, Nicholas A. Bellinger wrote:


+#define ISER_RECV_DATA_SEG_LEN  8192
+#define ISER_RX_PAYLOAD_SIZE(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
[...]
+#define ISER_RX_PAD_SIZE   (16384 - (ISER_RX_PAYLOAD_SIZE + \
+   sizeof(u64) + sizeof(struct ib_sge)))


We're eating too much RAM here for the pad: you need 8K + something, so
the pad can count down
from 12K rather than 16K, which means each such element will consume three
pages and not four.



+struct iser_rx_desc {
+   struct iser_hdr iser_header;
+   struct iscsi_hdr iscsi_header;
+   chardata[ISER_RECV_DATA_SEG_LEN];
+   u64 dma_addr;
+   struct ib_sge   rx_sg;
+   charpad[ISER_RX_PAD_SIZE];
+} __packed;
+
+struct isert_rx_desc {
+   struct isert_conn   *desc_conn;
+   struct work_struct  desc_work;
+   struct iser_rx_desc desc;
+} __packed;


You have more than enough room in the pad field of struct iser_rx_desc to
place the two fields
added by struct isert_rx_desc there (and you only use struct iser_rx_desc from
within isert_rx_desc) -- any reason
not to unify them?



Or.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 9/9] iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

2013-04-04 Thread Or Gerlitz

On 04/04/2013 10:24, Nicholas A. Bellinger wrote:

+static int
+isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+   struct isert_cmd *isert_cmd = container_of(cmd,
+   struct isert_cmd, iscsi_cmd);
+   struct isert_conn *isert_conn = (struct isert_conn *)conn-context;
+   struct ib_send_wr *send_wr = isert_cmd-tx_desc.send_wr;
+   struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *)
+   isert_cmd-tx_desc.iscsi_header;
+
+   isert_create_send_desc(isert_conn, isert_cmd, isert_cmd-tx_desc);
+   iscsit_build_rsp_pdu(cmd, conn, true, hdr);
+   isert_init_tx_hdrs(isert_conn, isert_cmd-tx_desc);
+   /*
+* Attach SENSE DATA payload to iSCSI Response PDU
+*/
+   if (cmd-se_cmd.sense_buffer 
+   ((cmd-se_cmd.se_cmd_flags  SCF_TRANSPORT_TASK_SENSE) ||
+   (cmd-se_cmd.se_cmd_flags  SCF_EMULATED_TASK_SENSE))) {
+   struct ib_device *ib_dev = isert_conn-conn_cm_id-device;
+   struct ib_sge *tx_dsg = isert_cmd-tx_desc.tx_sg[1];
+   u32 padding, sense_len;
+
+   put_unaligned_be16(cmd-se_cmd.scsi_sense_length,
+  cmd-sense_buffer);
+   cmd-se_cmd.scsi_sense_length += sizeof(__be16);
+
+   padding = -(cmd-se_cmd.scsi_sense_length)  3;
+   hton24(hdr-dlength, (u32)cmd-se_cmd.scsi_sense_length);
+   sense_len = cmd-se_cmd.scsi_sense_length + padding;
+
+   isert_cmd-sense_buf_dma = ib_dma_map_single(ib_dev,
+   (void *)cmd-sense_buffer, sense_len,
+   DMA_TO_DEVICE);
+
+   isert_cmd-sense_buf_len = sense_len;
+   ib_dma_sync_single_for_cpu(ib_dev, isert_cmd-sense_buf_dma,
+  sense_len, DMA_TO_DEVICE);
+   ib_dma_sync_single_for_device(ib_dev, isert_cmd-sense_buf_dma,
+ sense_len, DMA_TO_DEVICE);
+


You just called dma_map_single and are not going to touch the buffer before
posting it to the wire,
so there's no point in syncing it for the CPU and for the device; remove these
calls.




+   tx_dsg-addr = isert_cmd-sense_buf_dma;
+   tx_dsg-length   = sense_len;
+   tx_dsg-lkey = isert_conn-conn_mr-lkey;
+   isert_cmd-tx_desc.num_sge = 2;
+   }
+
+   isert_init_send_wr(isert_cmd, send_wr);
+
+   pr_debug(Posting SCSI Response IB_WR_SEND \n);
+
+   return isert_post_response(isert_conn, isert_cmd);
+}


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH opensm] Setup SM port GUID in subnet object as soon as it is known

2013-04-04 Thread Hal Rosenstock

rather than wait for local NodeInfo response
This allows plugin to know SM port GUID at construct() time

Signed-off-by: Hal Rosenstock h...@mellanox.com
---
diff --git a/include/opensm/osm_opensm.h b/include/opensm/osm_opensm.h
index 3c8bc59..4750505 100644
--- a/include/opensm/osm_opensm.h
+++ b/include/opensm/osm_opensm.h
@@ -219,8 +219,10 @@ typedef struct osm_opensm {
osm_congestion_control_t cc;
cl_qlist_t plugin_list;
osm_db_t db;
+   boolean_t mad_pool_constructed;
osm_mad_pool_t mad_pool;
osm_vendor_t *p_vendor;
+   boolean_t vl15_constructed;
osm_vl15_t vl15;
osm_log_t log;
cl_dispatcher_t disp;
@@ -313,6 +315,34 @@ void osm_opensm_construct(IN osm_opensm_t * p_osm);
 *  SM object, osm_opensm_init, osm_opensm_destroy
 */
 
+/f* OpenSM: OpenSM/osm_opensm_construct_finish
+* NAME
+*  osm_opensm_construct_finish
+*
+* DESCRIPTION
+*  The osm_opensm_construct_finish function completes
+*  the second phase of constucting an OpenSM object.
+*
+* SYNOPSIS
+*/
+void osm_opensm_construct_finish(IN osm_opensm_t * p_osm);
+/*
+* PARAMETERS
+*  p_osm
+*  [in] Pointer to a OpenSM object to construct.
+*
+* RETURN VALUE
+*  This function does not return a value.
+*
+* NOTES
+*  Calling osm_opensm_construct/osm_construct_finish is a prerequisite
+*  to calling any other method except 
osm_opensm_init/osm_opensm_init_finish.
+*
+* SEE ALSO
+*  SM object, osm_opensm_init, osm_opensm_construct_finish,
+*  osm_opensm_destroy, osm_opensm_destroy_finish
+*/
+
 /f* OpenSM: OpenSM/osm_opensm_destroy
 * NAME
 *  osm_opensm_destroy
@@ -342,6 +372,36 @@ void osm_opensm_destroy(IN osm_opensm_t * p_osm);
 *  SM object, osm_opensm_construct, osm_opensm_init
 */
 
+/f* OpenSM: OpenSM/osm_opensm_destroy_finish
+* NAME
+*  osm_opensm_destroy_finish
+*
+* DESCRIPTION
+*  The osm_opensm_destroy_finish function handles the second phase
+*  of destroying an SM, releasing all resources.
+*
+* SYNOPSIS
+*/
+void osm_opensm_destroy_finish(IN osm_opensm_t * p_osm);
+/*
+* PARAMETERS
+*  p_osm
+*  [in] Pointer to a OpenSM object to destroy.
+*
+* RETURN VALUE
+*  This function does not return a value.
+*
+* NOTES
+*  Performs second phase of any necessary cleanup of the specified OpenSM 
object.
+*  Further operations should not be attempted on the destroyed object.
+*  This function should only be called after a call to
+*  osm_opensm_construct_finish or osm_opensm_init_finish.
+*
+* SEE ALSO
+*  SM object, osm_opensm_construct, osm_opensm_construct_finish,
+*  osm_opensm_init, osm_opensm_init_finish
+*/
+
 /f* OpenSM: OpenSM/osm_opensm_init
 * NAME
 *  osm_opensm_init
@@ -371,6 +431,37 @@ ib_api_status_t osm_opensm_init(IN osm_opensm_t * p_osm,
 *  SM object, osm_opensm_construct, osm_opensm_destroy
 */
 
+/f* OpenSM: OpenSM/osm_opensm_init_finish
+* NAME
+*  osm_opensm_init_finish
+*
+* DESCRIPTION
+*  The osm_opensm_init_finish function performs the second phase
+*  of initialization of an OpenSM object.
+*
+* SYNOPSIS
+*/
+ib_api_status_t osm_opensm_init_finish(IN osm_opensm_t * p_osm,
+  IN const osm_subn_opt_t * p_opt);
+/*
+* PARAMETERS
+*  p_osm
+*  [in] Pointer to an osm_opensm_t object to initialize.
+*
+*  p_opt
+*  [in] Pointer to the subnet options structure.
+*
+* RETURN VALUES
+*  IB_SUCCESS if the OpenSM object was initialized successfully.
+*
+* NOTES
+*  Allows calling other OpenSM methods.
+*
+* SEE ALSO
+*  SM object, osm_opensm_construct, osm_opensm_construct_finish,
+*  osm_opensm_destroy, osm_opensm_destroy_finish
+*/
+
 /f* OpenSM: OpenSM/osm_opensm_sweep
 * NAME
 *  osm_opensm_sweep
diff --git a/opensm/main.c b/opensm/main.c
index 92b1c03..9349d79 100644
--- a/opensm/main.c
+++ b/opensm/main.c
@@ -1193,7 +1193,16 @@ int main(int argc, char *argv[])
opt.guid = get_port_guid(osm, opt.guid);
 
if (opt.guid == 0)
-   goto Exit;
+   goto Exit2;
+
+   status = osm_opensm_init_finish(osm, opt);
+   if (status != IB_SUCCESS) {
+   const char *err_str = ib_get_err_str(status);
+   if (err_str == NULL)
+   err_str = Unknown Error Type;
+   printf(\nError from osm_opensm_init_finish: %s.\n, err_str);
+   goto Exit2;
+   }
 
status = osm_opensm_bind(osm, opt.guid);
if (status != IB_SUCCESS) {
@@ -1237,6 +1246,8 @@ int main(int argc, char *argv[])
 
 Exit:
osm_opensm_destroy(osm);
+Exit2:
+   osm_opensm_destroy_finish(osm);
complib_exit();
remove_pidfile();
 
diff --git a/opensm/osm_opensm.c b/opensm/osm_opensm.c
index 0909a36..06a5af2 100644
--- a/opensm/osm_opensm.c
+++ 

Re: [PATCH] libibumad: document the setting of errno for umad_send and umad_recv

2013-04-04 Thread Hal Rosenstock
On 3/29/2013 2:21 PM, Ira Weiny wrote:
 
 
 Signed-off-by: Ira Weiny ira.we...@intel.com

Thanks. Applied.

-- Hal
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] Add IBV_*_USNIC enums for the Cisco Ethernet Virtual NIC.

2013-04-04 Thread Jeff Squyres (jsquyres)
On Apr 3, 2013, at 2:45 PM, Or Gerlitz or.gerl...@gmail.com wrote:

 Jeff, I agree with Sean, there's not much point to review/discuss
 these general/pre-step patches without seeing some actual device
 specific kernel (if there are such or user space code if there aren't
 any kernel ones) code. e.g you can submit the two kernel pre-step
 patches as the two first pieces in a series that has the driver code.


Unfortunately, not yet.

I just sent another mail that explained our rationale: our kernel driver and 
libibverbs plugin code are working their way through QA.  It'll take a little 
time before we can submit good patches for these.  The main driving factor for 
submitting these new enums is so that they can be included in RHEL 6.5.

-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Ad IB_MTU_1500|9000 enums.

2013-04-04 Thread Hal Rosenstock
On 4/4/2013 8:22 AM, Jeff Squyres (jsquyres) wrote:
 On Apr 3, 2013, at 12:52 PM, Roland Dreier rol...@purestorage.com wrote:
 
 I don't think we can blithely do this... I think the IB enum values
 are defined to match the values used in the IB spec (PathRecord etc).
 
 Gotcha.  I inserted the enums in their proper numerical order to make the 
 range comparisons simpler in ib_addr.h.  But the 1500/9000 values could be 
 tacked at the end of the current values (e.g., 6 and 7, respectively) -- it 
 would just necessitate some different changes in ib_addr.h.

What happens if in the future the IBTA adds new MTUs and allocates those
currently reserved MTU values ? Wouldn't those values need to be
standardized at the IBTA so that conflict won't occur ?

-- Hal
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Fail post send command on error recovery

2013-04-04 Thread Jack Morgenstein
On Thursday 04 April 2013 16:01, Kleber Sacilotto de Souza wrote:
 On 04/02/2013 02:00 PM, Roland Dreier wrote:
  diff --git a/drivers/infiniband/hw/mlx4/qp.c 
  b/drivers/infiniband/hw/mlx4/qp.c
  index 35cced2..0fa4f72 100644
  --- a/drivers/infiniband/hw/mlx4/qp.c
  +++ b/drivers/infiniband/hw/mlx4/qp.c
  @@ -2216,6 +2216,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct 
  ib_send_wr *wr,
  __be32 blh;
  int i;
 
  +   if (pci_channel_offline(to_mdev(ibqp-device)-dev-pdev))
  +   return -EIO;
  +
  spin_lock_irqsave(qp-sq.lock, flags);
 
  ind = qp-sq_next_wqe;
  
  To pile on to what Or and Jack asked, why here?  Why not in post_recv?
   Why not in mlx4_en?  What about userspace consumers?  What if the
  error condition triggers just after the pci_channel_offline() check?
  What if a command is queued but a PCI error occurs before the
  completion can be returned?
  
  Is there some practical scenario where this change makes a difference?
  
  I would assume that in case of a PCI error, the driver would notice a
  catastrophic error and send that asynchronous event to consumers, who
  would know that commands might have been lost.
  
 
 The problem that I'm trying to solve is that some IB core modules are
 hanging waiting on completion queues on their remove path during error
 recovery. I've added the pci offline check in post_send, which seemed to
 have solved the problem, but while running other tests I was able to
 hit the bug again. Adding the check in post_recv also only hid the
 problem for a few testcases.
 
 Adding any check in mlx4_en doesn't make sense in this case, because the
 problem is only with IB adapters. The ethernet/RoCE adapters are
 recovering fine, the check has been added already on the relevant places
 in mlx4_core.
 
 What async event should be sent to consumers before calling the remove
 functions? IB_EVENT_DEVICE_FATAL, which is currently sent by mlx4_core
 in case of catastrophic error (but not in PCI error recovery), doesn't
 seem to be handled by most of the event handlers registered. Sending
 IB_EVENT_PORT_ERR seems to solve the problem for most modules, but
 rdma_cm, which doesn't have an event handler, is still hanging. Should
 we implement an event handler for rdma_cm?


This won't really help unless ALL userspace apps respond by calling 
ibv_close_device.
You can check this by running ibv_asyncwatch  (in libibverbs/examples). Until 
ibv_asyncwatch
is exited the low-level device restart won't work.

-Jack
 
 Thanks!
 
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 1/4] Add IBV_*_USNIC enums for the Cisco Ethernet Virtual NIC.

2013-04-04 Thread Hefty, Sean
 The reason we're asking for these IBV_*_USNIC enums now -- before we've
 submitted the driver -- is because we're targeting getting our driver included
 in RHEL 6.5.  There's a bit of a chicken-and-egg issue here: they'll accept 
 our
 patches for a new hardware driver while that driver is being worked upstream.
 But they (rightfully) won't accept patches to IB core and libibverbs until
 they've been vetted by the community.  Hence, even though our driver is slowly
 working its way through QA and not available yet, we wanted to submit these 
 new
 enums upstream for community approval so that they can be included in RHEL 
 6.5.

I understand the issue.

In the end, these are kernel changes with no actual users of those changes...  
But then they are also just small changes to a framework...

Just thinking aloud here, but what if we added 'RDMA_NODE_VENDOR' instead?  
Then other fields, such as transport, become vendor specific.

- Sean
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 2/2] Ad IB_MTU_1500|9000 enums.

2013-04-04 Thread Hefty, Sean
 What happens if in the future the IBTA adds new MTUs and allocates those
 currently reserved MTU values ? Wouldn't those values need to be
 standardized at the IBTA so that conflict won't occur ?

The IBTA needs to standardize the values as they appear in MADs.  The software 
values can differ.  They would just need to be mapped.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 2/2] Ad IB_MTU_1500|9000 enums.

2013-04-04 Thread Weiny, Ira
 -Original Message-
 From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
 
  What happens if in the future the IBTA adds new MTUs and allocates
  those currently reserved MTU values ? Wouldn't those values need to be
  standardized at the IBTA so that conflict won't occur ?
 
 The IBTA needs to standardize the values as they appear in MADs.  The
 software values can differ.  They would just need to be mapped.

Even with a map I think having IB_MTU_1500 will cause some confusion as this is 
not an IB MTU.  It seems an alternate enum name like RDMA_MTU_1500 is better.

Just a stupid user's opinion,
Ira
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] opensm: Add configure output messages for several configure options

2013-04-04 Thread Albert Chu
The --enable-console-loopback, --enable-console-socket, --enable-perf-mgr,
--enable-perf-mgr-profile, and --enable-default-event-plugin options did not
have a checking message or notice message indicating whether they were
enabled/disabled when running configure.

This made things difficult when perusing old build logs to determine
if things were enabled/disabled.

Signed-off-by: Albert Chu ch...@llnl.gov
---
 config/osmvsel.m4 |   14 ++
 1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/config/osmvsel.m4 b/config/osmvsel.m4
index 6c98c07..f249b1c 100644
--- a/config/osmvsel.m4
+++ b/config/osmvsel.m4
@@ -183,6 +183,7 @@ AC_DEFUN([OPENIB_OSM_CONSOLE_SOCKET_SEL], [
 # --- BEGIN OPENIB_OSM_CONSOLE_SOCKET_SEL ---
 
 dnl Console over a loopback socket is default if libwrap is available
+AC_MSG_CHECKING([to enable console loopback])
 AC_ARG_ENABLE(console-loopback,
 [  --enable-console-loopback Enable a console socket on the loopback 
interface, requires tcp_wrappers (default yes)],
 [case $enableval in
@@ -190,6 +191,7 @@ AC_ARG_ENABLE(console-loopback,
  no)  console_loopback=no ;;
esac],
console_loopback=yes)
+AC_MSG_RESULT([yes])
 
 if test $console_loopback = yes; then
 AC_CHECK_LIB(wrap, request_init, [], [console_loopback=no
@@ -202,6 +204,7 @@ if test $console_loopback = yes; then
 fi
 
 dnl Console over a socket connection
+AC_MSG_CHECKING([to enable console socket])
 AC_ARG_ENABLE(console-socket,
 [  --enable-console-socket Enable a console socket, requires 
--enable-console-loopback (default no)],
 [case $enableval in
@@ -209,6 +212,8 @@ AC_ARG_ENABLE(console-socket,
  no)  console_socket=no ;;
esac],
console_socket=no)
+AC_MSG_RESULT([no])
+
 if test $console_socket = yes; then
   if test $console_loopback = no; then
 AC_MSG_ERROR([--enable-console-socket requires --enable-console-loopback])
@@ -228,6 +233,7 @@ AC_DEFUN([OPENIB_OSM_PERF_MGR_SEL], [
 # --- BEGIN OPENIB_OSM_PERF_MGR_SEL ---
 
 dnl enable the perf-mgr
+AC_MSG_CHECKING([to enable perf mgr])
 AC_ARG_ENABLE(perf-mgr,
 [  --enable-perf-mgr Enable the performance manager (default yes)],
[case $enableval in
@@ -235,6 +241,9 @@ AC_ARG_ENABLE(perf-mgr,
  no)  perf_mgr=no ;;
esac],
perf_mgr=yes)
+AC_MSG_RESULT([yes])
+
+AC_MSG_CHECKING([to enable perf mgr profiling])
 AC_ARG_ENABLE(perf-mgr-profile,
 [  --enable-perf-mgr-profile Enable the performance manager profiling (default 
no)],
[case $enableval in
@@ -242,6 +251,8 @@ AC_ARG_ENABLE(perf-mgr-profile,
no)  perf_mgr_profile=no ;;
esac],
perf_mgr_profile=no)
+AC_MSG_RESULT([no])
+
 if test $perf_mgr = yes; then
   AC_DEFINE(ENABLE_OSM_PERF_MGR,
1,
@@ -261,6 +272,7 @@ AC_DEFUN([OPENIB_OSM_DEFAULT_EVENT_PLUGIN_SEL], [
 # --- BEGIN OPENIB_OSM_DEFAULT_EVENT_PLUGIN_SEL ---
 
 dnl enable the default-event-plugin
+AC_MSG_CHECKING([to enable default event plugin])
 AC_ARG_ENABLE(default-event-plugin,
 [  --enable-default-event-plugin  Enable a default event plugin 
osmeventplugin (default no)],
[case $enableval in
@@ -268,6 +280,8 @@ AC_ARG_ENABLE(default-event-plugin,
  no)  default_event_plugin=no ;;
esac],
default_event_plugin=no)
+AC_MSG_RESULT([no])
+
 if test $default_event_plugin = yes; then
   AC_DEFINE(ENABLE_OSM_DEFAULT_EVENT_PLUGIN,
1,
-- 
1.7.1



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 2/2] Ad IB_MTU_1500|9000 enums.

2013-04-04 Thread Hefty, Sean
 Even with a map I think having IB_MTU_1500 will cause some confusion as this 
 is
 not an IB MTU.  It seems an alternate enum name like RDMA_MTU_1500 is 
 better.

Couldn't these be usable MTU's for RoCE?

In hindsight, the user space API never should have exposed the mtu as an enum...

Since an enum is an int, and we're never going to have anything with an mtu <= 
5 bytes, couldn't we just store all new mtu values directly as their byte value?
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Using XRC between kernel and user space.

2013-04-04 Thread Shlomo Pongratz

Hi,

I wonder how a kernel program and a user space one can communicate
using XRC.
An example of such an application is an iSER kernel initiator connected to
a tgtd target that runs in user space.
The XRC domain in the kernel is created with struct ib_xrcd 
*ib_alloc_xrcd(struct ib_device *device) while in the user space it is 
created with static inline struct ibv_xrcd * ibv_open_xrcd(struct 
ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr).
Looking at the xrc_pingpong.c example, we can see that the attribute
holds a file descriptor for a file (/tmp/xrc_domain).
So how can a kernel program on one machine communicate with a user
space program on another one?
The XRC annex (page 9) states that all QPs should belong to the same XRC
domain.


S.P.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH 2/2] Ad IB_MTU_1500|9000 enums.

2013-04-04 Thread Weiny, Ira
 -Original Message-
 From: Hefty, Sean
 
  Even with a map I think having IB_MTU_1500 will cause some confusion
  as this is not an IB MTU.  It seems an alternate enum name like
 RDMA_MTU_1500 is better.
 
 Couldn't these be usable MTU's for RoCE?

I guess so, I don't have much experience with RoCE.  If that is the case the 
RoCE annex might look at reserving these values in the spec?

 
 In hindsight, the user space API never should have exposed the mtu as an
 enum...
 
 Since an enum is an int, and we're never going to have anything with an mtu
 <= 5 bytes, couldn't we just store all new mtu values directly as their byte
 value?

That seems like a pretty good idea.

Ira
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IPoIB - GRO forces memcpy inside __pskb_pull_tail

2013-04-04 Thread Roland Dreier
On Wed, Apr 3, 2013 at 12:03 PM, Markus Stockhausen
markus.stockhau...@gmx.de wrote:
 going through hard lessons to understand the SKBs maybe I finally
 found the reason for the unnecessary memcpy commands. Even with
 newest 3.9-rc5 kernel the problem persists. IPoIB creates only
 fragmented SKBs without any single bit in the normal data part. Some
 debug messages during GRO handling showed

 skb-len = 1988 (total data)
 skb-data_len= 1988 (paged data)
 skb_headlen(skb) = 0(non paged data)

 inet_gro_receive() requires the IP header inside the SKB. So it
 pulls missing data from fragments. This process requires extra
 memcpy operations.

Thanks for the really detailed investigation!  This makes sense.

I'll send out a proposed patch for you to test in a moment.

Thanks!
  Roland
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH/RFC] IPoIB: Leave space in skb linear buffer for IP headers

2013-04-04 Thread Roland Dreier
From: Roland Dreier rol...@purestorage.com

Markus Stockhausen markus.stockhau...@gmx.de noticed that IPoIB was
spending significant time doing memcpy() in __pskb_pull_tail().  He
found that this is because his adapter reports a maximum MTU of 4K,
which causes IPoIB datagram mode to receive all the actual data in a
separate page in the fragment list.

We're already allocating extra tailroom for the skb linear part, so we
might as well use it.

Cc: Eric Dumazet eduma...@google.com
Reported-by: Markus Stockhausen markus.stockhau...@gmx.de
Signed-off-by: Roland Dreier rol...@purestorage.com
---
 drivers/infiniband/ulp/ipoib/ipoib.h|  3 ++-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 +++-
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
b/drivers/infiniband/ulp/ipoib/ipoib.h
index eb71aaa..ab2cc4c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -64,7 +64,8 @@ enum ipoib_flush_level {
 enum {
IPOIB_ENCAP_LEN   = 4,
 
-   IPOIB_UD_HEAD_SIZE= IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+   /* add 128 bytes of tailroom for IP/TCP headers */
+   IPOIB_UD_HEAD_SIZE= IB_GRH_BYTES + IPOIB_ENCAP_LEN + 128,
IPOIB_UD_RX_SG= 2, /* max buffer needed for 4K mtu */
 
IPOIB_CM_MTU  = 0x1 - 0x10, /* padding to align header 
to 16 */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c 
b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 2cfa76f..9eaa58e 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -156,18 +156,12 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct 
net_device *dev, int id)
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct sk_buff *skb;
int buf_size;
-   int tailroom;
u64 *mapping;
 
-   if (ipoib_ud_need_sg(priv-max_ib_mtu)) {
-   buf_size = IPOIB_UD_HEAD_SIZE;
-   tailroom = 128; /* reserve some tailroom for IP/TCP headers */
-   } else {
-   buf_size = IPOIB_UD_BUF_SIZE(priv-max_ib_mtu);
-   tailroom = 0;
-   }
+   buf_size = ipoib_ud_need_sg(priv-max_ib_mtu) ?
+   IPOIB_UD_HEAD_SIZE : IPOIB_UD_BUF_SIZE(priv-max_ib_mtu);
 
-   skb = dev_alloc_skb(buf_size + tailroom + 4);
+   skb = dev_alloc_skb(buf_size + 4);
if (unlikely(!skb))
return NULL;
 
-- 
1.8.1.2

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH opensm] Change LFT event to be per block/per switch rather than just per switch

2013-04-04 Thread Weiny, Ira
This changes the data sent by OSM_EVENT_ID_LFT_CHANGE.  I think this is a
dangerous precedent.  Even though plugins must be coded to specific versions of
OpenSM, I don't think they will fail to compile with this change, and users would
get random behavior when they try to use the event_data passed.

I think it might be more appropriate to define a new event id say 
OSM_EVENT_ID_LFT_BLOCK_CHANGE.

Ira

 -Original Message-
 From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
 
 
 Eliminates expensive LFT comparison to determine changed blocks
 
 Signed-off-by: Hal Rosenstock h...@mellanox.com
 ---
 diff --git a/include/opensm/osm_event_plugin.h
 b/include/opensm/osm_event_plugin.h
 index c9a904b..3ece2f0 100644
 --- a/include/opensm/osm_event_plugin.h
 +++ b/include/opensm/osm_event_plugin.h
 @@ -39,6 +39,7 @@
  #include iba/ib_types.h
  #include complib/cl_qlist.h
  #include opensm/osm_config.h
 +#include opensm/osm_switch.h
 
  #ifdef __cplusplus
  #  define BEGIN_C_DECLS extern C {
 @@ -87,6 +88,18 @@ typedef struct osm_epi_port_id {
   char node_name[OSM_EPI_NODE_NAME_LEN];  }
 osm_epi_port_id_t;
 
 +typedef enum {
 + LFT_CHANGED_LFT_TOP = (1  0),
 + LFT_CHANGED_BLOCK = (1  1)
 +} osm_epi_lft_change_flags_t;
 +
 +typedef struct osm_epi_lft_change_event {
 + osm_switch_t *p_sw;
 + osm_epi_lft_change_flags_t flags;
 + uint16_t lft_top;
 + uint32_t block_num;
 +} osm_epi_lft_change_event_t;
 +
  /**
 ==
 ===
   * Port error event
   * OSM_EVENT_ID_PORT_COUNTER
 diff --git a/include/opensm/osm_madw.h b/include/opensm/osm_madw.h
 index 5d78eaa..fd6ba7f 100644
 --- a/include/opensm/osm_madw.h
 +++ b/include/opensm/osm_madw.h
 @@ -229,6 +229,7 @@ typedef struct osm_si_context {
   ib_net64_t node_guid;
   boolean_t set_method;
   boolean_t light_sweep;
 + boolean_t lft_top_change;
  } osm_si_context_t;
  /*/
 
 diff --git a/include/opensm/osm_switch.h b/include/opensm/osm_switch.h
 index 6e8a87e..41ac959 100644
 --- a/include/opensm/osm_switch.h
 +++ b/include/opensm/osm_switch.h
 @@ -104,7 +104,6 @@ typedef struct osm_switch {
   uint8_t *lft;
   uint8_t *new_lft;
   uint16_t lft_size;
 - unsigned lft_change;
   osm_mcast_tbl_t mcast_tbl;
   int32_t mft_block_num;
   uint32_t mft_position;
 diff --git a/opensm/osm_lin_fwd_rcv.c b/opensm/osm_lin_fwd_rcv.c index
 f13b9a8..dd18c09 100644
 --- a/opensm/osm_lin_fwd_rcv.c
 +++ b/opensm/osm_lin_fwd_rcv.c
 @@ -51,6 +51,8 @@
  #define FILE_ID OSM_FILE_LIN_FWD_RCV_C
  #include opensm/osm_switch.h
  #include opensm/osm_sm.h
 +#include opensm/osm_event_plugin.h
 +#include opensm/osm_opensm.h
 
  void osm_lft_rcv_process(IN void *context, IN void *data)  { @@ -62,6 +64,7
 @@ void osm_lft_rcv_process(IN void *context, IN void *data)
   osm_lft_context_t *p_lft_context;
   uint8_t *p_block;
   ib_net64_t node_guid;
 + osm_epi_lft_change_event_t lft_change;
   ib_api_status_t status;
 
   CL_ASSERT(sm);
 @@ -89,7 +92,17 @@ void osm_lft_rcv_process(IN void *context, IN void
 *data)
   0x% PRIx64 \n, cl_ntoh64(node_guid));
   } else {
   status = osm_switch_set_lft_block(p_sw, p_block,
 block_num);
 - if (status != IB_SUCCESS) {
 + if (status == IB_SUCCESS) {
 + if (sm-p_subn-first_time_master_sweep ==
 FALSE) {
 + lft_change.p_sw = p_sw;
 + lft_change.flags = LFT_CHANGED_BLOCK;
 + lft_change.lft_top = 0;
 + lft_change.block_num = block_num;
 + osm_opensm_report_event(sm-p_subn-
 p_osm,
 +
   OSM_EVENT_ID_LFT_CHANGE,
 + lft_change);
 + }
 + } else {
   OSM_LOG(sm-p_log, OSM_LOG_ERROR, ERR 0402:
 
   Setting forwarding table block failed (%s)
   , Switch 0x% PRIx64  %s\n,
 diff --git a/opensm/osm_mcast_mgr.c b/opensm/osm_mcast_mgr.c index
 fea0a69..135b174 100644
 --- a/opensm/osm_mcast_mgr.c
 +++ b/opensm/osm_mcast_mgr.c
 @@ -1070,6 +1070,7 @@ static void mcast_mgr_set_mfttop(IN osm_sm_t *
 sm, IN osm_switch_t * p_sw)
   context.si_context.light_sweep = FALSE;
   context.si_context.node_guid =
 osm_node_get_node_guid(p_node);
   context.si_context.set_method = TRUE;
 + context.si_context.lft_top_change = FALSE;
 
   status = osm_req_set(sm, p_path, (uint8_t *)  si,
sizeof(si), IB_MAD_ATTR_SWITCH_INFO,
 diff --git a/opensm/osm_node_info_rcv.c b/opensm/osm_node_info_rcv.c
 index cb96f29..592f2de 100644
 --- a/opensm/osm_node_info_rcv.c
 +++ b/opensm/osm_node_info_rcv.c
 @@ -552,6 +552,7 @@ static void 

RE: [PATCH opensm] Change LFT event to be per block/per switch rather than just per switch

2013-04-04 Thread Weiny, Ira
 -Original Message-
 From: Hal Rosenstock [mailto:h...@dev.mellanox.co.il]
 
 On 4/4/2013 2:45 PM, Weiny, Ira wrote:
  This changes the data sent by OSM_EVENT_ID_LFT_CHANGE.
 
 That event was just added and was experimental rather than being some
 preexisting event already in some release.

Oh, sorry, I did not know it was not released yet...  my bad.

 
  I think this is a dangerous precedent.
 
 It's not setting this precedent.

Sorry,
Ira

 
 -- Hal
 
  Even though plugins must be coded to specific versions of OpenSM I don't
 think they will fail to compile with this change and users would get random
 behavior when they try to use the event_data passed.
 
  I think it might be more appropriate to define a new event id say
 OSM_EVENT_ID_LFT_BLOCK_CHANGE.
 
  Ira
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC] IPoIB: Leave space in skb linear buffer for IP headers

2013-04-04 Thread Markus Stockhausen

From: Roland Dreier rol...@purestorage.com

Markus Stockhausen markus.stockhau...@gmx.de noticed that IPoIB was
spending significant time doing memcpy() in __pskb_pull_tail().  He
found that this is because his adapter reports a maximum MTU of 4K,
which causes IPoIB datagram mode to receive all the actual data in a
separate page in the fragment list.

We're already allocating extra tailroom for the skb linear part, so we
might as well use it.

Cc: Eric Dumazet eduma...@google.com
Reported-by: Markus Stockhausen markus.stockhau...@gmx.de
Signed-off-by: Roland Dreier rol...@purestorage.com
---
 drivers/infiniband/ulp/ipoib/ipoib.h|  3 ++-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 12 +++-
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
b/drivers/infiniband/ulp/ipoib/ipoib.h
index eb71aaa..ab2cc4c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -64,7 +64,8 @@ enum ipoib_flush_level {
 enum {
 IPOIB_ENCAP_LEN  = 4,
 
-IPOIB_UD_HEAD_SIZE  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+/* add 128 bytes of tailroom for IP/TCP headers */
+IPOIB_UD_HEAD_SIZE  = IB_GRH_BYTES + IPOIB_ENCAP_LEN + 128,
...

Thanks for the help but I guess the patch is not yet perfect.

My (remote) test machine stopped responding after loading the new
ipoib module. Tomorrow I can check the console. Having a look at the
source code I guess we now have some major problems when receiving
small packets:

...
ipoib_ud_skb_put_frags(..., unsigned int length)
  ...
  size = length - IPOIB_UD_HEAD_SIZE; /* may be less than zero! */
  skb_frag_size_set(frag, size);
  ...

Markus


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] Add IBV_*_USNIC enums for the Cisco Ethernet Virtual NIC.

2013-04-04 Thread Or Gerlitz
Jeff Squyres (jsquyres) jsquy...@cisco.com wrote:

 Sure.  For a little background, the 2nd-generation Cisco VIC has been 
 available
 since last year (IIRC): http://www.cisco.com/en/US/products/ps10277
 /prod_module_series_home.html.  It's a converged 10G Ethernet adapter 
 available  in a variety of form factors (e.g., 2x10G on PCIe and Mezz).

 After some off-list discussion with Roland, we chose to create new IBV_*_USNIC
 enums because none of the current enums were accurate for our device.  It's an
 Ethernet NIC, but it's not an RNIC.  It's an Ethernet-based transport, but 
 it's not
 iWARP.


 The reason we're asking for these IBV_*_USNIC enums now -- before we've 
 submitted the driver -- is because we're targeting getting our driver 
 included in RHEL 6.5.  There's a bit of a chicken-and-egg issue here: they'll 
 accept our patches for a new hardware driver while that driver is being 
 worked upstream.  But they (rightfully) won't accept patches to IB core and 
 libibverbs until they've been vetted by the community.  Hence, even though 
 our driver is slowly working its way through QA and not available yet, we 
 wanted to submit these new enums upstream for community approval so that they 
 can be included in RHEL 6.5.

 Does that help?

Yes, it does, but I still think we need to see the driver code in order
to conduct a proper/better review and maybe even accept the proposed
changes to the IB core. You can submit it as an RFC, which means "you can
look at it and give me comments, but don't pick it up yet".

Or.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Fail post send command on error recovery

2013-04-04 Thread Or Gerlitz
On Thu, Apr 4, 2013 at 4:01 PM, Kleber Sacilotto de Souza

 The problem that I'm trying to solve is that some IB core modules are
 hanging waiting on completion queues on their remove path during error 
 recovery.

So maybe patch them to give up after some time?

Or.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx4: Fail post send command on error recovery

2013-04-04 Thread Roland Dreier
On Thu, Apr 4, 2013 at 2:45 PM, Or Gerlitz or.gerl...@gmail.com wrote:
 The problem that I'm trying to solve is that some IB core modules are
 hanging waiting on completion queues on their remove path during error 
 recovery.

 So maybe patch them to give up after some time?

I don't know so much about this PCI error recovery stuff but it does
seem sensible to trigger a catastrophic error async event when it
happens (I'm assuming the recovery mechanism resets the adapter).

Then we should at least fix kernel ULPs to behave appropriately when they
get such an async event.  And similarly, if someone wants to harden
some subset of userspace apps to handle PCI error recovery too, that
would be another step forward.

 - R.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 9/9] iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

2013-04-04 Thread Nicholas A. Bellinger
On Thu, 2013-04-04 at 12:20 +0300, Or Gerlitz wrote:
 On 04/04/2013 10:24, Nicholas A. Bellinger wrote:
  +
  +void isert_cq_tx_callback(struct ib_cq *, void *);
  +void isert_cq_rx_callback(struct ib_cq *, void *);
  +void isert_free_rx_descriptors(struct isert_conn *);
 any reason not to have these as static functions (same for isert_cq_rx_work)

Nope.  Looking at a re-org now to avoid the use of function prototypes
here, and marking everything as static.

--nab

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 9/9] iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

2013-04-04 Thread Nicholas A. Bellinger
On Thu, 2013-04-04 at 12:45 +0300, Or Gerlitz wrote:
 On 04/04/2013 10:24, Nicholas A. Bellinger wrote:
 
  +#define ISER_RECV_DATA_SEG_LEN  8192
  +#define ISER_RX_PAYLOAD_SIZE(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
  [...]
  +#define ISER_RX_PAD_SIZE   (16384 - (ISER_RX_PAYLOAD_SIZE + \
  +   sizeof(u64) + sizeof(struct ib_sge)))
 
 We're eating too much RAM here for the pad: you need 8K + something, so
 the pad can count down
 from 12K and not 16K, which means each such element will consume three
 pages and not four.
 

Hmm, IIRC this larger pad was originally required after bumping the
ISER_RECV_DATA_SEG_LEN value for handling incoming ImmediateData and
Unsolicited Data-OUT..  Will try using 12k here and see what happens..

Also, ISER_RECV_DATA_SEG_LEN will need to be enforced as the largest
MaxRecvDataSegmentLength negotiated during iser login to prevent the
initiator from exceeding the hardcoded value..

  +struct iser_rx_desc {
  +   struct iser_hdr iser_header;
  +   struct iscsi_hdr iscsi_header;
  +   chardata[ISER_RECV_DATA_SEG_LEN];
  +   u64 dma_addr;
  +   struct ib_sge   rx_sg;
  +   charpad[ISER_RX_PAD_SIZE];
  +} __packed;
  +
  +struct isert_rx_desc {
  +   struct isert_conn   *desc_conn;
  +   struct work_struct  desc_work;
  +   struct iser_rx_desc desc;
  +} __packed;
 
 You have way enough room in the pad field of struct iser_rx_desc to 
 place there the two fields
 added by struct isert_rx_desc (and you only use struct iser_rx_desc from 
 within isert_rx_desc) -- any reason
 not to unify  them?
 

This is left-over cruft from the per isert_rx_desc dispatch into process
context..  Dropping this now.

--nab

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC-v3 9/9] iser-target: Add iSCSI Extensions for RDMA (iSER) target driver

2013-04-04 Thread Nicholas A. Bellinger
On Thu, 2013-04-04 at 12:51 +0300, Or Gerlitz wrote:
 On 04/04/2013 10:24, Nicholas A. Bellinger wrote:
  +static int
  +isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
  +{
  +   struct isert_cmd *isert_cmd = container_of(cmd,
  +   struct isert_cmd, iscsi_cmd);
  +   struct isert_conn *isert_conn = (struct isert_conn *)conn-context;
  +   struct ib_send_wr *send_wr = isert_cmd-tx_desc.send_wr;
  +   struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *)
  +   isert_cmd-tx_desc.iscsi_header;
  +
  +   isert_create_send_desc(isert_conn, isert_cmd, isert_cmd-tx_desc);
  +   iscsit_build_rsp_pdu(cmd, conn, true, hdr);
  +   isert_init_tx_hdrs(isert_conn, isert_cmd-tx_desc);
  +   /*
  +* Attach SENSE DATA payload to iSCSI Response PDU
  +*/
  +   if (cmd-se_cmd.sense_buffer 
  +   ((cmd-se_cmd.se_cmd_flags  SCF_TRANSPORT_TASK_SENSE) ||
  +   (cmd-se_cmd.se_cmd_flags  SCF_EMULATED_TASK_SENSE))) {
  +   struct ib_device *ib_dev = isert_conn-conn_cm_id-device;
  +   struct ib_sge *tx_dsg = isert_cmd-tx_desc.tx_sg[1];
  +   u32 padding, sense_len;
  +
  +   put_unaligned_be16(cmd-se_cmd.scsi_sense_length,
  +  cmd-sense_buffer);
  +   cmd-se_cmd.scsi_sense_length += sizeof(__be16);
  +
  +   padding = -(cmd-se_cmd.scsi_sense_length)  3;
  +   hton24(hdr-dlength, (u32)cmd-se_cmd.scsi_sense_length);
  +   sense_len = cmd-se_cmd.scsi_sense_length + padding;
  +
  +   isert_cmd-sense_buf_dma = ib_dma_map_single(ib_dev,
  +   (void *)cmd-sense_buffer, sense_len,
  +   DMA_TO_DEVICE);
  +
  +   isert_cmd-sense_buf_len = sense_len;
  +   ib_dma_sync_single_for_cpu(ib_dev, isert_cmd-sense_buf_dma,
  +  sense_len, DMA_TO_DEVICE);
  +   ib_dma_sync_single_for_device(ib_dev, isert_cmd-sense_buf_dma,
  + sense_len, DMA_TO_DEVICE);
  +
 
 You just called dma_map_single and are not going to touch the buffer before
 posting it to the wire,
 so there's no point in syncing it for the cpu and for the device; remove these
 calls.
 

Dropped.

Thanks Or!

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Using XRC between kernel and user space.

2013-04-04 Thread Shlomo Pongratz

On 4/4/2013 8:44 PM, Shlomo Pongratz wrote:

Hi,

I wonder how a kernel program and a user space one can communicate 
using XRC.
An example of such an application is an iSER kernel initiator connected
to a tgtd target that runs in user space.
The XRC domain in the kernel is created with struct ib_xrcd 
*ib_alloc_xrcd(struct ib_device *device) while in the user space it 
is created with static inline struct ibv_xrcd * ibv_open_xrcd(struct 
ibv_context *context, struct ibv_xrcd_init_attr *xrcd_init_attr).
When looking in xrc_pingpong.c example we can see that the attribute 
has some file descriptor of some file (/tmp/xrc_domain).
So how can a kernel program on one machine communicate with a user
space program on another one?
The XRC annex (page 9) states that all QPs should belong to the same XRC
domain.


S.P.



Sorry, I misinterpreted the annex.
I thank Roland for clarifying this to me.

S.P.


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html