[PATCH 4/9] srpt: chain RDMA READ/WRITE requests

2015-11-13 Thread Christoph Hellwig
Remove struct rdma_iu and instead allocate the struct ib_rdma_wr array
early and fill out directly.  This allows us to chain the WRs, and thus
archive both less lock contention on the HCA workqueue as well as much
simpler error handling.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 100 +-
 drivers/infiniband/ulp/srpt/ib_srpt.h |  14 +
 2 files changed, 39 insertions(+), 75 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c 
b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 14b361a..2b6dd71 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -1057,7 +1057,7 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch 
*ch,
BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius);
 
while (ioctx->n_rdma)
-   kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge);
+   kfree(ioctx->rdma_ius[--ioctx->n_rdma].wr.sg_list);
 
kfree(ioctx->rdma_ius);
ioctx->rdma_ius = NULL;
@@ -1084,7 +1084,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
struct scatterlist *sg, *sg_orig;
int sg_cnt;
enum dma_data_direction dir;
-   struct rdma_iu *riu;
+   struct ib_rdma_wr *riu;
struct srp_direct_buf *db;
dma_addr_t dma_addr;
struct ib_sge *sge;
@@ -1117,7 +1117,8 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
+ ioctx->n_rbuf;
 
-   ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL);
+   ioctx->rdma_ius = kcalloc(nrdma, sizeof(*ioctx->rdma_ius),
+   GFP_KERNEL);
if (!ioctx->rdma_ius)
goto free_mem;
 
@@ -1141,9 +1142,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
 j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
rsize = be32_to_cpu(db->len);
raddr = be64_to_cpu(db->va);
-   riu->raddr = raddr;
+   riu->remote_addr = raddr;
riu->rkey = be32_to_cpu(db->key);
-   riu->sge_cnt = 0;
+   riu->wr.num_sge = 0;
 
/* calculate how many sge required for this remote_buf */
while (rsize > 0 && tsize > 0) {
@@ -1167,27 +1168,29 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch 
*ch,
rsize = 0;
}
 
-   ++riu->sge_cnt;
+   ++riu->wr.num_sge;
 
-   if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) {
+   if (rsize > 0 &&
+   riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) {
++ioctx->n_rdma;
-   riu->sge =
-   kmalloc(riu->sge_cnt * sizeof *riu->sge,
-   GFP_KERNEL);
-   if (!riu->sge)
+   riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+   sizeof(*riu->wr.sg_list),
+   GFP_KERNEL);
+   if (!riu->wr.sg_list)
goto free_mem;
 
++riu;
-   riu->sge_cnt = 0;
-   riu->raddr = raddr;
+   riu->wr.num_sge = 0;
+   riu->remote_addr = raddr;
riu->rkey = be32_to_cpu(db->key);
}
}
 
++ioctx->n_rdma;
-   riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge,
-  GFP_KERNEL);
-   if (!riu->sge)
+   riu->wr.sg_list = kmalloc_array(riu->wr.num_sge,
+   sizeof(*riu->wr.sg_list),
+   GFP_KERNEL);
+   if (!riu->wr.sg_list)
goto free_mem;
}
 
@@ -1202,7 +1205,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
for (i = 0, j = 0;
 j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
rsize = be32_to_cpu(db->len);
-   sge = riu->sge;
+   sge = riu->wr.sg_list;
k = 0;
 
while (rsize > 0 && tsize > 0) {
@@ -1234,9 +1237,9 @@ static int srpt_map_sg_to_

[PATCH 5/9] srpt: use the new CQ API

2015-11-13 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/ulp/srpt/ib_srpt.c | 327 +-
 drivers/infiniband/ulp/srpt/ib_srpt.h |  28 +--
 2 files changed, 88 insertions(+), 267 deletions(-)

diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c 
b/drivers/infiniband/ulp/srpt/ib_srpt.c
index 2b6dd71..d4bbad3 100644
--- a/drivers/infiniband/ulp/srpt/ib_srpt.c
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -95,6 +95,8 @@ MODULE_PARM_DESC(srpt_service_guid,
 static struct ib_client srpt_client;
 static void srpt_release_channel(struct srpt_rdma_ch *ch);
 static int srpt_queue_status(struct se_cmd *cmd);
+static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc);
 
 /**
  * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE.
@@ -780,12 +782,12 @@ static int srpt_post_recv(struct srpt_device *sdev,
struct ib_recv_wr wr, *bad_wr;
 
BUG_ON(!sdev);
-   wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index);
-
list.addr = ioctx->ioctx.dma;
list.length = srp_max_req_size;
list.lkey = sdev->pd->local_dma_lkey;
 
+   ioctx->ioctx.cqe.done = srpt_recv_done;
+   wr.wr_cqe = >ioctx.cqe;
wr.next = NULL;
wr.sg_list = 
wr.num_sge = 1;
@@ -821,8 +823,9 @@ static int srpt_post_send(struct srpt_rdma_ch *ch,
list.length = len;
list.lkey = sdev->pd->local_dma_lkey;
 
+   ioctx->ioctx.cqe.done = srpt_send_done;
wr.next = NULL;
-   wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
+   wr.wr_cqe = >ioctx.cqe;
wr.sg_list = 
wr.num_sge = 1;
wr.opcode = IB_WR_SEND;
@@ -1385,116 +1388,44 @@ out:
 }
 
 /**
- * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion.
- */
-static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id)
-{
-   struct srpt_send_ioctx *ioctx;
-   enum srpt_command_state state;
-   u32 index;
-
-   atomic_inc(>sq_wr_avail);
-
-   index = idx_from_wr_id(wr_id);
-   ioctx = ch->ioctx_ring[index];
-   state = srpt_get_cmd_state(ioctx);
-
-   WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
-   && state != SRPT_STATE_MGMT_RSP_SENT
-   && state != SRPT_STATE_NEED_DATA
-   && state != SRPT_STATE_DONE);
-
-   /* If SRP_RSP sending failed, undo the ch->req_lim change. */
-   if (state == SRPT_STATE_CMD_RSP_SENT
-   || state == SRPT_STATE_MGMT_RSP_SENT)
-   atomic_dec(>req_lim);
-
-   srpt_abort_cmd(ioctx);
-}
-
-/**
- * srpt_handle_send_comp() - Process an IB send completion notification.
- */
-static void srpt_handle_send_comp(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx)
-{
-   enum srpt_command_state state;
-
-   atomic_inc(>sq_wr_avail);
-
-   state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
-
-   if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
-   && state != SRPT_STATE_MGMT_RSP_SENT
-   && state != SRPT_STATE_DONE))
-   pr_debug("state = %d\n", state);
-
-   if (state != SRPT_STATE_DONE) {
-   srpt_unmap_sg_to_ib_sge(ch, ioctx);
-   transport_generic_free_cmd(>cmd, 0);
-   } else {
-   pr_err("IB completion has been received too late for"
-  " wr_id = %u.\n", ioctx->ioctx.index);
-   }
-}
-
-/**
- * srpt_handle_rdma_comp() - Process an IB RDMA completion notification.
- *
  * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping
  * the data that has been transferred via IB RDMA had to be postponed until the
  * check_stop_free() callback.  None of this is necessary anymore and needs to
  * be cleaned up.
  */
-static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
- struct srpt_send_ioctx *ioctx,
- enum srpt_opcode opcode)
+static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
 {
+   struct srpt_rdma_ch *ch = cq->cq_context;
+   struct srpt_send_ioctx *ioctx =
+   container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe);
+
WARN_ON(ioctx->n_rdma <= 0);
atomic_add(ioctx->n_rdma, >sq_wr_avail);
 
-   if (opcode == SRPT_RDMA_READ_LAST) {
-   if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
-   SRPT_STATE_DATA_IN))
-   target_execute_cmd(>cmd);
-   else
-   pr_err("%s[%d]: wrong state = %d\n", __func__,
-  __LINE__, srpt_get_cmd_state(ioctx));
-   } else {
-   WARN(true, "unexpected opcode %d\n", opcode);

[PATCH 7/9] IB/iser: Use a dedicated descriptor for login

2015-11-13 Thread Christoph Hellwig
From: Sagi Grimberg <sa...@mellanox.com>

Makes better sense and we'll need it later with CQ
abstraction.
iser switch login bufs to void

Signed-off-by: Sagi Grimberg <sa...@mellanox.com>
Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/ulp/iser/iscsi_iser.h |  30 +--
 drivers/infiniband/ulp/iser/iser_initiator.c | 128 +--
 drivers/infiniband/ulp/iser/iser_verbs.c |  14 +--
 3 files changed, 89 insertions(+), 83 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h 
b/drivers/infiniband/ulp/iser/iscsi_iser.h
index 502063b..5648409 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -326,6 +326,25 @@ struct iser_rx_desc {
char pad[ISER_RX_PAD_SIZE];
 } __attribute__((packed));
 
+
+/**
+ * struct iser_login_desc - iSER login descriptor
+ *
+ * @req:   pointer to login request buffer
+ * @resp:  pointer to login response buffer
+ * @req_dma:   DMA address of login request buffer
+ * @rsp_dma:  DMA address of login response buffer
+ * @sge:   IB sge for login post recv
+ */
+struct iser_login_desc {
+   void *req;
+   void *rsp;
+   u64  req_dma;
+   u64  rsp_dma;
+   struct ib_sgesge;
+} __attribute__((packed));
+
+
 struct iser_conn;
 struct ib_conn;
 struct iscsi_iser_task;
@@ -512,11 +531,7 @@ struct ib_conn {
  * @up_completion:connection establishment completed
  *(state is ISER_CONN_UP)
  * @conn_list:entry in ig conn list
- * @login_buf:login data buffer (stores login parameters)
- * @login_req_buf:login request buffer
- * @login_req_dma:login request buffer dma address
- * @login_resp_buf:   login response buffer
- * @login_resp_dma:   login response buffer dma address
+ * @login_desc:   login descriptor
  * @rx_desc_head: head of rx_descs cyclic buffer
  * @rx_descs: rx buffers array (cyclic buffer)
  * @num_rx_descs: number of rx descriptors
@@ -539,10 +554,7 @@ struct iser_conn {
struct completionib_completion;
struct completionup_completion;
struct list_head conn_list;
-
-   char *login_buf;
-   char *login_req_buf, *login_resp_buf;
-   u64  login_req_dma, login_resp_dma;
+   struct iser_login_desc   login_desc;
unsigned int rx_desc_head;
struct iser_rx_desc  *rx_descs;
u32  num_rx_descs;
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c 
b/drivers/infiniband/ulp/iser/iser_initiator.c
index ffd00c4..21f28c8 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -174,73 +174,63 @@ static void iser_create_send_desc(struct iser_conn
*iser_conn,
 static void iser_free_login_buf(struct iser_conn *iser_conn)
 {
struct iser_device *device = iser_conn->ib_conn.device;
+   struct iser_login_desc *desc = _conn->login_desc;
 
-   if (!iser_conn->login_buf)
+   if (!desc->req)
return;
 
-   if (iser_conn->login_req_dma)
-   ib_dma_unmap_single(device->ib_device,
-   iser_conn->login_req_dma,
-   ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+   ib_dma_unmap_single(device->ib_device, desc->req_dma,
+   ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
 
-   if (iser_conn->login_resp_dma)
-   ib_dma_unmap_single(device->ib_device,
-   iser_conn->login_resp_dma,
-   ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+   ib_dma_unmap_single(device->ib_device, desc->rsp_dma,
+   ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
 
-   kfree(iser_conn->login_buf);
+   kfree(desc->req);
+   kfree(desc->rsp);
 
/* make sure we never redo any unmapping */
-   iser_conn->login_req_dma = 0;
-   iser_conn->login_resp_dma = 0;
-   iser_conn->login_buf = NULL;
+   desc->req = NULL;
+   desc->rsp = NULL;
 }
 
 static int iser_alloc_login_buf(struct iser_conn *iser_conn)
 {
struct iser_device *device = iser_conn->ib_conn.device;
-   int req_err, resp_err;
-
-   BUG_ON(device == NULL);
-
-   iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
-ISER_RX_LOGIN_SIZE, GFP_KERNEL);
-   if (!iser_conn->login_buf)
-   goto out_err;
-
-   iser_conn->

[PATCH 6/9] srp: use the new CQ API

2015-11-13 Thread Christoph Hellwig
This also moves recv completion handling from hardirq context into
softirq context.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/ulp/srp/ib_srp.c | 198 ++--
 drivers/infiniband/ulp/srp/ib_srp.h |   7 +-
 2 files changed, 76 insertions(+), 129 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c 
b/drivers/infiniband/ulp/srp/ib_srp.c
index 3027824..57237e1 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -132,8 +132,9 @@ MODULE_PARM_DESC(ch_count,
 
 static void srp_add_one(struct ib_device *device);
 static void srp_remove_one(struct ib_device *device, void *client_data);
-static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
-static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
+static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc,
+   const char *opname);
 static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
 
 static struct scsi_transport_template *ib_srp_transport_template;
@@ -445,41 +446,6 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct 
srp_target_port *target)
  dev->max_pages_per_mr);
 }
 
-/**
- * srp_destroy_qp() - destroy an RDMA queue pair
- * @ch: SRP RDMA channel.
- *
- * Change a queue pair into the error state and wait until all receive
- * completions have been processed before destroying it. This avoids that
- * the receive completion handler can access the queue pair while it is
- * being destroyed.
- */
-static void srp_destroy_qp(struct srp_rdma_ch *ch)
-{
-   static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
-   static struct ib_recv_wr wr = { 0 };
-   struct ib_recv_wr *bad_wr;
-   int ret;
-
-   wr.wr_id = SRP_LAST_WR_ID;
-   /* Destroying a QP and reusing ch->done is only safe if not connected */
-   WARN_ON_ONCE(ch->connected);
-
-   ret = ib_modify_qp(ch->qp, , IB_QP_STATE);
-   WARN_ONCE(ret, "ib_cm_init_qp_attr() returned %d\n", ret);
-   if (ret)
-   goto out;
-
-   init_completion(>done);
-   ret = ib_post_recv(ch->qp, , _wr);
-   WARN_ONCE(ret, "ib_post_recv() returned %d\n", ret);
-   if (ret == 0)
-   wait_for_completion(>done);
-
-out:
-   ib_destroy_qp(ch->qp);
-}
-
 static int srp_create_ch_ib(struct srp_rdma_ch *ch)
 {
struct srp_target_port *target = ch->target;
@@ -490,34 +456,27 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
struct ib_fmr_pool *fmr_pool = NULL;
struct srp_fr_pool *fr_pool = NULL;
const int m = 1 + dev->use_fast_reg;
-   struct ib_cq_init_attr cq_attr = {};
int ret;
 
init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
if (!init_attr)
return -ENOMEM;
 
-   /* + 1 for SRP_LAST_WR_ID */
-   cq_attr.cqe = target->queue_size + 1;
-   cq_attr.comp_vector = ch->comp_vector;
-   recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch,
-  _attr);
+   /* queue_size + 1 for ib_drain_qp */
+   recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, 
ch->comp_vector,
+   IB_POLL_SOFTIRQ);
if (IS_ERR(recv_cq)) {
ret = PTR_ERR(recv_cq);
goto err;
}
 
-   cq_attr.cqe = m * target->queue_size;
-   cq_attr.comp_vector = ch->comp_vector;
-   send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch,
-  _attr);
+   send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, 
ch->comp_vector,
+   IB_POLL_DIRECT);
if (IS_ERR(send_cq)) {
ret = PTR_ERR(send_cq);
goto err_recv_cq;
}
 
-   ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
-
init_attr->event_handler   = srp_qp_event;
init_attr->cap.max_send_wr = m * target->queue_size;
init_attr->cap.max_recv_wr = target->queue_size + 1;
@@ -557,11 +516,11 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch)
}
 
if (ch->qp)
-   srp_destroy_qp(ch);
+   ib_destroy_qp(ch->qp);
if (ch->recv_cq)
-   ib_destroy_cq(ch->recv_cq);
+   ib_free_cq(ch->recv_cq);
if (ch->send_cq)
-   ib_destroy_cq(ch->send_cq);
+   ib_free_cq(ch->send_cq);
 
ch->qp = qp;
ch->recv_cq = recv_cq;
@@ -584,10 +543,10 @@ err_qp:
ib_destroy_qp(qp);
 
 err_send_cq:
-   ib_destroy_cq(send_cq);
+   ib_free_cq(send_cq);
 
 err_recv_cq:
-   ib_destroy_cq(recv_cq);
+   ib_free_cq(recv_cq);
 
 err:
 

[PATCH 9/9] IB/iser: Convert to CQ abstraction

2015-11-13 Thread Christoph Hellwig
From: Sagi Grimberg <sa...@mellanox.com>

Signed-off-by: Sagi Grimberg <sa...@mellanox.com>
Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/ulp/iser/iscsi_iser.h |  68 ---
 drivers/infiniband/ulp/iser/iser_initiator.c | 142 ++-
 drivers/infiniband/ulp/iser/iser_memory.c|  21 ++-
 drivers/infiniband/ulp/iser/iser_verbs.c | 258 ++-
 4 files changed, 209 insertions(+), 280 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h 
b/drivers/infiniband/ulp/iser/iscsi_iser.h
index cf4c4ce..1799c87 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -151,16 +151,12 @@
 - ISER_MAX_RX_MISC_PDUS) / \
 (1 + ISER_INFLIGHT_DATAOUTS))
 
-#define ISER_WC_BATCH_COUNT   16
 #define ISER_SIGNAL_CMD_COUNT 32
 
 #define ISER_VER   0x10
 #define ISER_WSV   0x08
 #define ISER_RSV   0x04
 
-#define ISER_FASTREG_LI_WRID   0xULL
-#define ISER_BEACON_WRID   0xfffeULL
-
 /**
  * struct iser_hdr - iSER header
  *
@@ -269,7 +265,7 @@ enum iser_desc_type {
 #define ISER_MAX_WRS 7
 
 /**
- * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ * struct iser_tx_desc - iSER TX descriptor
  *
  * @iser_header:   iser header
  * @iscsi_header:  iscsi header
@@ -293,6 +289,7 @@ struct iser_tx_desc {
u64  dma_addr;
struct ib_sgetx_sg[2];
int  num_sge;
+   struct ib_cqecqe;
bool mapped;
u8   wr_idx;
union iser_wr {
@@ -306,9 +303,10 @@ struct iser_tx_desc {
 };
 
 #define ISER_RX_PAD_SIZE   (256 - (ISER_RX_PAYLOAD_SIZE + \
-   sizeof(u64) + sizeof(struct ib_sge)))
+sizeof(u64) + sizeof(struct ib_sge) + \
+sizeof(struct ib_cqe)))
 /**
- * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ * struct iser_rx_desc - iSER RX descriptor
  *
  * @iser_header:   iser header
  * @iscsi_header:  iscsi header
@@ -323,6 +321,7 @@ struct iser_rx_desc {
char data[ISER_RECV_DATA_SEG_LEN];
u64  dma_addr;
struct ib_sgerx_sg;
+   struct ib_cqecqe;
char pad[ISER_RX_PAD_SIZE];
 } __attribute__((packed));
 
@@ -335,6 +334,7 @@ struct iser_rx_desc {
  * @req_dma:   DMA address of login request buffer
  * @rsp_dma:  DMA address of login response buffer
  * @sge:   IB sge for login post recv
+ * @cqe:   completion handler
  */
 struct iser_login_desc {
void *req;
@@ -342,6 +342,7 @@ struct iser_login_desc {
u64  req_dma;
u64  rsp_dma;
struct ib_sgesge;
+   struct ib_cqecqe;
 } __attribute__((packed));
 
 
@@ -352,18 +353,12 @@ struct iscsi_iser_task;
 /**
  * struct iser_comp - iSER completion context
  *
- * @device: pointer to device handle
  * @cq: completion queue
- * @wcs:work completion array
- * @tasklet:Tasklet handle
  * @active_qps: Number of active QPs attached
  *  to completion context
  */
 struct iser_comp {
-   struct iser_device  *device;
struct ib_cq*cq;
-   struct ib_wc wcs[ISER_WC_BATCH_COUNT];
-   struct tasklet_structtasklet;
int  active_qps;
 };
 
@@ -492,10 +487,11 @@ struct iser_fr_pool {
  * @rx_wr:   receive work request for batch posts
  * @device:  reference to iser device
  * @comp:iser completion context
- * @pi_support:  Indicate device T10-PI support
- * @beacon:  beacon send wr to signal all flush errors were drained
- * @flush_comp:  completes when all connection completions consumed
  * @fr_pool: connection fast registration poool
+ * @pi_support:  Indicate device T10-PI support
+ * @last:last send wr to signal all flush errors were drained
+ * @last_cqe:cqe handler for last wr
+ * @last_comp:   completes when all connection completions consumed
  */
 struct ib_conn {
struct rdma_cm_id   *cma_id;
@@ -505,10 +501,12 @@ struct ib_conn {
struct ib_recv_wrrx_wr[ISER_MIN_POSTED_RX];
struct iser_device  *device;
struct iser_comp*comp;
-   bool pi_support;
-   struct ib_send_wrbeacon;
-   struct completionflush_comp;
s

[PATCH 2/9] IB: add a proper completion queue abstraction

2015-11-13 Thread Christoph Hellwig
This adds an abstraction that allows ULP to simply pass a completion
object and completion callback with each submitted WR and let the RDMA
core handle the nitty gritty details of how to handle completion
interrupts and poll the CQ.

In detail there is a new ib_cqe structure which just contains the
completion callback, and which can be used to get at the containing
object using container_of.  It is pointed to by the WR and WC as an
alternative to the wr_id field, similar to how many ULPs already use
the field to store a pointer using casts.

A driver using the new completion callbacks allocates it's CQs using
the new ib_create_cq API, which in addition to the number of CQEs and
the completion vectors also takes a mode on how we poll for CQEs.
Three modes are available: direct for drivers that never take CQ
interrupts and just poll for them, softirq to poll from softirq context
using the to be renamed blk-iopoll infrastructure which takes care of
rearming and budgeting, or a workqueue for consumer who want to be
called from user context.

Thanks a lot to Sagi Grimberg who helped reviewing the API, wrote
the current version of the workqueue code because my two previous
attempts sucked too much and converted the iSER initiator to the new
API.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/Kconfig  |   1 +
 drivers/infiniband/core/Makefile|   2 +-
 drivers/infiniband/core/cq.c| 208 
 drivers/infiniband/core/device.c|  15 ++-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c |   2 +-
 drivers/infiniband/ulp/srp/ib_srp.c |   6 +-
 include/rdma/ib_verbs.h |  38 +-
 7 files changed, 263 insertions(+), 9 deletions(-)
 create mode 100644 drivers/infiniband/core/cq.c

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index aa26f3c..282ec0b 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
depends on NET
depends on INET
depends on m || IPV6 != m
+   select IRQ_POLL
---help---
  Core support for InfiniBand (IB).  Make sure to also select
  any protocols you wish to use as well as drivers for your
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index d43a899..ae48d8740 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) +=ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) +=ib_uverbs.o ib_ucm.o \
$(user_access-y)
 
-ib_core-y :=   packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y :=   packer.o ud_header.o verbs.o cq.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o \
roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
index 000..d9eb796
--- /dev/null
+++ b/drivers/infiniband/core/cq.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2015 HGST, a Western Digital Company.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+#include 
+#include 
+#include 
+#include 
+
+/* # of WCs to poll for with a single call to ib_poll_cq */
+#define IB_POLL_BATCH  16
+
+/* # of WCs to iterate over before yielding */
+#define IB_POLL_BUDGET_IRQ 256
+#define IB_POLL_BUDGET_WORKQUEUE   65536
+
+#define IB_POLL_FLAGS \
+   (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
+
+static int __ib_process_cq(struct ib_cq *cq, int budget)
+{
+   int i, n, completed = 0;
+
+   while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
+   for (i = 0; i < n; i++) {
+   struct ib_wc *wc = >wc[i];
+
+   if (wc->wr_cqe)
+   wc->wr_cqe->done(cq, wc);
+   else
+   WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
+   }
+
+   completed += n;
+   if (completed >= budget)
+   break;
+   }
+
+   return completed;
+}
+
+/**
+ * ib_process_direct_cq - process a CQ in caller context
+ * @cq:CQ to process
+ *
+ * This function is used to process all outstanding CQ entries on a
+ * %IB_POLL_DIRECT CQ.  It does not offload CQ processing to a different
+ * context and does not ask from c

[PATCH 1/9] move blk_iopoll to limit and make it generally available

2015-11-13 Thread Christoph Hellwig
The new name is irq_poll as iopoll is already taken.  Better suggestions
welcome.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 Documentation/kernel-per-CPU-kthreads.txt |   2 +-
 block/Makefile|   2 +-
 block/blk-iopoll.c| 224 --
 drivers/scsi/Kconfig  |   1 +
 drivers/scsi/be2iscsi/Kconfig |   1 +
 drivers/scsi/be2iscsi/be.h|   4 +-
 drivers/scsi/be2iscsi/be_iscsi.c  |   4 +-
 drivers/scsi/be2iscsi/be_main.c   |  24 ++--
 drivers/scsi/ipr.c|  28 ++--
 drivers/scsi/ipr.h|   4 +-
 include/linux/blk-iopoll.h|  46 --
 include/linux/interrupt.h |   2 +-
 include/linux/irq_poll.h  |  46 ++
 include/trace/events/irq.h|   2 +-
 lib/Kconfig   |   5 +
 lib/Makefile  |   1 +
 lib/irq_poll.c| 221 +
 tools/lib/traceevent/event-parse.c|   2 +-
 tools/perf/util/trace-event-parse.c   |   2 +-
 19 files changed, 313 insertions(+), 308 deletions(-)
 delete mode 100644 block/blk-iopoll.c
 delete mode 100644 include/linux/blk-iopoll.h
 create mode 100644 include/linux/irq_poll.h
 create mode 100644 lib/irq_poll.c

diff --git a/Documentation/kernel-per-CPU-kthreads.txt 
b/Documentation/kernel-per-CPU-kthreads.txt
index f4cbfe0..edec3a3 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -90,7 +90,7 @@ BLOCK_SOFTIRQ:  Do all of the following:
from being initiated from tasks that might run on the CPU to
be de-jittered.  (It is OK to force this CPU offline and then
bring it back online before you start your application.)
-BLOCK_IOPOLL_SOFTIRQ:  Do all of the following:
+IRQ_POLL_SOFTIRQ:  Do all of the following:
 1. Force block-device interrupts onto some other CPU.
 2. Initiate any block I/O and block-I/O polling on other CPUs.
 3. Once your application has started, prevent CPU-hotplug operations
diff --git a/block/Makefile b/block/Makefile
index 00ecc97..e850474 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-   blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
+   blk-lib.o blk-mq.o blk-mq-tag.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
partitions/
diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c
deleted file mode 100644
index 0736729..000
--- a/block/blk-iopoll.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Functions related to interrupt-poll handling in the block layer. This
- * is similar to NAPI for network devices.
- */
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include "blk.h"
-
-static unsigned int blk_iopoll_budget __read_mostly = 256;
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
-
-/**
- * blk_iopoll_sched - Schedule a run of the iopoll handler
- * @iop:  The parent iopoll structure
- *
- * Description:
- * Add this blk_iopoll structure to the pending poll list and trigger the
- * raise of the blk iopoll softirq. The driver must already have gotten a
- * successful return from blk_iopoll_sched_prep() before calling this.
- **/
-void blk_iopoll_sched(struct blk_iopoll *iop)
-{
-   unsigned long flags;
-
-   local_irq_save(flags);
-   list_add_tail(>list, this_cpu_ptr(_cpu_iopoll));
-   __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ);
-   local_irq_restore(flags);
-}
-EXPORT_SYMBOL(blk_iopoll_sched);
-
-/**
- * __blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop:  The parent iopoll structure
- *
- * Description:
- * See blk_iopoll_complete(). This function must be called with interrupts
- * disabled.
- **/
-void __blk_iopoll_complete(struct blk_iopoll *iop)
-{
-   list_del(>list);
-   smp_mb__before_atomic();
-   clear_bit_unlock(IOPOLL_F_SCHED, >state);
-}
-EXPORT_SYMBOL(__blk_iopoll_complete);
-
-/**
- * blk_iopoll_complete - Mark this @iop as un-polled again
- * @iop:  The parent iopoll structure
- *
- * Description:
- * If a driver consumes less than the assigned budget in its run of the
- * iopoll handler, it'll end the polled mode by calling this function. The
- * iopoll handler will not be invoked again before blk_iopoll_sched_prep()
- * is called.
- **/
-void blk_iopoll_complete(struct blk_iopoll *iop)
-{
- 

add a proper completion queue abstraction

2015-11-13 Thread Christoph Hellwig
This series adds a new RDMA core abstraction that insulated the
ULPs from the nitty gritty details of CQ polling.  See the individual
patches for more details.

Note that this series should be applied on top of my
"IB: merge struct ib_device_attr into struct ib_device" patch.

A git tree is also available:

http://git.infradead.org/users/hch/rdma.git/shortlog/refs/heads/rdma-cq
git://git.infradead.org/users/hch/rdma.git rdma-cq

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/9] IB: add a helper to safely drain a QP

2015-11-13 Thread Christoph Hellwig
Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/core/cq.c | 46 
 include/rdma/ib_verbs.h  |  2 ++
 2 files changed, 48 insertions(+)

diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index d9eb796..bf2a079 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -206,3 +206,49 @@ void ib_free_cq(struct ib_cq *cq)
WARN_ON_ONCE(ret);
 }
 EXPORT_SYMBOL(ib_free_cq);
+
+struct ib_stop_cqe {
+   struct ib_cqe   cqe;
+   struct completion done;
+};
+
+static void ib_stop_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+   struct ib_stop_cqe *stop =
+   container_of(wc->wr_cqe, struct ib_stop_cqe, cqe);
+
+   complete(>done);
+}
+
+/*
+ * Change a queue pair into the error state and wait until all receive
+ * completions have been processed before destroying it. This avoids that
+ * the receive completion handler can access the queue pair while it is
+ * being destroyed.
+ */
+void ib_drain_qp(struct ib_qp *qp)
+{
+   struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+   struct ib_stop_cqe stop = { };
+   struct ib_recv_wr wr, *bad_wr;
+   int ret;
+
+   wr.wr_cqe = 
+   stop.cqe.done = ib_stop_done;
+   init_completion();
+
+   ret = ib_modify_qp(qp, , IB_QP_STATE);
+   if (ret) {
+   WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
+   return;
+   }
+
+   ret = ib_post_recv(qp, , _wr);
+   if (ret) {
+   WARN_ONCE(ret, "failed to drain QP: %d\n", ret);
+   return;
+   }
+
+   wait_for_completion();
+}
+EXPORT_SYMBOL(ib_drain_qp);
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e11e038..f59a8d3 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3075,4 +3075,6 @@ int ib_sg_to_pages(struct ib_mr *mr,
   int sg_nents,
   int (*set_page)(struct ib_mr *, u64));
 
+void ib_drain_qp(struct ib_qp *qp);
+
 #endif /* IB_VERBS_H */
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: srp state in current mainline

2015-11-12 Thread Christoph Hellwig
On Thu, Nov 12, 2015 at 05:12:14PM -0800, Bart Van Assche wrote:
> On 11/12/2015 09:59 AM, Christoph Hellwig wrote:
> >[  108.998574] WARNING: CPU: 0 PID: 1258 at kernel/sched/core.c:7389 
> >__might_sleep+0xa7/0xb0()
> >[  108.998580] do not call blocking ops when !TASK_RUNNING; state=1 set
> 
> Although this is most likely unrelated to the issue reported at the start of
> this thread, I have started working on a patch for this warning.

It's unrelated and fixed in the SCST version.  But don't bother, I have
a series that gets rid of the srpt completion thread entirely.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: srp state in current mainline

2015-11-12 Thread Christoph Hellwig
On Wed, Nov 11, 2015 at 01:07:44PM -0800, Bart Van Assche wrote:
> On 11/10/2015 09:15 AM, Christoph Hellwig wrote:
> >scsi host3: ib_srp: failed receive status WR flushed (5) for iu 
> >880313f4ca40
> 
> Can you also post the logs from the target system from around the time this
> message was logged on the initiator system ? Usually this message means that
> the target system closed a QP. I'm looking for messages generated by the
> following statement in ib_srpt.c:

None of those as far as I can tell:

[  108.896609] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.903032] ib_srpt Session : kernel thread ib_srpt_compl (PID 1258) started
[  108.910776] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.916901] ib_srpt Session : kernel thread ib_srpt_compl (PID 1259) started
[  108.924338] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.929974] ib_srpt Session : kernel thread ib_srpt_compl (PID 1260) started
[  108.937428] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.943112] ib_srpt Session : kernel thread ib_srpt_compl (PID 1261) started
[  108.950896] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.956066] ib_srpt Session : kernel thread ib_srpt_compl (PID 1262) started
[  108.964064] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.969751] ib_srpt Session : kernel thread ib_srpt_compl (PID 1263) started
[  108.977249] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.982943] ib_srpt Session : kernel thread ib_srpt_compl (PID 1264) started
[  108.990002] ib_srpt Received SRP_LOGIN_REQ with i_port_id 
0x0:0xf4521403007be042, t_port_id 0x2c903009f83a0:0x2c903009f83a0 and it_iu_len 
260 on port 2 (guid=0xfe80:0x2c903009f83a2)
[  108.995576] ib_srpt Session : kernel thread ib_srpt_compl (PID 1265) started
[  108.998564] [ cut here ]
[  108.998574] WARNING: CPU: 0 PID: 1258 at kernel/sched/core.c:7389 
__might_sleep+0xa7/0xb0()
[  108.998580] do not call blocking ops when !TASK_RUNNING; state=1 set
at [] prepare_to_wait_event+0x5d/0x100
[  108.998582] Modules linked in: ib_srpt target_core_pscsi target_core_file 
target_core_iblock iscsi_target_mod target_core_mod configfs nfsd auth_rpcgss 
oid_registry nfs_acl nfs lockd grace fscache sunrpc intel_powerclamp coretemp 
kvm crct10dif_pclmul crc32_pclmul ghash_clmulni_intel mgag200 ttm sha256_ssse3 
sha256_generic snd_pcm hmac drm_kms_helper drbg ansi_cprng drm snd_timer 
aesni_intel aes_x86_64 i2c_algo_bit lrw gf128mul snd soundcore glue_helper 
i2c_core ablk_helper cryptd psmouse ipmi_devintf iTCO_wdt dcdbas tpm_tis evdev 
iTCO_vendor_support serio_raw i7core_edac acpi_power_meter pcspkr ipmi_si tpm 
ib_ipoib shpchp ipmi_msghandler edac_core button acpi_cpufreq lpc_ich mfd_core 
ib_umad rdma_ucm rdma_cm iw_cm processor ib_uverbs ib_cm mlx4_ib ib_sa ib_mad 
ib_core ib_addr parport_pc ppdev lp
[  108.998652]  parport autofs4 ext4 crc16 mbcache jbd2 sd_mod sg sr_mod cdrom 
ata_generic ata_piix crc32c_intel libata mptsas ehci_pci uhci_hcd 
scsi_transport_sas mptscsih ehci_hcd mptbase usbcore mlx4_core usb_common 
scsi_mod nvme bnx2
[  108.998677] CPU: 0 PID: 1258 Comm: ib_srpt_compl Tainted: G
I 4.2.0 #34
[  108.998679] Hardware name: Dell Inc. PowerEdge R710/00NH4P, BIOS 3.0.0 
01/31/2011
[  108.998681]  81a01b6f 88061374ba28 81621a32 

[  108.998685]  88061374ba78 88061374ba68 8107d72a 
88061374bb18
[  108.998689]  81a11775 0b30  
00d0
[  108.998692] Call Trace:
[  108.998699]  [] dump_stack+0x4c/0x65
[  108.998704]  [] warn_slowpath_common+0x8a/0xc0
[  108.998707]  [] warn_slowpath_fmt+0x46/0x50
[  108.998710]  [] ? prepare_to_wait_event+0x5d/0x100
[  108.998713]  [] ? prepare_to_wait_event+0x5d/0x100
[  108.998716]  [] __might_sleep+0xa7/0xb0
[  108.998720]  [] __kmalloc+0x1f4/0x680
[  108.998737]  [] ? target_check_reservation+0x69/0x720 
[target_core_mod]
[  108.998747]  [] ? target_alua_state_chec

Re: [PATCH RFC 1/3] IB/core: Expose a device attribute for rdma_read access flags

2015-11-11 Thread Christoph Hellwig
On Wed, Nov 11, 2015 at 11:09:03AM +0200, Sagi Grimberg wrote:
> It does, but it doesn't look like something we'd want to check for each
> IO...

Both potential callers already have a access flags variable in object
that's assigned to at setup time so I don't see a problem here.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/3] IB/core: Expose a device attribute for rdma_read access flags

2015-11-11 Thread Christoph Hellwig
On Wed, Nov 11, 2015 at 11:07:14AM +0200, Sagi Grimberg wrote:
> 
> 
> On 10/11/2015 15:41, Christoph Hellwig wrote:
> >FYI, this is the API I'd aim for (only SRP and no HW driver converted
> >yet):
> 
> This looks fine, although personally I find scope and direction flags
> more confusing than access_flags (but maybe it's just me).

Looking at the drivers the current flags seem to confuse programmes
hard, given that the choises seem to be random values that just happen
to work:

rkeys:
iser:   lwrite | rwrite | rread
srp:lwrite | rwrite | rread
rds:lwrite | rread | rwrite
rds:rwrite
xprdtrdma (wr): rwrite | lwrite
xprdtrdma (rd): rread

lkeys:
isert:  lwrite
svcrdma:lwrite | rwrite

moving to a model where the consumer clearly says what they intend
to do with the MR seem much better than that.

> I think that the real issue here is the server/target side which needs
> extra logic for rdma_read and memory registration. If we can put this
> logic in the core and give ULPs an API that looks something like:
> - ib_rdma_read()
> - ib_rdma_write()
> 
> then maybe we don't need to change our flags?

The current flags don't make any sense for someone trying to use
the API.  They're pointless leakage of badly designed protocol specs.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: srp state in current mainline

2015-11-11 Thread Christoph Hellwig
On Wed, Nov 11, 2015 at 07:35:47AM -0800, Bart Van Assche wrote:
> On 11/10/2015 09:15 AM, Christoph Hellwig wrote:
> >This is a simply xfstests run using XFS on a remote LIO ramdisk.
> 
> Hello Christoph,
> 
> Which version of the kernel and LIO were installed at the target side ?

I've tried a couple different one from 4.1-rc3 to latest Linus tree
from yesterday, and the target side doesn't matter.

I've also bisected things down on the initiator side and the changes to
enable prefer_fr and register_always seem to be the culprit at least as
far as 4.3 is concerned.  If I disable both 4.3 is working fine, but
enabling either one causes frequent map failures.  Just rebuilding
a new Linus current tree to see if the options help there as well.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: srp state in current mainline

2015-11-11 Thread Christoph Hellwig
On Wed, Nov 11, 2015 at 08:03:46AM -0800, Bart Van Assche wrote:
> Hello Christoph,
> 
> The SRP initiator from kernel 4.3 is working fine on my test setup. I will
> start a test with Linus' tree and with the following SRP kernel module
> parameters:
> 
> # cat /etc/modprobe.d/ib_srp.conf
> options ib_srp cmd_sg_entries=255 register_always=1 prefer_fr=1

I just finished two runs with the current Linus tree.

register_always=N prefer_fr=N finishes fine.

default options isn't happy:

[   69.516617] run fstests generic/001 at 2015-11-11 08:07:39
[   73.649108] XFS (sdb): Unmounting Filesystem
[   73.649265] [ cut here ]
[   73.649276] WARNING: CPU: 2 PID: 1785 at 
drivers/infiniband/ulp/srp/ib_srp.c:1260 srp_map_desc+0x64/0x80 [ib_srp]()
[   73.649278] Modules linked in: xfs libcrc32c ib_srp scsi_transport_srp nfsd 
auth_rpcgss oid_regi stry nfs_acl nfs lockd grace fscache sunrpc 
intel_powerclamp coretemp kvm_intel kvm irqbypass crct1 0dif_pclmul 
crc32_pclmul sha256_ssse3 sha256_generic hmac drbg ansi_cprng snd_pcm mgag200 
ttm drm_k ms_helper aesni_intel snd_timer drm aes_x86_64 snd lrw gf128mul 
psmouse iTCO_wdt iTCO_vendor_support glue_helper soundcore evdev 
acpi_power_meter ablk_helper i2c_algo_bit lpc_ich serio_raw dcdbas i7core_edac 
ipmi_devintf pcspkr i2c_core cryptd edac_core shpchp button mfd_core ipmi_si 
ipmi_msghandler acpi_cpufreq tpm_tis tpm ib_ipoib ib_umad rdma_ucm rdma_cm 
iw_cm ib_uverbs ib_cm mlx4_ib ib_sa ib_mad ib_core ib_addr autofs4 ext4 crc16 
mbcache jbd2 sd_mod sg sr_mod cdrom ata_generic crc32c_intel mptsas ata_piix
[   73.649342]  ehci_pci uhci_hcd scsi_transport_sas ehci_hcd mptscsih libata 
mptbase mlx4_core usb core scsi_mod usb_common bnx2
[   73.649353] CPU: 2 PID: 1785 Comm: xfsaild/sdb Tainted: G  I 4.3.0+ 
#32
[   73.649356] Hardware name: Dell Inc. PowerEdge R710/00NH4P, BIOS 3.0.0 
01/31/2011
[   73.649358]  a04c2460 812c9d63  
8106ef61
[   73.649361]  8800c94a3a60 8800c94a3a30 0006102ad1c0 
0001
[   73.649364]  88002dd4c300 a04bba14 0801086e 
8800c94a3a60 
[   73.649367] Call Trace:
[   73.649376]  [] ? dump_stack+0x40/0x5d
[   73.649383]  [] ? warn_slowpath_common+0x81/0xb0
[   73.649387]  [] ? srp_map_desc+0x64/0x80 [ib_srp]
[   73.649391]  [] ? srp_map_finish_fr+0x159/0x1f0
[ib_srp]
[   73.649394]  [] ? srp_map_idb.isra.39+0xf1/0x150
[ib_srp]
[   73.649398]  [] ? srp_queuecommand+0xc21/0xc70
[ib_srp]
[   73.649405]  [] ? scsi_init_sgtable+0x3f/0x70
[scsi_mod]
[   73.649412]  [] ? scsi_dispatch_cmd+0xd8/0x1f0
[scsi_mod]
[   73.649418]  [] ? scsi_request_fn+0x46a/0x600
[scsi_mod]
[   73.649425]  [] ? __blk_run_queue+0x2f/0x40
[   73.649432]  [] ? cfq_insert_request+0x2f3/0x530
[   73.649436]  [] ? blk_flush_plug_list+0x1f5/0x220
[   73.649440]  [] ? blk_finish_plug+0x26/0x40
[   73.649459]  [] ?
__xfs_buf_delwri_submit+0x1b2/0x230 [xfs]
[   73.649474]  [] ?
xfs_buf_delwri_submit_nowait+0x1c/0x30 [xfs]
[   73.649489]  [] ?
xfs_buf_delwri_submit_nowait+0x1c/0x30 [xfs]
[   73.649503]  [] ? xfsaild+0x258/0x570 [xfs]
[   73.649516]  [] ?
xfs_trans_ail_cursor_first+0x80/0x80 [xfs]
[   73.649520]  [] ? kthread+0xcf/0xf0
[   73.649524]  [] ? kthread_park+0x50/0x50
[   73.649528]  [] ? ret_from_fork+0x3f/0x70
[   73.649531]  [] ? kthread_park+0x50/0x50
[   73.649534] ---[ end trace 2b5af006a2490f6b ]---
[   73.65] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f84e780
[   83.772430] scsi host3: ib_srp: reconnect succeeded
[   83.780566] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
[   93.900380] scsi host3: ib_srp: reconnect succeeded
[   93.908355] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
[  104.027018] scsi host3: ib_srp: reconnect succeeded
[  104.036106] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
[  114.155391] scsi host3: ib_srp: reconnect succeeded
[  114.163894] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
[  124.285540] scsi host3: ib_srp: reconnect succeeded
[  124.296033] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
[  134.416857] scsi host3: ib_srp: reconnect succeeded
[  134.427546] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
[  144.548077] scsi host3: ib_srp: reconnect succeeded
[  144.42] scsi host3: ib_srp: failed receive status WR flushed (5)
for iu 88060f801cc0
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 2/3] svcrdma: Use device rdma_read_access_flags

2015-11-11 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 11:25:46AM -0700, Jason Gunthorpe wrote:
> I like this, my only comment is we should have a rdma_cap for this
> behavior, rdma_cap_needs_rdma_read_mr(pd) or something?

Yes, that's better than checking the protocol.

> > +   if (!(dev->device_cap_flags &
> > IB_DEVICE_MEM_MGT_EXTENSIONS)) {
> 
> Lets enforce this in the core, if rdma_cap_needs_rdma_read_mr is set
> the the device must also set IB_DEVICE_MEM_MGT_EXTENSIONS, check at
> device creation time.

The iWarp verbs spec requires them to be supported, so that should not
be an issue.

> > +   } else if (rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) {
> > +   /*
> > +* For IB or RoCE life is easy, no unsafe write access is
> > +* required and multiple SGEs are supported, so we don't need
> > +* to use MRs.
> > +*/
> > +   newxprt->sc_reader = rdma_read_chunk_lcl;
> > +   } else {
> > +   /*
> > +* Neither iWarp nor IB-ish, we're out of luck.
> > +*/
> > goto errout;
> 
> No need for the else, !rdma_cap_needs_rdma_read_mr means pd->local_dma_lkey 
> is okay
> to use.

What would happen if someone tried to use NFS on usnic without this?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/3] IB/core: Expose a device attribute for rdma_read access flags

2015-11-11 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 11:36:27AM -0700, Jason Gunthorpe wrote:
> > n = ib_map_mr_sg(desc->mr, state->sg, state->sg_nents,
> > -dev->mr_page_size);
> > +dev->mr_page_size,
> > +/*
> > + * XXX: add a bool write argument to this function,
> > + * so that we only need to open up the required
> > + * permissions.
> > + */
> > +IB_MR_REMOTE | IB_MR_RDMA_READ |
> > IB_MR_RDMA_WRITE);
> 
> I would call it IB_RDMA_LKEY and IB_RDMA_RKEY. We have other places in
> the API where lkey/rkey is used and it makes a lot more sense to think
> about a MR as being either a lkey or rkey usable MR - since this is
> effectively what we are doing here with these ops.

Hmm, I really hate these suport short names, but if there is consensus I
can fix it up.

> > +enum ib_mr_flags {
> > +   /* scope: either remote or local */
> > +   IB_MR_REMOTE,
> > +   IB_MR_LOCAL,
> > +
> > +   /* direction: one or both can be ORed into the scope above */
> > +   IB_MR_RDMA_READ = (1 << 10),
> > +   IB_MR_RDMA_WRITE= (1 << 11)
> 
> Don't forget SEND too.

I don't think we're ever using that in the kernel, but it's an easy
addition for completeless.  Especially once we start exposing these
flags to the drivers.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 2/3] svcrdma: Use device rdma_read_access_flags

2015-11-11 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 04:55:51PM -0700, Jason Gunthorpe wrote:
> IMHO, the break point for when it makes sense to switch from a RDMA
> READ chain to a MR is going to be a RDMA core tunable. The performance
> curve won't have much to do with the ULP.

core/device, a lot of it depends on when we'd exceed the nr_sge limit
as well.  But yes, it needs to move out ot the ULP and into the core,
the ULP has no business here.

As said a few times before we need a core API that takes a struct
scatterlist and builds RDMA READ/WRITE requests from it in an efficient
way.  I have started talking to Sagi and preparing some build blocks
for it, but getting all corner cases right will take a while.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/3] IB/core: Expose a device attribute for rdma_read access flags

2015-11-11 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 11:01:56AM -0700, Jason Gunthorpe wrote:
> No need to change every driver.
> 
> I'd suggest something like
> 
>   unsigned int rdma_cap_rdma_read_mr_flags(const struct ib_pd *pd)
>   {
>  if (rdma_protocol_iwarp(pd->device, rdma_start_port(pd->device)))
>return IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE;
>  return IB_ACCESS_LOCAL_WRITE;
>   }

Yes, this looks good!
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


srp state in current mainline

2015-11-10 Thread Christoph Hellwig
I've just tried forward porting some work affecting SRP from a 4.1-ish
base, and started to run into error ASAP on current Linus' HEAD and also
4.3.  In current HEAD memory registrations on the client seem to fail,
probably due to the MR rework, but even on 4.3 I run into crazy
corruption reports from xfstests, which mostly seem to be slab
poisoning.  I'm not sure at this point if they are caused by the
target or initiator, but I'd like to share them.  This is a simply
xfstests run using XFS on a remote LIO ramdisk.

4.3 (actually -rc, but I didn't see any change since):

[   86.316719] run fstests generic/018 at 2015-11-10 08:52:56
[   86.558749] XFS (sdb): Mounting V4 Filesystem
[   86.798915] XFS (sdb): Ending clean mount
[   86.887999] XFS (sdc): Mounting V4 Filesystem
[   86.894340] XFS (sdc): Ending clean mount
[   86.980603] XFS (sdc): Unmounting Filesystem
[   86.992347] XFS (sdc): Mounting V4 Filesystem
[   86.999572] XFS (sdc): Ending clean mount
[   87.052941] XFS (sdc): Unmounting Filesystem
[   87.065080] XFS (sdc): Mounting V4 Filesystem
[   87.070402] XFS (sdc): Ending clean mount
[   87.124831] XFS (sdc): Unmounting Filesystem
[   87.136828] XFS (sdc): Mounting V4 Filesystem
[   87.144746] XFS (sdc): Ending clean mount
[   87.157052] XFS (sdc): Metadata corruption detected at 
xfs_agi_read_verify+0x4a/0xe0 [xfs], block 0x2
[   87.166374] XFS (sdc): Unmount and run xfs_repair
[   87.171161] XFS (sdc): First 64 bytes of corrupted metadata buffer:
[   87.177515] 880314c2ce00: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
6b  
[   87.186312] 880314c2ce10: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
6b  
[   87.195107] 880314c2ce20: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
6b  
[   87.203902] 880314c2ce30: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 
6b  
[   87.212716] XFS (sdc): metadata I/O error: block 0x2
("xfs_trans_read_buf_map") error 117 numblks 1
[   87.221816] XFS (sdc): xfs_do_force_shutdown(0x1) called from line
315 of file fs/xfs/xfs_trans_buf.c.  Return address = 0xa067138c
   87.221828] XFS (sdc): I/O Error Detected. Shutting down filesystem
   [   87.228132] XFS (sdc): Please umount the filesystem and rectify
   the problem(s)
   [   87.328890] XFS (sdc): xfs_log_force: error -5 returned.
   [   87.328897] XFS (sdc): Unmounting Filesystem
   [   87.328906] XFS (sdc): xfs_log_force: error -5 returned.
   [   87.328921] XFS (sdc): xfs_log_force: error -5 returned.
   [   87.432013] run fstests generic/020 at 2015-11-10 08:52:57

and then a couple times more until xfstests eventually gives up.

On current Linus' HEAD tree:

[  128.786534] run fstests generic/001 at 2015-11-10 09:05:06
[  132.914320] XFS (sdb): Unmounting Filesystem
[  132.914463] [ cut here ]
[  132.914474] WARNING: CPU: 3 PID: 1795 at 
drivers/infiniband/ulp/srp/ib_srp.c:1262 srp_map_desc+0x64/ 0x80 [ib_srp]()
[  132.914476] Modules linked in: xfs libcrc32c ib_srp(O) scsi_transport_srp 
nfsd auth_rpcgss oid_regis
try nfs_acl nfs lockd grace fscache sunrpc intel_powerclamp coretemp kvm_intel 
kvm irqbypass crct10dif_
pclmul crc32_pclmul sha256_ssse3 sha256_generic hmac drbg mgag200 ttm 
ansi_cprng drm_kms_helper aesni_i
ntel drm aes_x86_64 lrw gf128mul snd_pcm glue_helper ablk_helper snd_timer 
evdev cryptd ipmi_devintf i7
core_edac shpchp iTCO_wdt iTCO_vendor_support psmouse snd soundcore 
i2c_algo_bit i2c_core lpc_ich serio
_raw edac_core dcdbas acpi_power_meter pcspkr mfd_core acpi_cpufreq button 
tpm_tis ipmi_si ipmi_msghand
ler tpm ib_ipoib ib_umad rdma_ucm rdma_cm iw_cm ib_uverbs ib_cm mlx4_ib ib_sa 
ib_mad ib_core ib_addr au
tofs4 ext4 crc16 mbcache jbd2 sd_mod sg sr_mod cdrom ata_generic crc32c_intel 
ehci_pci uhci_hcd
[  132.914541]  ehci_hcd mptsas ata_piix scsi_transport_sas mptscsih
libata mptbase mlx4_core usbcore s
csi_mod usb_common bnx2
[  132.914552] CPU: 3 PID: 1795 Comm: xfsaild/sdb Tainted: G  IO 4.3.0+ 
#28
[  132.914554] Hardware name: Dell Inc. PowerEdge R710/00NH4P, BIOS 3.0.0 
01/31/2011
[  132.914556]  a05aa460 812c9453  
8106ef51
[  132.914559]  880313acfa60 880313acfa30 0006153efe80 
0001
[  132.914562]  880620a310c0 a05a3a14 08012207 
880313acfa60
[  132.914566] Call Trace:
[  132.914572]  [] ? dump_stack+0x40/0x5d
[  132.914578]  [] ? warn_slowpath_common+0x81/0xb0
[  132.914582]  [] ? srp_map_desc+0x64/0x80 [ib_srp]
[  132.914585]  [] ? srp_map_finish_fr+0x159/0x1f0 [ib_srp]
[  132.914589]  [] ? srp_map_idb.isra.39+0xf1/0x150 [ib_srp]
[  132.914593]  [] ? srp_queuecommand+0xc21/0xc70 [ib_srp]
[  132.914600]  [] ? scsi_init_sgtable+0x3f/0x70 [scsi_mod]
[  132.914607]  [] ? scsi_dispatch_cmd+0xd8/0x1f0 [scsi_mod]
[  132.914613]  [] ? scsi_request_fn+0x46a/0x600 [scsi_mod]
[  132.914619]  [] ? __blk_run_queue+0x2f/0x40
[  132.914624]  [] ? cfq_insert_request+0x2f3/0x530
[  

Re: [PATCH RFC 2/3] svcrdma: Use device rdma_read_access_flags

2015-11-10 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 01:46:40PM +0200, Sagi Grimberg wrote:
> 
> 
> On 10/11/2015 13:41, Christoph Hellwig wrote:
> >Oh, and while we're at it.  Can someone explain why we're even
> >using rdma_read_chunk_frmr for IB?  It seems to work around the
> >fact tat iWarp only allow a single RDMA READ SGE, but it's used
> >whenever the device has IB_DEVICE_MEM_MGT_EXTENSIONS, which seems
> >wrong.
> 
> I think Steve can answer it better than I can. I think that it is
> just to have a single code path for both IB and iWARP. I agree that
> the condition seems wrong and for small transfers rdma_read_chunk_frmr
> is really a performance loss.

Well, the code path already exists, but only is used fi
IB_DEVICE_MEM_MGT_EXTENSIONS isn't set.  Below is an untested patch
that demonstrates how I think svcrdma should setup the reads.  Note
that this also allows to entirely remove it's allphys MR.

Note that as a followon this would also allow to remove the
non-READ_W_INV code path from rdma_read_chunk_frmr as a future
step.


diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index f869807..35fa638 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -147,7 +147,6 @@ struct svcxprt_rdma {
struct ib_qp *sc_qp;
struct ib_cq *sc_rq_cq;
struct ib_cq *sc_sq_cq;
-   struct ib_mr *sc_phys_mr;   /* MR for server memory */
int  (*sc_reader)(struct svcxprt_rdma *,
  struct svc_rqst *,
  struct svc_rdma_op_ctxt *,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c 
b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index ff4f01e..a410cb6 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -160,7 +160,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
goto err;
atomic_inc(>sc_dma_used);
 
-   /* The lkey here is either a local dma lkey or a dma_mr lkey */
+   /* The lkey here is the local dma lkey */
ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
ctxt->sge[pno].length = len;
ctxt->count++;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c 
b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 9f3eb89..2780da4 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -887,8 +887,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt 
*xprt)
struct ib_cq_init_attr cq_attr = {};
struct ib_qp_init_attr qp_attr;
struct ib_device *dev;
-   int uninitialized_var(dma_mr_acc);
-   int need_dma_mr = 0;
int ret = 0;
int i;
 
@@ -986,68 +984,41 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt 
*xprt)
}
newxprt->sc_qp = newxprt->sc_cm_id->qp;
 
-   /*
-* Use the most secure set of MR resources based on the
-* transport type and available memory management features in
-* the device. Here's the table implemented below:
-*
-*  FastGlobal  DMA Remote WR
-*  Reg LKEYMR  Access
-*  Sup'd   Sup'd   Needed  Needed
-*
-* IWARPN   N   Y   Y
-*  N   Y   Y   Y
-*  Y   N   Y   N
-*  Y   Y   N   -
-*
-* IB   N   N   Y   N
-*  N   Y   N   -
-*  Y   N   Y   N
-*  Y   Y   N   -
-*
-* NB:  iWARP requires remote write access for the data sink
-*  of an RDMA_READ. IB does not.
-*/
-   newxprt->sc_reader = rdma_read_chunk_lcl;
-   if (dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
-   newxprt->sc_frmr_pg_list_len =
-   dev->max_fast_reg_page_list_len;
-   newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
-   newxprt->sc_reader = rdma_read_chunk_frmr;
-   }
+   if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) {
+   /*
+* iWARP requires remote write access for the data sink, and
+* only supports a single SGE for RDMA_READ requests, so we'll
+* have to use a memory registration for each RDMA_READ.
+*/
+   if (!(dev->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
+   /*
+* We're an iWarp device but don't support FRs.
+* Tought luck, better exit early as we have little
+  

Re: [PATCH RFC 1/3] IB/core: Expose a device attribute for rdma_read access flags

2015-11-10 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 12:44:13PM +0200, Sagi Grimberg wrote:
> Different devices may require different access permissions
> for rdma reads e.g. for Infiniband devices, local write access
> suffices while iWARP devices require remote write permissions as
> well.
> 
> This situation generates extra logic for ULPs that need to be aware
> of the underlying device to determine rdma read access. Instead,
> add a device attribute for ULPs to use without being aware of the
> underlying device.
> 
> Also, set rdma_read_access_flags in the relevant device drivers:
> mlx4, mlx5, qib, ocrdma, nes: IB_ACCESS_LOCAL_WRITE
> cxgb3, cxgb4, hfi: IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE

>From all I can tell nes also is a iWarp driver.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB: start documenting device capabilities

2015-11-10 Thread Christoph Hellwig
Just IB_DEVICE_LOCAL_DMA_LKEY and IB_DEVICE_MEM_MGT_EXTENSIONS for now
as I'm most familar with those.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 include/rdma/ib_verbs.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 45ce36e..6034f92 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -120,6 +120,14 @@ enum ib_device_cap_flags {
IB_DEVICE_RC_RNR_NAK_GEN= (1<<12),
IB_DEVICE_SRQ_RESIZE= (1<<13),
IB_DEVICE_N_NOTIFY_CQ   = (1<<14),
+
+   /*
+* This device supports a per-device lkey or stag that can be
+* used without performing a memory registration for the local
+* memory.  Note that ULPs should never check this flag, but
+* instead of use the local_dma_lkey flag in the ib_pd structure,
+* which will always contain a usable lkey.
+*/
IB_DEVICE_LOCAL_DMA_LKEY= (1<<15),
IB_DEVICE_RESERVED  = (1<<16), /* old SEND_W_INV */
IB_DEVICE_MEM_WINDOW= (1<<17),
@@ -133,6 +141,16 @@ enum ib_device_cap_flags {
IB_DEVICE_UD_IP_CSUM= (1<<18),
IB_DEVICE_UD_TSO= (1<<19),
IB_DEVICE_XRC   = (1<<20),
+
+   /*
+* This device supports the IB "base memory management extension",
+* which includes support for fast registrations (IB_WR_REG_MR,
+* IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs).  This flag should
+* also be set by any iWarp device which must support FRs to comply
+* to the iWarp verbs spec.  iWarp devices also support the
+* IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
+* stag.
+*/
IB_DEVICE_MEM_MGT_EXTENSIONS= (1<<21),
IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22),
IB_DEVICE_MEM_WINDOW_TYPE_2A= (1<<23),
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/3] IB/core: Expose a device attribute for rdma_read access flags

2015-11-10 Thread Christoph Hellwig
On Tue, Nov 10, 2015 at 02:42:44PM +0200, Sagi Grimberg wrote:
> >When this approach is used, the upper layer doesn't have to be aware
> >at all of the lower layer's details.
> 
> Yea, we could do that too. Any preferences from other people?
> I'm pretty indifferent on which way to go...

Yes, that's the way to go.  I though we had agree on doing that as
one of the next step of the MR API cleanups, but decided to postpone
it past the first iteration for some reason that made it non-trivial.

I have to say I still don't like the Windows API either, though as it
still keeps the criptic {local,remote}_{read,write} flags.

Basically what we want is two-dimension selection:

/*
 * If %true this is the target of RDMA READ/WRITE operations
 * from the remote system.  If %false this is the sink / source
 * for RDMA READ/WRITE operations issued by the local system.
 */
enum ib_mr_scope { IB_MR_REMOTE, IB_MR_LOCAL };

/*
 * Operation we're registering for.  Can be OR'ed together
 * when allowing RDMA READs and WRITEs on a single MR.
 */
enum ib_mr_dir { IB_MR_RDMA_READ = 1 << 0, IB_MR_RDMA_WRITE = 1 << 1 };

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx: Expose max_fmr to ib_query_device

2015-11-05 Thread Christoph Hellwig
On Thu, Nov 05, 2015 at 12:48:45PM +0200, Yuval Shaia wrote:
> On Thu, Oct 29, 2015 at 07:21:45PM +0200, Sagi Grimberg wrote:
> > Hi Yuval,
> > 
> > The title prefix should be IB/mlx4:
> > 
> > >Expose max_fmr so it will be available to ULPs.
> > >max_fmr is num_mpts minus reserved.

What do you plan to use it for?  We're aiming hard to get rid of FMRs
from the whole kernel stack ASAP.

In doubt whatever you plan to use it for should be converted to FRs
instead.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/mlx: Expose max_fmr to ib_query_device

2015-11-05 Thread Christoph Hellwig
On Thu, Nov 05, 2015 at 01:07:49PM +0200, Yuval Shaia wrote:
> > What do you plan to use it for?  We're aiming hard to get rid of FMRs
> > from the whole kernel stack ASAP.
> We have two drivers that are using FMR.

I now RDS still has the totally silly separate code bases for IB vs
iWrap and you need to move IB to use the IW code for memory
registrations, but that's the second one?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/7] IB/srp: Fix a potential queue overflow in an error path

2015-11-03 Thread Christoph Hellwig
On Tue, Nov 03, 2015 at 12:50:59PM -0800, Bart Van Assche wrote:
> Such a check wouldn't be that simple because the only way to perform such a
> check is either by doubling the number of ib_map_mr_sg() calls or by
> performing additional memory allocations.

The other option woud be to disallow gappy SG lists unless supported by
the hardware with a single MR similar to iSER.  While this would leave
the SRP protocol support for multiple SG entries per command unused it
would significantly simplify the driver.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: shrink struct ib_send_wr V4

2015-10-29 Thread Christoph Hellwig
On Wed, Oct 28, 2015 at 10:57:59PM -0400, Doug Ledford wrote:
> > I had to do a minor hand merge to get this to apply, but it has been
> > pulled in for 4.4.
> 
> This breaks all of the drivers in staging BTW.  That will need fixed up
> before the pull request goes in during the merge window.

That latest branch on infradead.org should have fixed all the staging
drivers.  But Linus just clarified that we indeed do not have to care for
staging drivers, I asked him at kernel summit in front of the whole audience.

If you were not merging the latest branch you might also have to pick up
the mlx5 fix separately, Eli or Sagi might be able to help with that as
I'll be offline for a few days.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-21 Thread Christoph Hellwig
On Tue, Oct 20, 2015 at 06:07:31PM +0300, Or Gerlitz wrote:
> But this makes struct ib_device much much bigger, and this structure
> is used in **fast** path,
> e.g the ULP traverses through a pointer from struct ib_device to
> post_send/recv, poll_cq and friends.

But it doesn't get near the end of the structure.  Maybe it's time for a
little cache line 101 primer?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-21 Thread Christoph Hellwig
On Tue, Oct 20, 2015 at 06:13:35PM +0300, Sagi Grimberg wrote:
>> The networking community will let you work for 10y before they add a
>> field to struct net_device exactly b/c
>> of the reason I brought. Why here we can come out of the blue and add
>> tens if not hundreds of fields to our device structure?
>
> Keeping struct ib_device_attr embedded at the end of struct ib_device is
> exactly the same isn't it?

In terms of layout it is, in terms of usability it's worse.  Just to take
the networking example Or seems to like so much there is no struct netdev_attr
to which fields of struct netdev are moved out to, which you then need to
query and cache anyway in your private device structure.   This design simply
was a idiotic idea to start with and I don't understand why anyone would
han onto it.  I do understand Chuck's concern about churn, but I'd rather
fix the infiniband subsystem up before we're growing even more users.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-21 Thread Christoph Hellwig
On Wed, Oct 21, 2015 at 09:44:41AM +0300, Or Gerlitz wrote:
> Fact is that for struct net_device you will not add 333 new fields over 
> night in the coming 33 years, for sure.

That's because they never had this split and added fields to struct netdev
as required.  One interesting difference in netdev is that it move all
the function pointers out to a different structure so it could be kept
const.  This introduces one more level of pointer chasing but has other
advantages.

> This makes much sense to me, as a guideline. I don't think we should have a 
> device structure with the current
> 100 fields and another few hundreds without any notable benefit and against 
> the common practice in the networking
> subsystem.

Please understand the networking subsystem (or SCSI, or NVMe, or ...) before
making such incorrect comments.  We're moving towards how all other
subsystems work.

> We have a UAPI that requires us to keep the device query verb for ever, and 
> hence the returned device attr struct,
> it would be a much smaller and noisy patch to cache an device attr pointer 
> on the device structure.

Take a look at the code.  The only time we ever call into ->query_device is
for the userspace only timestamping extensions only implemented for mlx4.

With all the stuff we have on the plate the kernel API will look substatially
different from the arkane 'Verbs' implementation in userspace, and they will
use less and less common code.  Libuvers and the ABIs it uses are something
we can't change unfortunately, but we can make the kernel API much much
better by learining lessons from other kernel subsystems.  That's work
that Jason Sagi and me have been doing for a while.

I'd also suggest you update your Linux knowledge before trying to
micromanage patch submissions.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-21 Thread Christoph Hellwig
On Wed, Oct 21, 2015 at 10:11:29AM +0300, Or Gerlitz wrote:
>
> We will have many more device query extensions,

None of which use struct ib_device_attr I hope!

> but the point I tried to 
> make here is a bit different --
> we do need to keep the user/kernel device attr struct as part of the UAPI, 

It's an entirely separate ib_uverbs_query_device_resp structure. 

> and don't see any reason to
> avoid it in the kernel, just because some other subsystems do that, 
> according to your view. As I said, you
> would have to work real (real) hard to convince the networking ppl to add 
> fields to struct net_device sk_buff

Or, please stop these bullshit strawman arguments.  Yes, adding fields to
struct sk_buff will be hard, but that's not the right comparism.  struct
sk_buff is a structured allocated for the data buffers in great quantities and
not the net_device structure allocated once per device.  I'm going to
finish this email, but if you don't even try to get your facts right I'll stop
responding ro you because it's futile.

> and friends... the nature of rdma/offloading is such that new attr come 
> often and I don't think we need
> to touch our device struct every release.

For anything you want to add you need to touch _a_ struct.  It's not any
difference in efforts if it's ib_device_attr or ib_device.

You're not even saving much memory if at all as every driver caches at
least some fields of it in their own device structure, and iser, isert, 
srpt and nfs cache the full structure, so if you use one of those you're
already having one of them per device.  If you use two of them you
have multiple copies plus individual fields caches by other ULDs and core
code.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-21 Thread Christoph Hellwig
On Wed, Oct 21, 2015 at 10:41:13AM +0300, Or Gerlitz wrote:
> I know, but a patch that adds caching an attr pointer on the device will 
> remove these local caches,
> we actually had that/similar patch posted here and it was dropped/forgotten.
>
>> so if you use one of those you're
>> already having one of them per device.  If you use two of them you
>> have multiple copies plus individual fields caches by other ULDs and core
>> code.
>
> again, the method I propose does remove these duplications all together.
>
> I am still waiting to hear what is the precise argument for having this 
> change.

It does everything the pointer does, but in addition saves memory (no
rounding up of the slab size), reduces pointer chasing and makes and API
that's easier to use and more similar to how all other major Linux subsystems
work.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-20 Thread Christoph Hellwig
On Tue, Oct 20, 2015 at 04:08:54PM +0300, Sagi Grimberg wrote:
> Can you explain what you mean by performance gains? My understanding is
> that this patch is not performance critical. It just replaces each and
> every ULP device attributes caching.

Exactly.  The only 'performance' we care about is that of ULD writers :)
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB: merge struct ib_device_attr into struct ib_device

2015-10-12 Thread Christoph Hellwig
Avoid the need to query for device attributes and store them in a
separate structure by merging struct ib_device_attr into struct
ib_device.  This matches how the device structures are used in most
Linux subsystems.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/core/cm.c   |  12 +-
 drivers/infiniband/core/cma.c  |   8 -
 drivers/infiniband/core/device.c   |  20 ---
 drivers/infiniband/core/fmr_pool.c |  20 +--
 drivers/infiniband/core/sysfs.c|  14 +-
 drivers/infiniband/core/uverbs_cmd.c   | 128 +++-
 drivers/infiniband/core/verbs.c|   8 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c|  60 +++-
 drivers/infiniband/hw/cxgb4/provider.c |  64 +++-
 drivers/infiniband/hw/mlx4/main.c  | 169 -
 drivers/infiniband/hw/mlx5/main.c  | 116 ++
 drivers/infiniband/hw/mthca/mthca_provider.c   |  77 +-
 drivers/infiniband/hw/nes/nes_verbs.c  |  94 +---
 drivers/infiniband/hw/ocrdma/ocrdma_main.c |  40 -
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c|  49 --
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h|   2 -
 drivers/infiniband/hw/qib/qib_verbs.c  |  86 +--
 drivers/infiniband/hw/usnic/usnic_ib_main.c|   3 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c   |  50 ++
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h   |   4 +-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c|  19 +--
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c   |  14 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |  21 +--
 drivers/infiniband/ulp/iser/iscsi_iser.c   |   4 +-
 drivers/infiniband/ulp/iser/iscsi_iser.h   |   2 -
 drivers/infiniband/ulp/iser/iser_memory.c  |   9 +-
 drivers/infiniband/ulp/iser/iser_verbs.c   |  38 ++---
 drivers/infiniband/ulp/isert/ib_isert.c|  43 ++
 drivers/infiniband/ulp/isert/ib_isert.h|   1 -
 drivers/infiniband/ulp/srp/ib_srp.c|  32 ++--
 drivers/infiniband/ulp/srpt/ib_srpt.c  |  15 +-
 drivers/infiniband/ulp/srpt/ib_srpt.h  |   3 -
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c|  21 +--
 drivers/staging/rdma/amso1100/c2.h |   3 -
 drivers/staging/rdma/amso1100/c2_pd.c  |   6 +-
 drivers/staging/rdma/amso1100/c2_provider.c|  23 +--
 drivers/staging/rdma/amso1100/c2_rnic.c|  63 +++-
 drivers/staging/rdma/ehca/ehca_hca.c   |  78 +-
 drivers/staging/rdma/ehca/ehca_iverbs.h|   3 +-
 drivers/staging/rdma/ehca/ehca_main.c  |   3 +-
 drivers/staging/rdma/hfi1/verbs.c  |  89 +--
 drivers/staging/rdma/ipath/ipath_verbs.c   |  90 +--
 include/rdma/ib_verbs.h|  98 ++--
 net/rds/ib.c   |  28 +---
 net/rds/iw.c   |  23 +--
 net/sunrpc/xprtrdma/frwr_ops.c |   7 +-
 net/sunrpc/xprtrdma/svc_rdma_transport.c   |  48 +++---
 net/sunrpc/xprtrdma/verbs.c|  24 +--
 net/sunrpc/xprtrdma/xprt_rdma.h|   1 -
 49 files changed, 725 insertions(+), 1108 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index ea4db9c..56c7a70 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3749,16 +3749,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
 }
 EXPORT_SYMBOL(ib_cm_init_qp_attr);
 
-static void cm_get_ack_delay(struct cm_device *cm_dev)
-{
-   struct ib_device_attr attr;
-
-   if (ib_query_device(cm_dev->ib_device, ))
-   cm_dev->ack_delay = 0; /* acks will rely on packet life time */
-   else
-   cm_dev->ack_delay = attr.local_ca_ack_delay;
-}
-
 static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
   char *buf)
 {
@@ -3870,7 +3860,7 @@ static void cm_add_one(struct ib_device *ib_device)
return;
 
cm_dev->ib_device = ib_device;
-   cm_get_ack_delay(cm_dev);
+   cm_dev->ack_delay = ib_device->local_ca_ack_delay;
cm_dev->going_down = 0;
cm_dev->device = device_create(_class, _device->dev,
   MKDEV(0, 0), NULL,
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index b1ab13f..077c4e2 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1847,7 +1847,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
int ret;
-   struct ib_device_attr attr;
   

merge struct ib_device_attr into struct ib_device V2

2015-10-12 Thread Christoph Hellwig
This patch gets rid of struct ib_device_attr and cleans up drivers nicely.

It goes on top of my send_wr cleanups and the memory registration udpates
from Sagi.

Changes since V1:
 - rebased on top of the Sagi's latest reg_api.6 branch

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device V2

2015-10-12 Thread Christoph Hellwig
On Mon, Oct 12, 2015 at 12:26:06PM +0300, Sagi Grimberg wrote:
> First go with this looks OK for mlx4. mlx5 needs the below incremental
> patch to be folded in.
>
> we need dev->ib_dev.max_pkeys set when get_port_caps() is called.

Thanks, I've folded your patch and force pushed out the updated tree.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Ksummit-discuss] [TECH TOPIC] IRQ affinity

2015-10-12 Thread Christoph Hellwig
On Mon, Oct 12, 2015 at 12:09:48PM -0400, Theodore Ts'o wrote:
> Hi Christoph,
> 
> Do you think this is still an issue that would be worth discsussing at
> the kernel summit as a technical topic?  If so, would you be willing
> to be responsible for kicking off the discussion for this topic?

Hi Ted,

while we have a high level agreement there's still some discussion
needed.  I can prepare a few slides for 10 minute discussion and then
take it to the hallways with the interested people.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: shrink struct ib_send_wr V4

2015-10-11 Thread Christoph Hellwig
On Sun, Sep 13, 2015 at 05:13:33PM +0200, Christoph Hellwig wrote:
> This series shrinks the WR size by splitting out the different WR
> types.
> 
> Patch number one is too large for the mailinglist, so if you didn't
> get it grab it here:
> 
>   
> http://git.infradead.org/users/hch/rdma.git/commitdiff_plain/c752d80937ff2d71f25ae7fcdd1a151054fb2ceb
> 
> or the full git tree at:
> 
>   git://git.infradead.org/users/hch/rdma.git wr-cleanup

This git tree has been updated with the latests ACKs and has been
rebased to the latest rdma tree which introduced a few trivial rejects.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB: merge struct ib_device_attr into struct ib_device

2015-10-11 Thread Christoph Hellwig
On Sun, Oct 11, 2015 at 07:16:27PM +0300, Sagi Grimberg wrote:
> Christoph, would you mind rebasing it on top of 4.3-rc4 or so? I
> want to develop over it so I can test it on the fly.

I can do a rebase to whatever you want.  Initially this was
over your reg_api branch.  Is a rebase to the latest reg_api.6
fine?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] iser: Set block queue_virt_boundary and remove bounce buffering code

2015-10-10 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig <h...@lst.de>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] IB/iser: Enable SG clustering

2015-10-10 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig <h...@lst.de>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 00/26] New fast registration API

2015-10-08 Thread Christoph Hellwig
On Thu, Oct 08, 2015 at 10:50:43AM +0300, Sagi Grimberg wrote:
> - Rebased against Doug's for-4.4 tree (4.3.0-rc1) + 4.3-rc fixes

Does this means its not on top of the send_wr changes now?  I'm fine
with that as I think your series is even more important than the send_wr
split and I'm happy to rebase, but I'd also really like to see some
progress on them..
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 00/24] New fast registration API

2015-10-07 Thread Christoph Hellwig
On Tue, Oct 06, 2015 at 11:37:40AM +0300, Sagi Grimberg wrote:
> The issue is that the device requires the MR page array to have
> an alignment (0x40 for mlx4 and 0x400 for mlx5). When I modified the
> page array allocation to be non-coherent I didn't take care of
> alignment.

Just curious:  why did you switch away from the coheret dma allocations
anyway?  Seems like the page lists are mapped as long as they are
allocated so the coherent allocator would seem like a nice fit.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 00/24] New fast registration API

2015-10-07 Thread Christoph Hellwig
On Wed, Oct 07, 2015 at 12:25:25PM +0300, Sagi Grimberg wrote:
> Bart suggested that having to sync once for the entire page list might
> perform better than coherent memory. I'll settle either way since using
> non-coherent memory might cause higher-order allocations due to
> alignment, so it's not free-of-charge.

I don't really care either way, it just seemed like an odd change hiding
in here that I missed when reviewing earlier.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 01/24] IB/core: Introduce new fast registration API

2015-09-28 Thread Christoph Hellwig
On Mon, Sep 28, 2015 at 01:57:52PM -0700, Bart Van Assche wrote:
> >Actually I think it doesn't since it is only relevant for the else
> >statement where we are passing the page_size boundary.
> 
> Hello Sagi,
> 
> Suppose that the following sg-list is passed to this function as { offset,
> length } pairs and that this list has not been coalesced by the DMA mapping
> code: [ { 0, page_size / 4 }, { page_size / 4, page_size / 4 }, { 2 *
> page_size / 4, page_size / 2 } ]. I think the algorithm in patch 01/24 will
> map the above sample sg-list onto two pages. Shouldn't that sg-list be
> mapped onto one page instead ?

Shouldn't higher layers take care of this?  Trying to implement the same
coalescing algorithm at various layers isn't very optimal, although we
need to decide and document which one is responsible.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: libmlx4 and libmlx5 git trees? Who is handling those?

2015-09-28 Thread Christoph Hellwig

Hi Robert,

getting a package out should not be an issue.  master should always be
in releasable state, and cutting a release should be a simple shell
script doing all the tagging and uploading.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] IB: merge struct ib_device_attr into struct ib_device

2015-09-23 Thread Christoph Hellwig
Avoid the need to query for device attributes and store them in a
separate structure by merging struct ib_device_attr into struct
ib_device.  This matches how the device structures are used in most
Linux subsystems.

Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 drivers/infiniband/core/cm.c   |  12 +-
 drivers/infiniband/core/cma.c  |   8 -
 drivers/infiniband/core/device.c   |  20 ---
 drivers/infiniband/core/fmr_pool.c |  20 +--
 drivers/infiniband/core/sysfs.c|  14 +-
 drivers/infiniband/core/uverbs_cmd.c   | 128 +++-
 drivers/infiniband/core/verbs.c|   8 +-
 drivers/infiniband/hw/cxgb3/iwch_provider.c|  60 +++-
 drivers/infiniband/hw/cxgb4/provider.c |  64 +++-
 drivers/infiniband/hw/mlx4/main.c  | 167 -
 drivers/infiniband/hw/mlx5/main.c  | 116 ++
 drivers/infiniband/hw/mthca/mthca_provider.c   |  77 +-
 drivers/infiniband/hw/nes/nes_verbs.c  |  94 +---
 drivers/infiniband/hw/ocrdma/ocrdma_main.c |  40 -
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c|  49 --
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h|   2 -
 drivers/infiniband/hw/qib/qib_verbs.c  |  86 +--
 drivers/infiniband/hw/usnic/usnic_ib_main.c|   3 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c   |  50 ++
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h   |   4 +-
 drivers/infiniband/ulp/ipoib/ipoib_cm.c|  19 +--
 drivers/infiniband/ulp/ipoib/ipoib_ethtool.c   |  14 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c  |  21 +--
 drivers/infiniband/ulp/iser/iscsi_iser.c   |   4 +-
 drivers/infiniband/ulp/iser/iscsi_iser.h   |   2 -
 drivers/infiniband/ulp/iser/iser_memory.c  |   9 +-
 drivers/infiniband/ulp/iser/iser_verbs.c   |  38 ++---
 drivers/infiniband/ulp/isert/ib_isert.c|  43 ++
 drivers/infiniband/ulp/isert/ib_isert.h|   1 -
 drivers/infiniband/ulp/srp/ib_srp.c|  32 ++--
 drivers/infiniband/ulp/srpt/ib_srpt.c  |  15 +-
 drivers/infiniband/ulp/srpt/ib_srpt.h  |   3 -
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c|  21 +--
 drivers/staging/rdma/amso1100/c2.h |   3 -
 drivers/staging/rdma/amso1100/c2_pd.c  |   6 +-
 drivers/staging/rdma/amso1100/c2_provider.c|  23 +--
 drivers/staging/rdma/amso1100/c2_rnic.c|  63 +++-
 drivers/staging/rdma/ehca/ehca_hca.c   |  78 +-
 drivers/staging/rdma/ehca/ehca_iverbs.h|   3 +-
 drivers/staging/rdma/ehca/ehca_main.c  |   3 +-
 drivers/staging/rdma/hfi1/verbs.c  |  89 +--
 drivers/staging/rdma/ipath/ipath_verbs.c   |  90 +--
 include/rdma/ib_verbs.h|  98 ++--
 net/rds/ib.c   |  28 +---
 net/rds/iw.c   |  23 +--
 net/sunrpc/xprtrdma/frwr_ops.c |   7 +-
 net/sunrpc/xprtrdma/svc_rdma_transport.c   |  48 +++---
 net/sunrpc/xprtrdma/verbs.c|  23 +--
 net/sunrpc/xprtrdma/xprt_rdma.h|   1 -
 49 files changed, 723 insertions(+), 1107 deletions(-)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index ea4db9c..56c7a70 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -3749,16 +3749,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
 }
 EXPORT_SYMBOL(ib_cm_init_qp_attr);
 
-static void cm_get_ack_delay(struct cm_device *cm_dev)
-{
-   struct ib_device_attr attr;
-
-   if (ib_query_device(cm_dev->ib_device, ))
-   cm_dev->ack_delay = 0; /* acks will rely on packet life time */
-   else
-   cm_dev->ack_delay = attr.local_ca_ack_delay;
-}
-
 static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
   char *buf)
 {
@@ -3870,7 +3860,7 @@ static void cm_add_one(struct ib_device *ib_device)
return;
 
cm_dev->ib_device = ib_device;
-   cm_get_ack_delay(cm_dev);
+   cm_dev->ack_delay = ib_device->local_ca_ack_delay;
cm_dev->going_down = 0;
cm_dev->device = device_create(_class, _device->dev,
   MKDEV(0, 0), NULL,
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index b1ab13f..077c4e2 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1847,7 +1847,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id,
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
int ret;
-   struct ib_device_attr attr;
   

Re: [PATCH] IB: merge struct ib_device_attr into struct ib_device

2015-09-23 Thread Christoph Hellwig
On Wed, Sep 23, 2015 at 10:23:15AM -0700, Chuck Lever wrote:
> Getting rid of ib_query_device() makes sense. Moving the device
> attributes into ib_device is nice. Getting rid of ib_device_attr
> is of questionable value. Why do we need to go there?
> 
> IB core API changes generate merge conflicts with feature work
> in the ULPs. It’s a pain for maintainers. In this particular
> case, keeping ib_device_attr would greatly reduce ULP churn.

But it will keep a totally nonsenical structure around in the future.

Note that the whole in-kernel Verbs API is a bit of a nightmare, so
there will be all kinds of larger changes.  Better get it right as long
as we have as few ULPs as we have now.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: merge struct ib_device_attr into struct ib_device

2015-09-22 Thread Christoph Hellwig
Hi Yann,

looks like the patch was too large and majordomo ate it

Here is a link:

http://git.infradead.org/users/hch/rdma.git/commitdiff/0e46553467cd01b63ab9c985f87c18c5328880bb
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


message size, was Re: merge struct ib_device_attr into struct ib_device

2015-09-22 Thread 'Christoph Hellwig'
On Tue, Sep 22, 2015 at 04:06:17PM -0500, Steve Wise wrote:
> Can you create a series of smaller patches that will fit on the list?
> That would make it easier for everyone to review/comment. 

I don't see how that is possible, as it's a flag day change.

But maybe we really need to bump up the message size limits for linux-rdma,
I'm pretty sure linux-kernel and linux-scsi handle attachments of this
size just fine.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


merge struct ib_device_attr into struct ib_device

2015-09-21 Thread Christoph Hellwig
This patch gets rid of struct ib_device_attr and cleans up drivers nicely.

It goes on top of my send_wr cleanups and the memory registration udpates
from Sagi.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 00/24] New fast registration API

2015-09-19 Thread Christoph Hellwig
Hi Sagi,

I've converted the driver I'm developing to your API and it works
great.  I think this is an important step towards making the RDMA
more usable!
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH rdma-next 02/32] IB/core: Add SEND_LAST_INV and SEND_ONLY_INV opcodes

2015-09-16 Thread Christoph Hellwig
On Wed, Sep 16, 2015 at 04:42:36PM +0300, Kamal Heib wrote:
> Intorduce Add SEND_LAST_INV and SEND_ONLY_INV opcodes in ib_pack.h to be
> used by RXE for RC.

Why does RXE need new public opcodes?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] IB: remove xrc_remote_srq_num from struct ib_send_wr

2015-09-13 Thread Christoph Hellwig
The field is only initialized in mlx, but never used.

If we want to add proper XRC support it should be done with a new
struct ib_xrc_wr.

This shrinks the various WR structures by another 4 bytes.

Signed-off-by: Christoph Hellwig <h...@lst.de>
Reviewed-by: Sagi Grimberg <sa...@mellanox.com>
Reviewed-by: Jason Gunthorpe <jguntho...@obsidianresearch.com>
---
 drivers/infiniband/hw/mlx5/qp.c | 1 -
 include/rdma/ib_verbs.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 9941c4a..65696e1 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2629,7 +2629,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
switch (ibqp->qp_type) {
case IB_QPT_XRC_INI:
xrc = seg;
-   xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num);
seg += sizeof(*xrc);
size += sizeof(*xrc) / 16;
/* fall through */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 25f022c..edf0290 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1100,7 +1100,6 @@ struct ib_send_wr {
__be32  imm_data;
u32 invalidate_rkey;
} ex;
-   u32 xrc_remote_srq_num; /* XRC TGT QPs only */
 };
 
 struct ib_rdma_wr {
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RDMA/CM and multiple QPs

2015-09-10 Thread Christoph Hellwig
On Thu, Sep 10, 2015 at 12:50:33PM +0300, Sagi Grimberg wrote:
> >What I'm more interested in is a way to tell the CM that I only
> >want routes that are using this ib_device that I got from the first
> >lookup as all others are useless for me.
> >
> 
> I'm not sure I understand what you are aiming for? if you connect to
> a single address multiple times you will get the same device because
> it is the same route right?

In testing I do get the same all the time, but I don't see anything that
gurantees that in code or documentation.

Think about the case where the routing changes between the calls, or
we're using multipath TCP for example.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RDMA/CM and multiple QPs

2015-09-08 Thread Christoph Hellwig
On Tue, Sep 08, 2015 at 03:32:11PM +0300, Sagi Grimberg wrote:
> The CM is responsible of establishing an RDMA channel. What you are
> referring to is a concept of a session. I'm not entirely sure how we can
> fit a model where the CM establishes a multi-channel session as the
> CM request contains a (single) source QPN. So there is a 1-1
> relationship between a cm_id and a queue-pair. The device handle depends
> on the address resolution to the end-node.
> 
> I assume we can think of some form of an rdma_session which will manage
> multiple cm_id's (that belongs to a single address resolution), call
> the ULP to allocate their corresponding queue-pairs and send a connect
> request for each one. Such an rdma_session can verify the same ib_device
> handle on all the cm_id's. But I'm not sure how such a concept would
> impact on aspects such as event handling etc...

What I'm more interested in is a way to tell the CM that I only
want routes that are using this ib_device that I got from the first
lookup as all others are useless for me.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RDMA/CM and multiple QPs

2015-09-06 Thread Christoph Hellwig
On Sun, Sep 06, 2015 at 01:24:52PM +0530, Parav Pandit wrote:
> Yes. SRP is good example. The point I am trying to make is, SRP
> implements failover and request spreading where one QP fails it
> delivers to other QP.

But SRP doesn't implement that.  There are no fail over capabilities
in a single SRP session even with multiple QPs, and the spreading
is implemented by a higher layer, namely blk-mq, which is common code
for all block drivers.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RDMA/CM and multiple QPs

2015-09-06 Thread Christoph Hellwig
Hi All,

right now RDMA/CM works on a QP basis, but seems very awakward if you
want multiple QPs as part of a single logical device, which will be
useful for a lot of modern protocols.  For example we will need to check
in the CM handler that we're not getting a different ib_device if we
want to apply the device limit in any sort of global scope, and it's
generally very hard to get a struct ib_device that can be used as
a driver model parent.

Is there any interest in trying to add an API to the CM to do a single
address resolution and allocate multiple QPs with these checks in
place?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: RDMA/CM and multiple QPs

2015-09-06 Thread Christoph Hellwig
On Sun, Sep 06, 2015 at 01:12:56PM +0530, Parav Pandit wrote:
> Hi Christoph,
> 
> Establishing multiple QP is just one part of it.
> Bigger challenge is how do we distribute the work request among
> multiple QPs

For my case I simply rely on the blk-mq layer to have cpu-local queues,
so that's a somewhat solved issue as long as you are fine with the
usage model.  If your usage is skewed heavily towards certain CPUs
it might be a little suboptimal.

Note that the SRP driver already in tree is a good example for this,
although it doesn't use RDMA/CM and thus already operates on a
per-ib_device level.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: shrink struct ib_send_wr V3

2015-08-31 Thread Christoph Hellwig
On Sun, Aug 30, 2015 at 06:31:35PM +0300, Sagi Grimberg wrote:
>>   - patch 2 now explicitly replaces the weird overloading in the mlx5
>> driver with an explicit embedding of struct ib_send_wr, similar
>> to what we do for all other MRs.
>
> That's nice,
>
> There is one non-trivial spot that was missed in mlx5_ib_post_send
> though:

Oh, that was a weird abuse of the old casts.

I've foled both your fixes and force pushed to the wr-cleanup branch.

I do not plan to resend the series until the merge window for 4.4
is open.  Doug, any chance you could pick up the first patch in the
series for 4.3-rc?  It's marked for stable as well.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


shrink struct ib_send_wr V3

2015-08-26 Thread Christoph Hellwig
This series shrinks the WR size by splitting out the different WR
types.

Patch number two is too large for the mailinglist, so if you didn't
get it grab it here:


http://git.infradead.org/users/hch/rdma.git/commitdiff/2b63f958de7bd630aba85caf65986831d4372869

or the full git tree at:

git://git.infradead.org/users/hch/rdma.git wr-cleanup

Changes since V2:

 - fixed patch one to accept SEND - note that this was alredy fixed by
   patch 2
 - added a CC: stable for patch 1
 - added additional review tags

Changes since the version sent around a couple of times:

 - new patch 1 which rejects invalid opcodes.  Patch 2 was doing
   this implicitly except for UD QPs, but I think we should a)
   do this explicitly and b) ensure this goes into 4.2 and -stable
   as I can see quite a lot of harm from submitting such malformed
   operations
 - patch 2 now covers all drivers including those in staging to
   side step any sort of discussions on the staging tree.
 - patch 2 now explicitly replaces the weird overloading in the mlx5
   driver with an explicit embedding of struct ib_send_wr, similar
   to what we do for all other MRs.
 - new patch to drop another unused send_wr field.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] IB: remove xrc_remote_srq_num from struct ib_send_wr

2015-08-26 Thread Christoph Hellwig
The field is only initialized in mlx, but never used.

If we want to add proper XRC support it should be done with a new
struct ib_xrc_wr.

This shrinks the various WR structures by another 4 bytes.

Signed-off-by: Christoph Hellwig h...@lst.de
Reviewed-by: Sagi Grimberg sa...@mellanox.com
Reviewed-by: Jason Gunthorpe jguntho...@obsidianresearch.com
---
 drivers/infiniband/hw/mlx5/qp.c | 1 -
 include/rdma/ib_verbs.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 04df156..83a290f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2634,7 +2634,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
switch (ibqp-qp_type) {
case IB_QPT_XRC_INI:
xrc = seg;
-   xrc-xrc_srqn = htonl(wr-xrc_remote_srq_num);
seg += sizeof(*xrc);
size += sizeof(*xrc) / 16;
/* fall through */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9b29c78..b855189 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1100,7 +1100,6 @@ struct ib_send_wr {
__be32  imm_data;
u32 invalidate_rkey;
} ex;
-   u32 xrc_remote_srq_num; /* XRC TGT QPs only */
 };
 
 struct ib_rdma_wr {
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-26 Thread Christoph Hellwig
We have many WR opcodes that are only supported in kernel space
and/or require optional information to be copied into the WR
structure.  Reject all those not explicitly handled so that we
can't pass invalid information to drivers.

Cc: sta...@vger.kernel.org
Signed-off-by: Christoph Hellwig h...@lst.de
Reviewed-by: Jason Gunthorpe jguntho...@obsidianresearch.com
Reviewed-by: Sagi Grimberg sa...@mellanox.com
---
 drivers/infiniband/core/uverbs_cmd.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index a15318a..be4cb9f 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2372,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next-send_flags = user_wr-send_flags;
 
if (is_ud) {
+   if (next-opcode != IB_WR_SEND 
+   next-opcode != IB_WR_SEND_WITH_IMM) {
+   ret = -EINVAL;
+   goto out_put;
+   }
+
next-wr.ud.ah = idr_read_ah(user_wr-wr.ud.ah,
 file-ucontext);
if (!next-wr.ud.ah) {
@@ -2411,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
user_wr-wr.atomic.compare_add;
next-wr.atomic.swap = user_wr-wr.atomic.swap;
next-wr.atomic.rkey = user_wr-wr.atomic.rkey;
+   case IB_WR_SEND:
break;
default:
-   break;
+   ret = -EINVAL;
+   goto out_put;
}
}
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



ehca driver status?

2015-08-26 Thread Christoph Hellwig
Hi Hoang-Nam, hi Christoph, hi Alexander,

do you know what the current state of the eHCA driver and hardware is?

The driver hasn't seen any targeted updats since 2010, so we wonder if
it's still alive?  It's one of the few drivers not supporting FRWRs,
and it's also really odd in that it has its own dma_map_ops
implementation that seems to pass virtual addresses to the hardware (or
probably rather firmware), getting in the way of at least two currently
pending cleanups.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-25 Thread Christoph Hellwig
On Mon, Aug 24, 2015 at 10:59:21AM +0300, Haggai Eran wrote:
  It looks odd to me as well, but it's not really something I want to
  change in this series.  Note that sparse annoted types like __be32
  aren't really common in userspace, but with a bit of effort they can
  be supported.  We have them and regularly run sparse for xfsprogs for
  example.
 
 I have to try it with libibverbs sometime. It doesn't use uapi yet
 though IIRC - it has its own version of the header files.

Yes, I noticed that.  And the WR opcodes aren't even exported in the
uapi header, and use a shared namespace with the in-kernel only ones.

It's all a giant mess unfortunately.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-24 Thread Christoph Hellwig
On Mon, Aug 24, 2015 at 09:52:14AM +0300, Haggai Eran wrote:
 Okay. Maybe you can just add a case for IB_WR_SEND in this patch to
 avoid hurting bisectability.

I've done this already, just waiting for more feedback before resending:

http://git.infradead.org/users/hch/rdma.git/commitdiff/20f34ca8ecac302984f3a92b9ad29f5f4b41780d

 Looking at the uverbs part in patch 2, I think the changes are okay. I
 noticed there's a (__be32 __force) cast of the immediate data from
 userspace (it was already in the existing code). I wonder, why not
 define the field in the uapi struct as __be32 in the first place?

It looks odd to me as well, but it's not really something I want to
change in this series.  Note that sparse annoted types like __be32
aren't really common in userspace, but with a bit of effort they can
be supported.  We have them and regularly run sparse for xfsprogs for
example.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-22 Thread Christoph Hellwig
On Sat, Aug 22, 2015 at 06:38:47AM +, Haggai Eran wrote:
 It looks like the default case in the non-UD branch is currently used to 
 handle plain IB_WR_SEND operations, so the patch would cause these to return 
 an error.

Indeed.  It's handled fine in patch 2 which splits up the case, but
will be incorrectly rejected with just this patch applied.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH WIP 28/43] IB/core: Introduce new fast registration API

2015-08-21 Thread Christoph Hellwig
On Thu, Aug 20, 2015 at 01:04:13PM -0600, Jason Gunthorpe wrote:
 Trying to decouple the sub resources, ie by separately pooling the
 MR/SQE/etc, is just unnecessary complexity, IMHO.. NFS client already
 had serioues bugs in this area.
 
 So, I turn to the idea that every ULP should work as the above, which
 means when it gets to working on a 'slot' that implies there is an
 actual struct ib_mr resource guaranteed available. This is why I
 suggested using the 'struct ib_mr' to guide the SG construction even if
 the actual HW MR isn't going to be used. The struct ib_mr is tied to
 the slot, so using it has no cost.

How is this going to work for drivers that might consumer multiple
MRs per request like SRP or similar upcoming block drivers?  Unless
you want to allocate a potentially large number of MRs for each
request that scheme doesn't work.

 This forces the ULP to deal with many of the issues. Having a slot
 means guarenteed minimum avaiable MR,SQE,CQE resources. That
 guarenteed minimum avoids the messy API struggle in my prior writings.
 
 .. and maybe the above is even thinking too small, to Christoph's
 earlier musings, I wonder if a slot based middle API could hijack the
 entire SCQ processing and have a per-slot callback scheme
 instead. That kind of intervention is exactly what we'd need to
 trivially hide the FMR difference.

FYI, I have working early patches to do per-WR completion callback,
I'll post them after I get them into a slightly better shape.

As for your grand schemes:  I like some of the idea there, but we
need to get there gradually.  I'd much prefer to finish Sagi's simple
scheme, get my completion work in, add abstractions for RDMA READ and
WRITE scatterlist mapping and build things up slowly.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-20 Thread Christoph Hellwig
On Wed, Aug 19, 2015 at 07:50:23PM +, Hefty, Sean wrote:
  AFAIK, this path is rarely (never?) actually used. I think all the
  drivers we have can post directly from userspace.
 
 I didn't think the ipath or qib drivers post from userspace.

Makes sense with their software IB stack.  Guess the idea to get rid
of this path is dead, would have been too nice..
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Supported uverbs opcodes?

2015-08-19 Thread Christoph Hellwig
What opcodes are supposed to be submitted by users?

Currently we do not define opcodes in the UAPI and kinda rely that
userspace uses the same ones as the kernel.

For thos defines by libibverbs (RDMA_WRITE, RDMA_WRITE_WITH_IMM,
SEND, SEND_WITH_IMM, RDMA_READ, ATOMIC_CMP_AND_SWP and
ATOMIC_FETCH_AND_ADD) this happens to work fine.

Additionally uvers takes care of SEND_WITH_INV (but not READ_WITH_INV)
and copies the right fields into the kernel WR, but it also allows
all other opcodes without taking care of their fields.

My send_wr split that I'm going to repost soon will disallow all
those not handled explicitly as the current situration seems
dangerous.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH WIP 28/43] IB/core: Introduce new fast registration API

2015-08-19 Thread Christoph Hellwig
On Wed, Aug 19, 2015 at 02:56:24PM +0300, Sagi Grimberg wrote:
 So I had a go with moving the DMA mapping into ib_map_mr_sg() and
 it turns out mapping somewhat poorly if the ULP _may_ register memory
 or just send sg_lists (like storage targets over IB/iWARP). So the ULP
 will sometimes use the DMA mapping and sometimes it won't... feels
 kinda off to me...

Yes, it's odd.

 it's much saner to do:
 1. dma_map_sg
 2. register / send-sg-list
 3. unregister (if needed)
 4. dma_unmap_sg
 
 then:
 1. if register - call ib_map_mr_sg (which calls dma_map_sg)
else do dma_map_sg
 2. if registered - call ib_dma_unmap_sg (which calles dma_unmap_sg)
else do dma_unmap_sg
 
 this kinda forces ULP to completely separate these code paths with
 with very little sharing.
 
 Also, at the moment, when ULPs are doing either FRWR or FMRs
 its a pain to get a non-intrusive conversion.
 
 I'm thinking we should keep dma_map_sg out of ib_map_mr_sg, and leave
 it to the ULP like it does today (at least in the first stage...)

Keep it out for now.  I think we need to move the dma mapping into
the RDMA care rather sooner than later, but that must also include
ib_post_send/recv, so it's better done separately.

After having a look at the mess some drivers (ipath,qib,hfi  ehca)
cause with abuse of dma_map_ops I've got an even strong opion on
the whole subject now.  However I think we'll get more things done
if we split them into smaller steps.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


shrink struct ib_send_wr

2015-08-19 Thread Christoph Hellwig
This series shrinks the WR size by splitting out the different WR
types.

Patch number two is too large for the mailinglist, so if you didn't
get it grab it here:


http://git.infradead.org/users/hch/rdma.git/commitdiff/30e522ee6c1d7adb614d7308f09fbfd71c6d3e07

or the full git tree at:

git://git.infradead.org/users/hch/rdma.git wr-cleanup

Changes since the version sent around a couple of times:

 - new patch 1 which rejects invalid opcodes.  Patch 2 was doing
   this implicitly except for UD QPs, but I think we should a)
   do this explicitly and b) ensure this goes into 4.2 and -stable
   as I can see quite a lot of harm from submitting such malformed
   operations
 - patch 2 now covers all drivers including those in staging to
   side step any sort of discussions on the staging tree.
 - patch 2 now explicitly replaces the weird overloading in the mlx5
   driver with an explicit embedding of struct ib_send_wr, similar
   to what we do for all other MRs.
 - new patch to drop another unused send_wr field.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] IB: remove xrc_remote_srq_num from struct ib_send_wr

2015-08-19 Thread Christoph Hellwig
The field is only initialized in mlx, but never used.

If we want to add proper XRC support it should be done with a new
struct ib_xrc_wr.

This shrinks the various WR structures by another 4 bytes.

Signed-off-by: Christoph Hellwig h...@lst.de
---
 drivers/infiniband/hw/mlx5/qp.c | 1 -
 include/rdma/ib_verbs.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 04df156..83a290f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -2634,7 +2634,6 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
switch (ibqp-qp_type) {
case IB_QPT_XRC_INI:
xrc = seg;
-   xrc-xrc_srqn = htonl(wr-xrc_remote_srq_num);
seg += sizeof(*xrc);
size += sizeof(*xrc) / 16;
/* fall through */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9b29c78..b855189 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1100,7 +1100,6 @@ struct ib_send_wr {
__be32  imm_data;
u32 invalidate_rkey;
} ex;
-   u32 xrc_remote_srq_num; /* XRC TGT QPs only */
 };
 
 struct ib_rdma_wr {
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-19 Thread Christoph Hellwig
We have many WR opcodes that are only supported in kernel space
and/or require optional information to be copied into the WR
structure.  Reject all those not explicitly handled so that we
can't pass invalid information to drivers.

Cc: sta...@vger.kernel.org
Signed-off-by: Christoph Hellwig h...@lst.de
---
 drivers/infiniband/core/uverbs_cmd.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index a15318a..f9f3921 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -2372,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next-send_flags = user_wr-send_flags;
 
if (is_ud) {
+   if (next-opcode != IB_WR_SEND 
+   next-opcode != IB_WR_SEND_WITH_IMM) {
+   ret = -EINVAL;
+   goto out_put;
+   }
+
next-wr.ud.ah = idr_read_ah(user_wr-wr.ud.ah,
 file-ucontext);
if (!next-wr.ud.ah) {
@@ -2413,7 +2419,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next-wr.atomic.rkey = user_wr-wr.atomic.rkey;
break;
default:
-   break;
+   ret = -EINVAL;
+   goto out_put;
}
}
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] IB/uverbs: reject invalid or unknown opcodes

2015-08-19 Thread Christoph Hellwig
On Wed, Aug 19, 2015 at 11:46:14AM -0600, Jason Gunthorpe wrote:
 Reviewed-by: Jason Gunthorpe jguntho...@obsidianresearch.com
 
 AFAIK, this path is rarely (never?) actually used. I think all the
 drivers we have can post directly from userspace.

Oh, interesting.  Is there any chance to deprecate it?  Not having
to care for the uvers command would really help with some of the
upcoming changes I have in my mind.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH WIP 28/43] IB/core: Introduce new fast registration API

2015-08-19 Thread Christoph Hellwig
On Wed, Aug 19, 2015 at 07:09:18PM +0300, Sagi Grimberg wrote:
 Ok, I was also thinking on moving the access flags
 to the work request again.

Yes, with the current code I don't think we need it in the MR.

 I'd prefer to get this right with a different helper like Steve
 suggested:
 int rdma_access_flags(int mr_roles);

We can start with that.  In the long run we really want to have
two higher level helpers to RDMA READ a scatterlist:

 - one for iWARP that uses an FR and RDMA READ WITH INVALIDATE
 - one of IB-like transports that just uses a READ with the
   local lkey

Right now every ULP that wants to support iWarp needs to duplicate
that code.  This leads to some curious situations like the NFS
server apparently always using FRs if available for this if my
reading of svc_rdma_accept() is correct, or the weird parallel
code pathes for IB vs iWarp in RDS:

hch@brick:~/work/linux/net/rds$ ls ib*
ib.c  ib_cm.c  ib.h  ib_rdma.c  ib_recv.c  ib_ring.c  ib_send.c
ib_stats.c  ib_sysctl.c
hch@brick:~/work/linux/net/rds$ ls iw*
iw.c  iw_cm.c  iw.h  iw_rdma.c  iw_recv.c  iw_ring.c  iw_send.c
iw_stats.c  iw_sysctl.c

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-17 Thread Christoph Hellwig
On Thu, Aug 13, 2015 at 09:04:39AM -0700, Christoph Hellwig wrote:
   I'm happy to do that if you're fine with the patch in general.  amso1100
   should be trivial anyway, while ipath is a mess, just like the new intel
   driver with the third copy of the soft ib stack.
  
  Correct.
 
 http://git.infradead.org/users/hch/rdma.git/commitdiff/efb2b0f21645b9caabcce955481ab6966e52ad90
 
 contains the updates for ipath and amso1100, as well as the reviewed-by
 and tested-by tags.
 
 Note that for now I've skipped the new intel hfi1 driver as updating
 two of the soft ib codebases already was tiresome enough.

Doug, is this good enough for merging?

http://git.infradead.org/users/hch/rdma.git/shortlog/refs/heads/wr-cleanup

is my always uptodate branch, it has collected all the reviews and
tested-by tag I got on the list.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-13 Thread Christoph Hellwig
On Thu, Aug 13, 2015 at 11:22:34AM -0600, Jason Gunthorpe wrote:
 The uverbs change needs to drop/move the original kmalloc:
 
   next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
  user_wr-num_sge * sizeof (struct ib_sge),
  GFP_KERNEL);
 
 It looks like it is leaking that allocation right now. Every path
 replaces next with the result of alloc_mr..

Thanks.  It should be come and indeed was in my first version.  Not
sure how it sneaked in during a rebase.

 Noticed a couple of trailing whitespaces too..

checkpatch found two of them, which I've fixed now.

New version at:

http://git.infradead.org/users/hch/rdma.git/commitdiff/5d7e6fa563dae32d4b6f63e29e3795717a545f11

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-13 Thread Christoph Hellwig
On Thu, Aug 13, 2015 at 09:07:14AM -0400, Doug Ledford wrote:
  Doug:  was your mail a request to fix up the two de-staged drivers?
  I'm happy to do that if you're fine with the patch in general.  amso1100
  should be trivial anyway, while ipath is a mess, just like the new intel
  driver with the third copy of the soft ib stack.
 
 Correct.

http://git.infradead.org/users/hch/rdma.git/commitdiff/efb2b0f21645b9caabcce955481ab6966e52ad90

contains the updates for ipath and amso1100, as well as the reviewed-by
and tested-by tags.

Note that for now I've skipped the new intel hfi1 driver as updating
two of the soft ib codebases already was tiresome enough.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-13 Thread Christoph Hellwig
On Wed, Aug 12, 2015 at 08:24:49PM +0300, Sagi Grimberg wrote:
 Just a nit that I've noticed, in mlx4 set_fmr_seg params are not
 aligned to the parenthesis (maybe in other locations too but I haven't
 noticed such...)

This is just using a normal two tab indent for continued function
parameters..
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-12 Thread Christoph Hellwig
On Wed, Aug 12, 2015 at 07:24:44PM -0700, Chuck Lever wrote:
 That makes sense, but you already Acked the change that breaks Lustre,
 and it's going in through the NFS tree. Are you changing that to a NAK?

It seems like Doug was mostly concened about to be removed drivers.
I defintively refuse to fix Lustre for anything I tough because it's
such a giant mess with uses just about every major subsystem in
an incorrect way.

Doug:  was your mail a request to fix up the two de-staged drivers?
I'm happy to do that if you're fine with the patch in general.  amso1100
should be trivial anyway, while ipath is a mess, just like the new intel
driver with the third copy of the soft ib stack.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-07 Thread Christoph Hellwig
On Thu, Aug 06, 2015 at 01:58:45PM -0400, Chuck Lever wrote:
 Wondering if this means we'll have to drop ib_reg_phys_mr()
 removal until Lustre gets around to removing their call sites
 from the staging tree.

Why?  Just because the buildbot catches it?

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-07 Thread 'Christoph Hellwig'
On Thu, Aug 06, 2015 at 12:44:42PM -0500, Steve Wise wrote:
  Driver/staging isn't considered in tree for global API change
  perspective, so I didn't bother with all these staging drivers.
 
 The kbuild test bot will probably catch this. 

It already did catch it for my tree, which is expected.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-07 Thread Christoph Hellwig
On Thu, Aug 06, 2015 at 11:08:45PM +0530, Parav Pandit wrote:
 Do you see value in dividing ib_ud _wr into ib_ud_wr and ib_ud_gsi_wr
 to save 4 bytes?

For now I just wanted to split along the lines of the existing unions.
From looking at the various drivers splitting the GSI path might not be
a bad idea, but it's not a priority for me.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-07 Thread Christoph Hellwig
On Thu, Aug 06, 2015 at 07:46:44PM +0300, Sagi Grimberg wrote:
 I agree that this is a shame to keep in here for everyone to carry...
 The only driver I've seen supporting XRC is mlx5 with no consumers.
 
 If people are reluctant to remove it, you can put it in ib_xrc_send_wr
 or something...

If'll send a patch to remove it, and then we can introduce
ib_xrc_send_wr once users show up.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-4.3 11/15] iw_cxgb4: Support ib_alloc_mr verb

2015-08-07 Thread Christoph Hellwig
On Fri, Aug 07, 2015 at 10:06:26AM -0500, Steve Wise wrote:
 If it is too much of a pain to alter this patch, then I'll just
 submit the NFSRDMA fix and live with the bisect issue...

Doug's tree is still to be rebased.  So please submit your NFS
fix now as ask Doug to merge it before Sagi's series in the final
tree.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-4.3 11/15] iw_cxgb4: Support ib_alloc_mr verb

2015-08-07 Thread 'Christoph Hellwig'
On Fri, Aug 07, 2015 at 11:29:12AM -0500, Steve Wise wrote:
 I misspoke.  I had the order reversed. The order is such that we can add my 
 new NFS patch after:
 
 e20684a xprtrdma, svcrdma: Convert to ib_alloc_mr
 
 and before these:
 
 af78181 cxgb3: Support ib_alloc_mr verb
 b7e06cd iw_cxgb4: Support ib_alloc_mr verb

In general it would be preferable to have it before the ib_alloc_mr
conversion, but if it causes more work I doubt it's really worth it.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH for-4.3 11/15] iw_cxgb4: Support ib_alloc_mr verb

2015-08-07 Thread 'Christoph Hellwig'
On Fri, Aug 07, 2015 at 11:19:59AM -0500, Steve Wise wrote:
 I guess I'll post two patches, the NFS fix that preceeds af78181/ b7e06cd, 
 and a reworked patch to replace e20684a.
 
 Is that the way to go in your opinion?

To me this sounds good.  We have a couple patches from Jason's series
that already need to be replaced, so the tree will need a rebase anyway,
so I don't see a problem with replacing ones in Sagi's series either.
If Sagi needs to do a repost for some reason he can just include your
NFS patch in front of the series.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-07 Thread Christoph Hellwig
On Fri, Aug 07, 2015 at 10:17:18AM -0400, Chuck Lever wrote:
 If bot barking doesn't bother anyone, then I'll keep the removal patch.
 For some such a complaint might be grounds for rejecting the patch.

If it's (a) in tree proper and (b) not one of the rare false positives I
would consider it a reason for rejection as well.  But this is the
staging tree we're talking about.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-06 Thread Christoph Hellwig
On Wed, Aug 05, 2015 at 10:40:08PM -0600, Jason Gunthorpe wrote:
 Any numbers on the struct size reduction?

sizeof(struct ib_send_wr) (old): 96

sizeof(struct ib_send_wr): 48
sizeof(struct ib_rdma_wr): 64
sizeof(struct ib_atomic_wr): 96
sizeof(struct ib_ud_wr): 88
sizeof(struct ib_fast_reg_wr): 88
sizeof(struct ib_bind_mw_wr): 96
sizeof(struct ib_sig_handover_wr): 80

sizeof(struct ib_fastreg_wr) (Sagi): 64

So the commonly used send and rdma WRs are drastically redueces.  FR
currently doesn't look much better, but the replacement WR for it in
Sagi's series will be in a simila ballpark.

Note that we can remove an additional 4 bytes from each of them by
removing the unused xrc_remote_srq_num value from struct ib_send_wr.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-06 Thread Christoph Hellwig
On Thu, Aug 06, 2015 at 12:04:32PM -0500, Steve Wise wrote:
 You missed amso1100 (and probably ipath) that have been moved to
 drivers/staging...

Driver/staging isn't considered in tree for global API change
perspective, so I didn't bother with all these staging drivers.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-06 Thread Christoph Hellwig
I've pushed out a new version.  Updates:

 - the ib_recv_wr change Bart notices has been fixed.
 - iser and isert have been converted
 - the handling of the embedded WR in the qib software queue entry
   has been fixed.

Which means we're basically done now and the patch could use
broader testing.

The full patch will be too much for the list again, so here is the
git commit:

http://git.infradead.org/users/hch/scsi.git/commitdiff/a0027ed00fc3ae2686d8a843a724b50597115a71

ib_vers.h diff below:

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0940051..2f2efdd 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1100,54 +1100,94 @@ struct ib_send_wr {
__be32  imm_data;
u32 invalidate_rkey;
} ex;
-   union {
-   struct {
-   u64 remote_addr;
-   u32 rkey;
-   } rdma;
-   struct {
-   u64 remote_addr;
-   u64 compare_add;
-   u64 swap;
-   u64 compare_add_mask;
-   u64 swap_mask;
-   u32 rkey;
-   } atomic;
-   struct {
-   struct ib_ah *ah;
-   void   *header;
-   int hlen;
-   int mss;
-   u32 remote_qpn;
-   u32 remote_qkey;
-   u16 pkey_index; /* valid for GSI only */
-   u8  port_num;   /* valid for DR SMPs on switch only 
*/
-   } ud;
-   struct {
-   u64 iova_start;
-   struct ib_fast_reg_page_list   *page_list;
-   unsigned intpage_shift;
-   unsigned intpage_list_len;
-   u32 length;
-   int access_flags;
-   u32 rkey;
-   } fast_reg;
-   struct {
-   struct ib_mw*mw;
-   /* The new rkey for the memory window. */
-   u32  rkey;
-   struct ib_mw_bind_info   bind_info;
-   } bind_mw;
-   struct {
-   struct ib_sig_attrs*sig_attrs;
-   struct ib_mr   *sig_mr;
-   int access_flags;
-   struct ib_sge  *prot;
-   } sig_handover;
-   } wr;
u32 xrc_remote_srq_num; /* XRC TGT QPs only */
 };
 
+struct ib_rdma_wr {
+   struct ib_send_wr   wr;
+   u64 remote_addr;
+   u32 rkey;
+};
+
+static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_rdma_wr, wr);
+}
+
+struct ib_atomic_wr {
+   struct ib_send_wr   wr;
+   u64 remote_addr;
+   u64 compare_add;
+   u64 swap;
+   u64 compare_add_mask;
+   u64 swap_mask;
+   u32 rkey;
+};
+
+static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_atomic_wr, wr);
+}
+
+struct ib_ud_wr {
+   struct ib_send_wr   wr;
+   struct ib_ah*ah;
+   void*header;
+   int hlen;
+   int mss;
+   u32 remote_qpn;
+   u32 remote_qkey;
+   u16 pkey_index; /* valid for GSI only */
+   u8  port_num;   /* valid for DR SMPs on switch only 
*/
+};
+
+static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_ud_wr, wr);
+}
+
+struct ib_fast_reg_wr {
+   struct ib_send_wr   wr;
+   u64 iova_start;
+   struct ib_fast_reg_page_list *page_list;
+   unsigned intpage_shift;
+   unsigned intpage_list_len;
+   u32 length;
+   int access_flags;
+   u32 rkey;
+};
+
+static inline struct ib_fast_reg_wr *fast_reg_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_fast_reg_wr, wr);
+}
+
+struct ib_bind_mw_wr {
+   struct ib_send_wr   wr;
+   struct ib_mw*mw;
+   /* The new rkey for the memory window. */
+   u32 rkey;
+   struct ib_mw_bind_info  bind_info;
+};
+
+static inline struct 

[RFC] split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
Hi all,

please take a look at my RFC patch here:


http://git.infradead.org/users/hch/scsi.git/commitdiff/751774250b71da83a26ba8584cff70f5e7bb7b1e

the commit contains my explanation, but apparently the patch is too
large for the list limit and didn't make it through.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 04:07:42PM +, Hefty, Sean wrote:
 This looks like a reasonable start.  It may help with feedback if you
 could just post the changes to ib_verbs.h.

Not sure it's all that useful, but here we go:


diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0940051..666b571 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -1100,55 +1100,96 @@ struct ib_send_wr {
__be32  imm_data;
u32 invalidate_rkey;
} ex;
-   union {
-   struct {
-   u64 remote_addr;
-   u32 rkey;
-   } rdma;
-   struct {
-   u64 remote_addr;
-   u64 compare_add;
-   u64 swap;
-   u64 compare_add_mask;
-   u64 swap_mask;
-   u32 rkey;
-   } atomic;
-   struct {
-   struct ib_ah *ah;
-   void   *header;
-   int hlen;
-   int mss;
-   u32 remote_qpn;
-   u32 remote_qkey;
-   u16 pkey_index; /* valid for GSI only */
-   u8  port_num;   /* valid for DR SMPs on switch only 
*/
-   } ud;
-   struct {
-   u64 iova_start;
-   struct ib_fast_reg_page_list   *page_list;
-   unsigned intpage_shift;
-   unsigned intpage_list_len;
-   u32 length;
-   int access_flags;
-   u32 rkey;
-   } fast_reg;
-   struct {
-   struct ib_mw*mw;
-   /* The new rkey for the memory window. */
-   u32  rkey;
-   struct ib_mw_bind_info   bind_info;
-   } bind_mw;
-   struct {
-   struct ib_sig_attrs*sig_attrs;
-   struct ib_mr   *sig_mr;
-   int access_flags;
-   struct ib_sge  *prot;
-   } sig_handover;
-   } wr;
u32 xrc_remote_srq_num; /* XRC TGT QPs only */
 };
 
+struct ib_rdma_wr {
+   struct ib_send_wr   wr;
+   u64 remote_addr;
+   u32 rkey;
+};
+
+static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_rdma_wr, wr);
+}
+
+struct ib_atomic_wr {
+   struct ib_send_wr   wr;
+   u64 remote_addr;
+   u64 compare_add;
+   u64 swap;
+   u64 compare_add_mask;
+   u64 swap_mask;
+   u32 rkey;
+};
+
+static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_atomic_wr, wr);
+}
+
+struct ib_ud_wr {
+   struct ib_send_wr   wr;
+   struct ib_ah*ah;
+   void*header;
+   int hlen;
+   int mss;
+   u32 remote_qpn;
+   u32 remote_qkey;
+   u16 pkey_index; /* valid for GSI only */
+   u8  port_num;   /* valid for DR SMPs on switch only 
*/
+};
+
+static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_ud_wr, wr);
+}
+
+struct ib_fast_reg_wr {
+   struct ib_send_wr   wr;
+   u64 iova_start;
+   struct ib_fast_reg_page_list *page_list;
+   unsigned intpage_shift;
+   unsigned intpage_list_len;
+   u32 length;
+   int access_flags;
+   u32 rkey;
+};
+
+static inline struct ib_fast_reg_wr *fast_reg_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_fast_reg_wr, wr);
+}
+
+struct ib_bind_mw_wr {
+   struct ib_send_wr   wr;
+   struct ib_mw*mw;
+   /* The new rkey for the memory window. */
+   u32 rkey;
+   struct ib_mw_bind_info  bind_info;
+};
+
+static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr)
+{
+   return container_of(wr, struct ib_bind_mw_wr, wr);
+}
+
+struct ib_sig_handover_wr {
+   struct ib_send_wr   wr;
+   struct ib_sig_attrs*sig_attrs;
+   struct ib_mr   *sig_mr;
+   int   

Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 08:06:16PM +0300, Sagi Grimberg wrote:
 Question though, a ULP may want to keep a couple of WRs around instead
 of having each allocated in the stack and handled one by one. We need
 to provide it with a hint of what is the size it needs.

Note that with the drastic shrink of the struct size the typical case
of needing a fast_reg + one send or a few rdma wr should be just fine
on stack now.  Even more so with your registration cleanups which
will drastically shrink the size of the fast reg mrs again.

 I just posted a patch to do that in iser 
 (http://www.spinics.net/lists/linux-rdma/msg27632.html).

That's actually part of the reason why I didn't manage to convert iser
as it seems so convoluted..

 So if I would want to preallocate an array of work requests, what is the
 size of the space I'd need?
 is it some form of max(sizeof(struct ib_send_wr),
sizeof(struct ib_fastreg_wr),
sizeof(struct sig_handover), ..)?

Preferably you'd preallocate them in a type safe manner, if you really
need to overlay them do it as a proper union.  For example the rds
code is doing that although it only preallocates on per work item.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH, RFC] rdma: split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 08:44:26PM +0300, Sagi Grimberg wrote:
 I do agree that the size on the stack is less of an issue now. What
 still can matter is handling each wr one by one vs. doing a collective
 post.

But if structured correctly you can still do that with on-stack WRs.

 I can understand that. In the last patches I tried to simplify the flow
 (I noticed that these aren't in your tree correct?). There always room
 for cleanups... I'm trying to have a gradual progress towards getting it
 to be simpler.

I used Doug's to be rebased for 4.3 tree from yesterday or so.  So
whatever he merged is in.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] split struct ib_send_wr

2015-08-04 Thread Christoph Hellwig
On Tue, Aug 04, 2015 at 09:36:49AM -0700, Bart Van Assche wrote:
 diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
  [ ... ]
   struct ib_recv_wr {
 +struct ib_send_wr   wr;
  struct ib_recv_wr  *next;
  u64 wr_id;
  struct ib_sge  *sg_list;
 
 Hello Christoph,
 
 This part of the patch surprised me ?

It should.  It's a stupid cut  paste error, the new member isn't used
at all.  I'll fix it up.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/12] IB: Replace safe uses for ib_get_dma_mr with pd-local_dma_lkey

2015-08-03 Thread Christoph Hellwig
On Fri, Jul 31, 2015 at 03:20:40PM -0700, Bart Van Assche wrote:
 On 07/30/2015 04:22 PM, Jason Gunthorpe wrote:
 All patches are compile tested. I've done basic testing up to and including
 the IPoIB patch, the rest required specialized setups I don't have access to,
 but are fairly straightforward.
 
 Hello Jason,
 
 SRP login fails with this patch series applied on top of Linux kernel
 v4.2-rc4. At the target side the following message appears every time the
 SRP initiator tries to log in: ib_srpt: RDMA t 5 for idx 0 failed with
 status 10 (10=remote access error). That causes the initiator to receive a
 flush error (5).

Could this be caused because srp_map_sg_entry falls back to the phys
mapping for unaligned large requests in line 1389 in Dougs tree:

if ((!dev-use_fast_reg  dma_addr  ~dev-mr_page_mask) ||
dma_len  dev-mr_max_size) {
ret = srp_finish_mapping(state, ch);
if (ret)
return ret;

srp_map_desc(state, dma_addr, dma_len, target-rkey);
srp_map_update_start(state, NULL, 0, 0);
return 0;
}

Bart, do you know what hardware this workaround is for?

Also the SRP driver still falls back to phys registrations if the
better MR methods fail, something that will stop working with
Jasons patch.  It's something that looks a little questionable
and which other ULDs don't do either, so it would be good time
to review this practice.

Additionally the SRP_DATA_DESC_INDIRECT case always uses the global
rkey, so whenever we hits this things are going to break.

Jason, I suspect it might be a good idea to rename the rkey field
to something like global_rkey, and make sure it's guarded by a proper
conditional shared by all users.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 00/50] Add OPA gen1 driver

2015-08-03 Thread Christoph Hellwig
On Sat, Aug 01, 2015 at 04:34:31PM -0400, Doug Ledford wrote:
 Or, I haven't looked at the soft-roce driver (ever).  Is it going to
 need this library as well?

ROCE implements the IB protocol, so a software ROCE driver will need
a IB protocol implementation sitting ontop of ethernet frames (v1)
or the internet protocol stack (v2).

 If it is, then as you have rung the bell for
 getting this library written, I will expect Mellanox to work with Intel
 to make sure that this library is suitable for not just their hardware
 drivers but also the soft-roce driver you guys are working on.  I might
 even suggest that it's time to work up the soft-roce submission sooner
 rather than later in light of what I've done with the hfi1 driver.

I think burdering this on Intel is a bit unfair.  If they can come
up with a library that jsut fits their use case without being bad in
other ways the burden to adopt it to ROCE should fall on whoever wants
to submit that code.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


<    1   2   3   4   5   >