from:"Or Gerlitz"

From: Matan Barak 

ib_uverbs_ex_create_cq follows the extension verbs
mechanism. New features (for example, CQ creation flags
field which is added in a downstream patch) could be used
via user-space libraries without breaking the ABI.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |  170 ++---
 drivers/infiniband/core/uverbs_main.c |1 +
 include/uapi/rdma/ib_user_verbs.h |   17 
 4 files changed, 154 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b716b08..ba365b6 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -259,5 +259,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 IB_UVERBS_DECLARE_EX_CMD(create_flow);
 IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
 IB_UVERBS_DECLARE_EX_CMD(query_device);
+IB_UVERBS_DECLARE_EX_CMD(create_cq);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 1954ebb..51311b1 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1330,41 +1330,37 @@ ssize_t ib_uverbs_create_comp_channel(struct 
ib_uverbs_file *file,
return in_len;
 }
 
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
-   const char __user *buf, int in_len,
-   int out_len)
+static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+  struct ib_udata *ucore,
+  struct ib_udata *uhw,
+  struct ib_uverbs_ex_create_cq *cmd,
+  size_t cmd_sz,
+  int (*cb)(struct ib_uverbs_file *file,
+struct ib_ucq_object *obj,
+struct 
ib_uverbs_ex_create_cq_resp *resp,
+struct ib_udata *udata,
+void *context),
+  void *context)
 {
-   struct ib_uverbs_create_cq  cmd;
-   struct ib_uverbs_create_cq_resp resp;
-   struct ib_udata udata;
struct ib_ucq_object   *obj;
struct ib_uverbs_event_file*ev_file = NULL;
struct ib_cq   *cq;
int ret;
+   struct ib_uverbs_ex_create_cq_resp resp;
struct ib_cq_init_attr attr = {};
 
-   if (out_len < sizeof resp)
-   return -ENOSPC;
-
-   if (copy_from_user(&cmd, buf, sizeof cmd))
-   return -EFAULT;
-
-   INIT_UDATA(&udata, buf + sizeof cmd,
-  (unsigned long) cmd.response + sizeof resp,
-  in_len - sizeof cmd, out_len - sizeof resp);
-
-   if (cmd.comp_vector >= file->device->num_comp_vectors)
-   return -EINVAL;
+   if (cmd->comp_vector >= file->device->num_comp_vectors)
+   return ERR_PTR(-EINVAL);
 
obj = kmalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
-   return -ENOMEM;
+   return ERR_PTR(-ENOMEM);
 
-   init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, 
&cq_lock_class);
+   init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, 
&cq_lock_class);
down_write(&obj->uobject.mutex);
 
-   if (cmd.comp_channel >= 0) {
-   ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+   if (cmd->comp_channel >= 0) {
+   ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel);
if (!ev_file) {
ret = -EINVAL;
goto err;
@@ -1377,10 +1373,14 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
 
-   attr.cqe = cmd.cqe;
-   attr.comp_vector = cmd.comp_vector;
+   attr.cqe = cmd->cqe;
+   attr.comp_vector = cmd->comp_vector;
+
+   if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
+   attr.flags = cmd->flags;
+
cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
-file->ucontext, &udata);
+file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
@@ -1399,14 +1399,15 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
goto err_free;
 
memset(&resp, 0, sizeof resp);
-   resp.cq_handle = obj->uobject.id;
-   resp.cqe   = cq-&g

[PATCH for-next V1 08/11] IB/mlx4: Support extended create_cq and query_device uverbs

From: Matan Barak 

Add support for ib_uverbs_ex_create_cq and ib_uverbs_ex_query_device
by setting the appropriate bit in uverbs_ex_cmd_mask.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index d87401e..ef211c8 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2302,6 +2302,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
}
 
+   ibdev->ib_dev.uverbs_ex_cmd_mask |=
+   (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) |
+   (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ);
+
mlx4_ib_alloc_eqs(dev, ibdev);
 
spin_lock_init(&iboe->lock);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V1 00/11] Add completion timestamping support

Hi Doug,

This patchset adds completion timestamping support for verbs consumers.

Timestamping is used by applications in order to know when a WQE was
received/transmitted by the HW. The value is given in HCA hardware cycles,
but can easily be converted, as the hardware's core clock frequency is
available through an extension of query device.

Moreover, we add an ability to read the HCA's current clock. This could be
useful in order to synchronize events to the wall clock.

This functionality is achieved by adding/extending the following verbs:

create_cq - create_cq is extended in order to allow passing creation flags
to the CQ creation function. We change IB/core --> vendors API
to be easily extendible by passing a struct which contains
comp_vectors, cqe and the new flags parameter. In order to create
CQ which supports timestamping, IB_CQ_FLAGS_TIMESTAMP should be given.

query_device - We extend query_device uverb further by giving the hardware's
clock frequency and the timestamp mask (the number of timestamp
bits which are supported). If timestamp isn't supported, 0 is returned.

In order to read the timestamp in the WQE, the user needs to query the device 
for support, create an appropriate CQ (using the extended uverb with
IB_CQ_FLAGS_TIMESTAMP) and poll the CQ with an extended poll_cq verb (currently,
only implemented in user-space).

In mlx4, allowing the user to read the core clock efficiently involves mapping
this area of the hardware to user-space (being done by using a mmap command)
and reading the clock from the correct offset of the page. 

This offset is returned in the vendor's specific data from mlx4's kernel driver 
to the mlx4's user-space driver. query_device is modified in order to support
passing this vendor specific data. A user-space application could use a new
verb in order to read the hardware's clock.

Translating the hardware's clock into ms could be done by dividing this
value by hca_core_clock (which is returned by the extended version of
query_device uverb).

A user-space application could get the current HW's clock by executing

ibv_query_values_ex(struct ibv_context *context, uint32_t q_values,
struct ibv_values_ex *values)

The function gets a mask of the values to query and returns their values.
Vendors could either implement this as a uverb command or use their 
user-space driver to return those values directly from the HW (the mlx4 way).

Changes from V0:

(1) Pass ib_cq_init_attr instead of cqe and comp_vector.
(2) Fix unneeded indentation.
(3) Change flags to u32.
(4) Add const to create_cq's ib_cq_init_attr argument in vendor implementation.

Matan and Or.

Matan Barak (11):
  IB/core: Change provider's API of create_cq to be extendible
  IB/core: Change ib_create_cq to use struct ib_cq_init_attr
  IB/core: Add CQ creation time-stamping flag
  IB/core: Extend ib_uverbs_create_cq
  IB/core: Add timestamp_mask and hca_core_clock to query_device
  IB/core: Pass hardware specific data in query_device
  IB/mlx4: Add mmap call to map the hardware clock
  IB/mlx4: Support extended create_cq and query_device uverbs
  IB/mlx4: Add support for timestamp in cq creation
  IB/mlx4: Add timestamp_mask and hca_core_clock to query_device
  IB/mlx4: Return hca core clock's offset in query_device vendor's data

 drivers/infiniband/core/device.c |6 +-
 drivers/infiniband/core/mad.c|5 +-
 drivers/infiniband/core/uverbs.h |1 +
 drivers/infiniband/core/uverbs_cmd.c |  188 +-
 drivers/infiniband/core/uverbs_main.c|1 +
 drivers/infiniband/core/verbs.c  |4 +-
 drivers/infiniband/hw/amso1100/c2_provider.c |   14 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |   19 ++-
 drivers/infiniband/hw/cxgb4/cq.c |9 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h   |8 +-
 drivers/infiniband/hw/cxgb4/provider.c   |8 +-
 drivers/infiniband/hw/ehca/ehca_cq.c |7 +-
 drivers/infiniband/hw/ehca/ehca_hca.c|6 +-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |6 +-
 drivers/infiniband/hw/ehca/ehca_main.c   |6 +-
 drivers/infiniband/hw/ipath/ipath_cq.c   |9 +-
 drivers/infiniband/hw/ipath/ipath_verbs.c|7 +-
 drivers/infiniband/hw/ipath/ipath_verbs.h|3 +-
 drivers/infiniband/hw/mlx4/cq.c  |   13 ++-
 drivers/infiniband/hw/mlx4/mad.c |5 +-
 drivers/infiniband/hw/mlx4/main.c|   67 +-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   19 +++-
 drivers/infiniband/hw/mlx5/cq.c  |   10 +-
 drivers/infiniband/hw/mlx5/main.c|   19 ++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |5 +-
 drivers/infiniband/hw/mthca/mthca_provider.c |   15 ++-
 drivers/infiniband/hw/nes/nes_verbs.c|   17 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |   13 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |9 +-
 drivers/infiniband

[PATCH for-next V1 01/11] IB/core: Change provider's API of create_cq to be extendible

From: Matan Barak 

Add a new ib_cq_init_attr structure which contains the
previous cqe (minimum number of CQ entries) and comp_vector
(completion vector) in addition to a new flags field.
All vendors' create_cq callbacks are changed in order
to work with the new API.

This commit does not change any functionality.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/uverbs_cmd.c |6 --
 drivers/infiniband/core/verbs.c  |3 ++-
 drivers/infiniband/hw/amso1100/c2_provider.c |7 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |   11 ---
 drivers/infiniband/hw/cxgb4/cq.c |9 +++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h   |8 
 drivers/infiniband/hw/ehca/ehca_cq.c |7 ++-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |3 ++-
 drivers/infiniband/hw/ipath/ipath_cq.c   |9 +++--
 drivers/infiniband/hw/ipath/ipath_verbs.h|3 ++-
 drivers/infiniband/hw/mlx4/cq.c  |8 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |3 ++-
 drivers/infiniband/hw/mlx5/cq.c  |   10 --
 drivers/infiniband/hw/mlx5/main.c|3 ++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |5 +++--
 drivers/infiniband/hw/mthca/mthca_provider.c |8 ++--
 drivers/infiniband/hw/nes/nes_verbs.c|   11 ---
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |7 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |6 --
 drivers/infiniband/hw/qib/qib_cq.c   |   11 ---
 drivers/infiniband/hw/qib/qib_verbs.h|5 +++--
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |   10 +++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |7 ---
 include/rdma/ib_verbs.h  |   10 --
 24 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index a9f0489..1954ebb 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1341,6 +1341,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
struct ib_uverbs_event_file*ev_file = NULL;
struct ib_cq   *cq;
int ret;
+   struct ib_cq_init_attr attr = {};
 
if (out_len < sizeof resp)
return -ENOSPC;
@@ -1376,8 +1377,9 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
 
-   cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
-cmd.comp_vector,
+   attr.cqe = cmd.cqe;
+   attr.comp_vector = cmd.comp_vector;
+   cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
 file->ucontext, &udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f93eb8d..0c49163 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1015,8 +1015,9 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
   void *cq_context, int cqe, int comp_vector)
 {
struct ib_cq *cq;
+   struct ib_cq_init_attr attr = {.cqe = cqe, .comp_vector = comp_vector};
 
-   cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+   cq = device->create_cq(device, &attr, NULL, NULL);
 
if (!IS_ERR(cq)) {
cq->device= device;
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c 
b/drivers/infiniband/hw/amso1100/c2_provider.c
index bdf3507..8e53d05 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -286,13 +286,18 @@ static int c2_destroy_qp(struct ib_qp *ib_qp)
return 0;
 }
 
-static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int 
vector,
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
  struct ib_ucontext *context,
  struct ib_udata *udata)
 {
+   int entries = attr->cqe;
struct c2_cq *cq;
int err;
 
+   if (attr->flags)
+   return ERR_PTR(-EINVAL);
+
cq = kmalloc(sizeof(*cq), GFP_KERNEL);
if (!cq) {
pr_debug("%s: Unable to allocate CQ\n", __func__);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 811b24a..eb74c4d 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -138,10 +138,12 @@ static int iwch_destroy_cq(struct ib_cq

[PATCH for-next V1 10/11] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

From: Matan Barak 

mlx4 needs to report the number of supported timestamp
bits (mask) and the hca_core_clock frequency.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index ef211c8..a5a90c4 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -233,6 +233,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
   props->max_mcast_grp;
props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
+   props->hca_core_clock = dev->dev->caps.hca_core_clock;
+   props->timestamp_mask = 0xFFFFFFFFFFFFULL;
 
 out:
kfree(in_mad);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V1 02/11] IB/core: Change ib_create_cq to use struct ib_cq_init_attr

From: Matan Barak 

Currently, ib_create_cq uses cqe and comp_vector instead
of the extendible ib_cq_init_attr struct.

Earlier patches already changed the vendors to work with
ib_cq_init_attr. This patch changes the consumers too.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/mad.c  |5 -
 drivers/infiniband/core/verbs.c|5 ++---
 drivers/infiniband/hw/ehca/ehca_main.c |6 +-
 drivers/infiniband/hw/mlx4/mad.c   |5 -
 drivers/infiniband/hw/mlx4/main.c  |5 -
 drivers/infiniband/hw/mlx5/main.c  |7 +--
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |9 +++--
 drivers/infiniband/ulp/iser/iser_verbs.c   |6 +-
 drivers/infiniband/ulp/isert/ib_isert.c|6 +-
 drivers/infiniband/ulp/srp/ib_srp.c|   10 --
 drivers/infiniband/ulp/srpt/ib_srpt.c  |5 -
 include/rdma/ib_verbs.h|6 ++
 net/9p/trans_rdma.c|5 -
 net/rds/ib_cm.c|8 ++--
 net/rds/iw_cm.c|8 ++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c   |   10 ++
 net/sunrpc/xprtrdma/verbs.c|   10 ++
 17 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 74c30f4..8fbf5d6 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2922,6 +2922,7 @@ static int ib_mad_port_open(struct ib_device *device,
unsigned long flags;
char name[sizeof "ib_mad123"];
int has_smi;
+   struct ib_cq_init_attr cq_attr;
 
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
@@ -2942,9 +2943,11 @@ static int ib_mad_port_open(struct ib_device *device,
if (has_smi)
cq_size *= 2;
 
+   memset(&cq_attr, 0, sizeof(cq_attr));
+   cq_attr.cqe = cq_size;
port_priv->cq = ib_create_cq(port_priv->device,
 ib_mad_thread_completion_handler,
-NULL, port_priv, cq_size, 0);
+NULL, port_priv, &cq_attr);
if (IS_ERR(port_priv->cq)) {
dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 0c49163..0dc0abc 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1012,12 +1012,11 @@ EXPORT_SYMBOL(ib_destroy_qp);
 struct ib_cq *ib_create_cq(struct ib_device *device,
   ib_comp_handler comp_handler,
   void (*event_handler)(struct ib_event *, void *),
-  void *cq_context, int cqe, int comp_vector)
+  void *cq_context, struct ib_cq_init_attr *cq_attr)
 {
struct ib_cq *cq;
-   struct ib_cq_init_attr attr = {.cqe = cqe, .comp_vector = comp_vector};
 
-   cq = device->create_cq(device, &attr, NULL, NULL);
+   cq = device->create_cq(device, cq_attr, NULL, NULL);
 
if (!IS_ERR(cq)) {
cq->device= device;
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c 
b/drivers/infiniband/hw/ehca/ehca_main.c
index cd8d290..60b4730 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -534,6 +534,7 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 
port)
struct ib_cq *ibcq;
struct ib_qp *ibqp;
struct ib_qp_init_attr qp_init_attr;
+   struct ib_cq_init_attr cq_attr;
int ret;
 
if (sport->ibcq_aqp1) {
@@ -541,7 +542,10 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 
port)
return -EPERM;
}
 
-   ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0);
+   memset(&cq_attr, 0, sizeof(cq_attr));
+   cq_attr.cqe = 10;
+   ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
+   &cq_attr);
if (IS_ERR(ibcq)) {
ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
return PTR_ERR(ibcq);
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 9cd2b00..462e728 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1773,6 +1773,7 @@ static int create_pv_resources(struct ib_device *ibdev, 
int slave, int port,
   int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
 {
int ret, cq_size;
+   struct ib_cq_init_attr cq_attr;
 
if (ctx->state != DEMUX_PV_STATE_DOWN)
return -EEXIST;
@@ -1801,8 +180

[PATCH for-next V1 07/11] IB/mlx4: Add mmap call to map the hardware clock

From: Matan Barak 

In order to read the HCA's cycle counter efficiently in
user space, we need to map the HCA's register.
This is done through an mmap call.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c |   18 +-
 drivers/net/ethernet/mellanox/mlx4/main.c |   19 +++
 include/linux/mlx4/device.h   |9 +
 3 files changed, 45 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 2200cf9..d87401e 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -716,8 +716,24 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, 
struct vm_area_struct *vma)
   dev->dev->caps.num_uars,
   PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
-   } else
+   } else if (vma->vm_pgoff == 3) {
+   struct mlx4_clock_params params;
+   int ret = mlx4_get_internal_clock_params(dev->dev, &params);
+
+   if (ret)
+   return ret;
+
+   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+   if (io_remap_pfn_range(vma, vma->vm_start,
+  
(pci_resource_start(dev->dev->persist->pdev,
+  params.bar) +
+   params.offset)
+  >> PAGE_SHIFT,
+  PAGE_SIZE, vma->vm_page_prot))
+   return -EAGAIN;
+   } else {
return -EINVAL;
+   }
 
return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index ced5eca..70de39c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1674,6 +1674,25 @@ static int map_internal_clock(struct mlx4_dev *dev)
return 0;
 }
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+  struct mlx4_clock_params *params)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+
+   if (mlx4_is_slave(dev))
+   return -ENOTSUPP;
+
+   if (!params)
+   return -EINVAL;
+
+   params->bar = priv->fw.clock_bar;
+   params->offset = priv->fw.clock_offset;
+   params->size = MLX4_CLOCK_SIZE;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params);
+
 static void unmap_internal_clock(struct mlx4_dev *dev)
 {
struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab..f94984f 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -829,6 +829,12 @@ struct mlx4_dev {
struct mlx4_vf_dev *dev_vfs;
 };
 
+struct mlx4_clock_params {
+   u64 offset;
+   u8 bar;
+   u8 size;
+};
+
 struct mlx4_eqe {
u8  reserved1;
u8  type;
@@ -1485,4 +1491,7 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
 enum mlx4_access_reg_method method,
 struct mlx4_ptys_reg *ptys_reg);
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+  struct mlx4_clock_params *params);
+
 #endif /* MLX4_DEVICE_H */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V1 11/11] IB/mlx4: Return hca core clock's offset in query_device vendor's data

From: Matan Barak 

In order to read the HCA's core clock, the user-space needs to query
the correct offset in the mapped page. This offset is passed
through query_device's vendor specific data.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c|   36 -
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   15 ++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index a5a90c4..84001cf 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -140,10 +140,27 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_smp *out_mad = NULL;
int err = -ENOMEM;
int have_ib_ports;
+   struct mlx4_uverbs_ex_query_device cmd;
+   struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0};
+   struct mlx4_clock_params clock_params;
 
-   if (uhw->inlen || uhw->outlen)
-   return -EINVAL;
+   if (uhw->inlen) {
+   if (uhw->inlen < sizeof(cmd))
+   return -EINVAL;
+
+   err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd));
+   if (err)
+   return err;
+
+   if (cmd.comp_mask)
+   return -EINVAL;
+
+   if (cmd.reserved)
+   return -EINVAL;
+   }
 
+   resp.response_length = offsetof(typeof(resp), response_length) +
+   sizeof(resp.response_length);
in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
if (!in_mad || !out_mad)
@@ -236,6 +253,21 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->hca_core_clock = dev->dev->caps.hca_core_clock;
props->timestamp_mask = 0xFFFFFFFFFFFFULL;
 
+   err = mlx4_get_internal_clock_params(dev->dev, &clock_params);
+   if (err)
+   goto out;
+
+   if (uhw->outlen >= resp.response_length + 
sizeof(resp.hca_core_clock_offset)) {
+   resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE;
+   resp.response_length += sizeof(resp.hca_core_clock_offset);
+   resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP;
+   }
+
+   if (uhw->outlen) {
+   err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+   if (err)
+   goto out;
+   }
 out:
kfree(in_mad);
kfree(out_mad);
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index ae04dad..8421e15 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -556,6 +556,21 @@ struct mlx4_ib_qp_tunnel_init_attr {
u8 port;
 };
 
+struct mlx4_uverbs_ex_query_device {
+   __u32 comp_mask;
+   __u32 reserved;
+};
+
+enum query_device_resp_mask {
+   QUERY_DEVICE_RESP_MASK_TIMESTAMP = 1UL << 0,
+};
+
+struct mlx4_uverbs_ex_query_device_resp {
+   __u32 comp_mask;
+   __u32 response_length;
+   __u64 hca_core_clock_offset;
+};
+
 static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
 {
return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V1 06/11] IB/core: Pass hardware specific data in query_device

From: Matan Barak 

Vendors should be able to pass vendor specific data to/from
user-space via query_device uverb. In order to do this,
we need to pass the vendors' specific udata.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/device.c |4 +++-
 drivers/infiniband/core/uverbs_cmd.c |2 +-
 drivers/infiniband/hw/amso1100/c2_provider.c |7 +--
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |8 ++--
 drivers/infiniband/hw/cxgb4/provider.c   |8 ++--
 drivers/infiniband/hw/ehca/ehca_hca.c|6 +-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |3 ++-
 drivers/infiniband/hw/ipath/ipath_verbs.c|7 +--
 drivers/infiniband/hw/mlx4/main.c|6 +-
 drivers/infiniband/hw/mlx5/main.c|9 +++--
 drivers/infiniband/hw/mthca/mthca_provider.c |7 +--
 drivers/infiniband/hw/nes/nes_verbs.c|6 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |6 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |3 ++-
 drivers/infiniband/hw/qib/qib_verbs.c|6 --
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |6 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |3 ++-
 include/rdma/ib_verbs.h  |3 ++-
 18 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 81eabaf..a17106c 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -558,9 +558,11 @@ EXPORT_SYMBOL(ib_dispatch_event);
 int ib_query_device(struct ib_device *device,
struct ib_device_attr *device_attr)
 {
+   struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+
memset(device_attr, 0, sizeof(*device_attr));
 
-   return device->query_device(device, device_attr);
+   return device->query_device(device, device_attr, &uhw);
 }
 EXPORT_SYMBOL(ib_query_device);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 11ee298..bbb02ff 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3428,7 +3428,7 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 
memset(&attr, 0, sizeof(attr));
 
-   err = device->query_device(device, &attr);
+   err = device->query_device(device, &attr, uhw);
if (err)
return err;
 
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c 
b/drivers/infiniband/hw/amso1100/c2_provider.c
index 8e53d05..a00f479 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -63,13 +63,16 @@
 #include "c2_provider.h"
 #include "c2_user.h"
 
-static int c2_query_device(struct ib_device *ibdev,
-  struct ib_device_attr *props)
+static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+  struct ib_udata *uhw)
 {
struct c2_dev *c2dev = to_c2dev(ibdev);
 
pr_debug("%s:%u\n", __func__, __LINE__);
 
+   if (uhw->inlen || uhw->outlen)
+   return -EINVAL;
+
*props = c2dev->props;
return 0;
 }
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index eb74c4d..da6b4cf 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1150,13 +1150,17 @@ static u64 fw_vers_string_to_u64(struct iwch_dev 
*iwch_dev)
   (fw_mic & 0xffff);
 }
 
-static int iwch_query_device(struct ib_device *ibdev,
-struct ib_device_attr *props)
+static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+struct ib_udata *uhw)
 {
 
struct iwch_dev *dev;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
 
+   if (uhw->inlen || uhw->outlen)
+   return -EINVAL;
+
dev = to_iwch_dev(ibdev);
memset(props, 0, sizeof *props);
memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c 
b/drivers/infiniband/hw/cxgb4/provider.c
index 66bd6a2..c1558c4 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -301,13 +301,17 @@ static int c4iw_query_gid(struct ib_device *ibdev, u8 
port, int index,
return 0;
 }
 
-static int c4iw_query_device(struct ib_device *ibdev,
-struct ib_device_attr *props)
+static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+struct ib_udata *uhw)
 {
 
struct c4iw_dev *dev;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
 
+   if (uhw->inlen |

[PATCH for-next V1 09/11] IB/mlx4: Add support for timestamp in cq creation

From: Matan Barak 

Support allocation of CQ with IB_CQ_FLAGS_TIMESTAMP
creation flag.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/cq.c  |9 ++---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |1 +
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 8e44aaa..a1f50e1 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -166,6 +166,7 @@ err_buf:
return err;
 }
 
+#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_FLAGS_TIMESTAMP
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
@@ -178,10 +179,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
struct mlx4_uar *uar;
int err;
 
-   if (attr->flags)
+   if (entries < 1 || entries > dev->dev->caps.max_cqes)
return ERR_PTR(-EINVAL);
 
-   if (entries < 1 || entries > dev->dev->caps.max_cqes)
+   if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
return ERR_PTR(-EINVAL);
 
cq = kmalloc(sizeof *cq, GFP_KERNEL);
@@ -194,6 +195,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
spin_lock_init(&cq->lock);
cq->resize_buf = NULL;
cq->resize_umem = NULL;
+   cq->create_flags = attr->flags;
INIT_LIST_HEAD(&cq->send_qp_list);
INIT_LIST_HEAD(&cq->recv_qp_list);
 
@@ -237,7 +239,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
vector = dev->eq_table[vector % ibdev->num_comp_vectors];
 
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-   cq->db.dma, &cq->mcq, vector, 0, 0);
+   cq->db.dma, &cq->mcq, vector, 0,
+   !!(cq->create_flags & IB_CQ_FLAGS_TIMESTAMP));
if (err)
goto err_dbmap;
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index f127efc..ae04dad 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -110,6 +110,7 @@ struct mlx4_ib_cq {
struct mutexresize_mutex;
struct ib_umem *umem;
struct ib_umem *resize_umem;
+   int create_flags;
/* List of qps that it serves.*/
struct list_headsend_qp_list;
struct list_headrecv_qp_list;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC 2/3] IB/mlx4: Refactor Alias GUID storing

Move the code that actually set the alias GUID provided by the admin
into a function which isn't tied to the mlx4 SRIOV sysfs constructs.

This way we can use it for the verbs which deal with GUID setting too.

This commit does not change any functionality.

Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |4 ++
 drivers/infiniband/hw/mlx4/sysfs.c   |   54 ++
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h 
b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index fce3934..a13a814 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -807,6 +807,10 @@ void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev 
*device);
 
 __be64 mlx4_ib_gen_node_guid(void);
 
+
+void mlx4_store_admin_alias_guid(struct mlx4_ib_dev *mdev, int port, int slave,
+__be64 guid);
+
 int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
 void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
 int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c 
b/drivers/infiniband/hw/mlx4/sysfs.c
index 6797108..705e3b8 100644
--- a/drivers/infiniband/hw/mlx4/sysfs.c
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -59,6 +59,38 @@ static ssize_t show_admin_alias_guid(struct device *dev,
return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val));
 }
 
+
+void mlx4_store_admin_alias_guid(struct mlx4_ib_dev *mdev, int port, int slave,
+__be64 guid)
+{
+   unsigned long flags;
+   int record_num;/*0-15*/
+   int guid_index_in_rec; /*0 - 7*/
+
+   record_num= slave / 8;
+   guid_index_in_rec = slave % 8;
+
+   spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags);
+
+   *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port - 1].
+   all_rec_per_port[record_num].
+   all_recs[GUID_REC_SIZE * guid_index_in_rec] = guid;
+
+   /* Change the state to be pending for update */
+   mdev->sriov.alias_guid.ports_guid[port - 
1].all_rec_per_port[record_num].status
+   = MLX4_GUID_INFO_STATUS_IDLE ;
+
+   mlx4_set_admin_guid(mdev->dev, guid, slave, port);
+
+   /* set the record index */
+   mdev->sriov.alias_guid.ports_guid[port - 
1].all_rec_per_port[record_num].guid_indexes
+   |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+
+   spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags);
+
+   mlx4_ib_init_alias_guid_work(mdev, port - 1);
+}
+
 /* store_admin_alias_guid stores the (new) administratively assigned value of 
that GUID.
  * Values in buf parameter string:
  * 0   - requests opensm to assign a value.
@@ -76,7 +108,6 @@ static ssize_t store_admin_alias_guid(struct device *dev,
struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
struct mlx4_ib_dev *mdev = port->dev;
u64 sysadmin_ag_val;
-   unsigned long flags;
 
record_num = mlx4_ib_iov_dentry->entry_num / 8;
guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
@@ -84,26 +115,11 @@ static ssize_t store_admin_alias_guid(struct device *dev,
pr_err("GUID 0 block 0 is RO\n");
return count;
}
-   spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags);
sscanf(buf, "%llx", &sysadmin_ag_val);
-   *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
-   all_rec_per_port[record_num].
-   all_recs[GUID_REC_SIZE * guid_index_in_rec] =
-   cpu_to_be64(sysadmin_ag_val);
-
-   /* Change the state to be pending for update */
-   mdev->sriov.alias_guid.ports_guid[port->num - 
1].all_rec_per_port[record_num].status
-   = MLX4_GUID_INFO_STATUS_IDLE ;
-   mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val),
-   mlx4_ib_iov_dentry->entry_num,
-   port->num);
 
-   /* set the record index */
-   mdev->sriov.alias_guid.ports_guid[port->num - 
1].all_rec_per_port[record_num].guid_indexes
-   |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
-
-   spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags);
-   mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
+   mlx4_store_admin_alias_guid(mdev, port->num,
+   mlx4_ib_iov_dentry->entry_num,
+   cpu_to_be64(sysadmin_ag_val));
 
return count;
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC 1/3] IB/IPoIB: Support SRIOV standard configuration

Standard configuration of SRIOV VFs through the host is done over the
following chain of calls: libvirt --> netlink --> PF netdevice

When this comes to IB/IPoIB we should normalize this into the verbs
framework so we further go: PF IPoIB --> verbs API --> PF HW driver

Virtualization systems assign VMs a 48-bit MAC. To allow working with
non-modified SW layers (open-stack, libvirt, etc), we can safely
extend this MAC to a unique 64-bit GUID. Hence the IPoIB ndo_set_vf_mac
entry calls the set_vf_guid verb.

Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   39 +
 include/rdma/ib_verbs.h   |4 +++
 2 files changed, 43 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 9e1b203..8f82870 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -1357,6 +1357,43 @@ void ipoib_dev_cleanup(struct net_device *dev)
priv->tx_ring = NULL;
 }
 
+static int ipoib_get_vf_config(struct net_device *dev, int vf, struct 
ifla_vf_info *ivf)
+{
+   struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+   if (priv->ca->get_vf_config)
+   return priv->ca->get_vf_config(priv->ca, priv->port, vf, ivf);
+   else
+   return -EINVAL;
+}
+
+static int ipoib_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
+{
+   char *raw_guid;
+   u64 guid = 0;
+
+   struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+   raw_guid = (char *)&guid;
+   raw_guid[0] = mac[0];
+   raw_guid[1] = mac[1];
+   raw_guid[2] = mac[2];
+   raw_guid[3] = 0xff;
+   raw_guid[4] = 0xfe;
+   raw_guid[5] = mac[3];
+   raw_guid[6] = mac[4];
+   raw_guid[7] = mac[5];
+
+   guid &= ~(cpu_to_be64(1ULL << 56));
+   guid |= cpu_to_be64(1ULL << 57);
+
+   if (priv->ca->set_vf_guid)
+   return priv->ca->set_vf_guid(priv->ca, priv->port, queue, guid);
+   else
+   return -EINVAL;
+}
+
+
 static const struct header_ops ipoib_header_ops = {
.create = ipoib_hard_header,
 };
@@ -1371,6 +1408,8 @@ static const struct net_device_ops ipoib_netdev_ops = {
.ndo_tx_timeout  = ipoib_timeout,
.ndo_set_rx_mode = ipoib_set_mcast_list,
.ndo_get_iflink  = ipoib_get_iflink,
+   .ndo_get_vf_config   = ipoib_get_vf_config,
+   .ndo_set_vf_mac  = ipoib_set_vf_mac,
 };
 
 void ipoib_setup(struct net_device *dev)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 65994a1..6589520 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -53,6 +53,7 @@
 #include 
 #include 
 #include 
+#include 
 
 extern struct workqueue_struct *ib_wq;
 
@@ -1653,6 +1654,9 @@ struct ib_device {
int(*check_mr_status)(struct ib_mr *mr, u32 
check_mask,
  struct ib_mr_status 
*mr_status);
 
+   int   (*set_vf_guid)  (struct ib_device *device, 
int port, int vf, u64 guid);
+   int   (*get_vf_config)(struct ib_device *device, 
int port, int vf, struct ifla_vf_info *ivf);
+
struct ib_dma_mapping_ops   *dma_ops;
 
struct module   *owner;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs

Standard configuration of SRIOV VFs through the host is done over the
following chain of calls: libvirt --> netlink --> PF netdevice -- where
the PF netdevice exports the ndo_set_vf_ calls.

When this comes to IB/IPoIB we should normalize this into the verbs
framework so we further go: PF IPoIB --> verbs API --> PF HW driver

Virtualization systems assign VMs with 48 bits mac, to allow working 
with non-modified SW layers (open-stack, libvirt, etc), we can safely
extend this mac to unique 64 bits GUID. Hence the IPoIB ndo_set_vf_mac
entry calls the set_vf_guid verb.

One thing to clean up before going beyond RFC is to make the get_vf_config
verb return a GUID and have IPoIB convert it back to a MAC.

Here's how it looks when using the ip tool (libvirt issues the same
netlink request to set it) and how it is later reflected when the VF reads its port.

# ip link set dev ib0 vf 1 mac aa:bb:cc:dd:ee:ff

# ibstat -d mlx4_2
CA 'mlx4_2'
CA type: MT4100
Number of ports: 1
Firmware version: 2.34.1260
Hardware version: 0
Node GUID: 0x00140500f30e84c4
System image GUID: 0xf452140300117423
Port 1:
State: Active
Physical state: LinkUp
Rate: 56
Base lid: 7
LMC: 0
SM lid: 1
Capability mask: 0x02514868
Port GUID: 0xffeeddfeffccbbaa
    Link layer: InfiniBand

Or Gerlitz (3):
  IB/IPoIB: Support SRIOV standard configuration
  IB/mlx4: Refactor Alias GUID storing
  IB/mlx4: Add support for SRIOV VF management

 drivers/infiniband/hw/mlx4/main.c |   26 ++
 drivers/infiniband/hw/mlx4/mlx4_ib.h  |4 ++
 drivers/infiniband/hw/mlx4/sysfs.c|   54 ++--
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   39 +
 drivers/net/ethernet/mellanox/mlx4/cmd.c  |   26 +
 include/linux/mlx4/device.h   |2 +
 include/rdma/ib_verbs.h   |4 ++
 7 files changed, 128 insertions(+), 27 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RFC 3/3] IB/mlx4: Add support for SRIOV VF management

Add support for the set_vf_guid and get_vf_config verbs.

Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c|   26 ++
 drivers/net/ethernet/mellanox/mlx4/cmd.c |   26 ++
 include/linux/mlx4/device.h  |2 ++
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 57070c5..17b6fa7 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -1325,6 +1325,27 @@ err_malloc:
return err;
 }
 
+static int mlx4_ib_set_vf_guid(struct ib_device *ibdev, int port, int vf, u64 
guid)
+{
+   int slave;
+   struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+
+   slave = mlx4_get_slave_indx(mdev->dev, vf);
+   if (slave < 0)
+   return -EINVAL;
+
+   mlx4_store_admin_alias_guid(mdev, port, slave, cpu_to_be64(guid));
+
+   return 0;
+}
+
+static int mlx4_ib_get_vf_config(struct ib_device *ibdev, int port, int vf, 
struct ifla_vf_info *ivf)
+{
+   struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+
+   return mlx4_get_vf_config(mdev->dev, port, vf, ivf);
+}
+
 static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
 {
struct mlx4_ib_gid_entry *ge;
@@ -2250,6 +2271,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
ibdev->ib_dev.dealloc_fmr   = mlx4_ib_fmr_dealloc;
}
 
+   if (mlx4_is_master(ibdev->dev)) {
+   ibdev->ib_dev.set_vf_guid   = mlx4_ib_set_vf_guid;
+   ibdev->ib_dev.get_vf_config = mlx4_ib_get_vf_config;
+   }
+
if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 7761045..a544650 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -2647,7 +2647,7 @@ u32 mlx4_comm_get_version(void)
 return ((u32) CMD_CHAN_IF_REV << 8) | (u32) CMD_CHAN_VER;
 }
 
-static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf)
+int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf)
 {
if ((vf < 0) || (vf >= dev->persist->num_vfs)) {
mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n",
@@ -2657,6 +2657,7 @@ static int mlx4_get_slave_indx(struct mlx4_dev *dev, int 
vf)
 
return vf+1;
 }
+EXPORT_SYMBOL_GPL(mlx4_get_slave_indx);
 
 int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave)
 {
@@ -3089,13 +3090,22 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, 
int vf, struct ifla_vf_in
s_info = &priv->mfunc.master.vf_admin[slave].vport[port];
ivf->vf = vf;
 
-   /* need to convert it to a func */
-   ivf->mac[0] = ((s_info->mac >> (5*8)) & 0xff);
-   ivf->mac[1] = ((s_info->mac >> (4*8)) & 0xff);
-   ivf->mac[2] = ((s_info->mac >> (3*8)) & 0xff);
-   ivf->mac[3] = ((s_info->mac >> (2*8)) & 0xff);
-   ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff);
-   ivf->mac[5] = ((s_info->mac)  & 0xff);
+   if (dev->caps.port_mask[port] == MLX4_PORT_TYPE_ETH) {
+   ivf->mac[0] = ((s_info->mac >> (5*8)) & 0xff);
+   ivf->mac[1] = ((s_info->mac >> (4*8)) & 0xff);
+   ivf->mac[2] = ((s_info->mac >> (3*8)) & 0xff);
+   ivf->mac[3] = ((s_info->mac >> (2*8)) & 0xff);
+   ivf->mac[4] = ((s_info->mac >> (1*8)) & 0xff);
+   ivf->mac[5] = ((s_info->mac)  & 0xff);
+   } else {
+   u64 guid = be64_to_cpu(s_info->guid);
+   ivf->mac[0] = ((guid >> (7*8)) & 0xff);
+   ivf->mac[1] = ((guid >> (6*8)) & 0xff);
+   ivf->mac[2] = ((guid >> (5*8)) & 0xff);
+   ivf->mac[3] = ((guid >> (2*8)) & 0xff);
+   ivf->mac[4] = ((guid >> (1*8)) & 0xff);
+   ivf->mac[5] = ((guid)  & 0xff);
+   }
 
ivf->vlan   = s_info->default_vlan;
ivf->qos= s_info->default_qos;
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab..e5a70bd 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -1382,6 +1382,8 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, 
int port, u8 *gid,
 int mlx4_get_roce_gid_from_slave(struct mlx4_dev *dev, int port, int slave_id,
 u8 *gid);
 
+int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf);
+
 int mlx4_FLOW_STEERING_IB_UC_QP_RANGE(struct mlx4_dev *dev, u32 min_range_qpn,
  u32 max_range_qpn);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs

On Thu, May 21, 2015 at 7:40 PM, Doug Ledford  wrote:

> The MAC/GUID mapping isn't the only thing that has to be faked here

Exactly nothing is faked here, Virtualization systems such as
open-stack provision unique 48 bit mac values to VMs, and it's
perfectly legitimate and viable to derive 64 bit guid value from that
mac.

> Why are we suggesting to make this work with unmodified software?  Why
> aren't we doing this right and adding a new ndo entry point for the GUID?

Because rome wasn't built in a day and nor will be the support for IB
in today's/tomorrow's virtualization systems, e.g if you follow on
this layering

[1] Open-Stack / ODL controller
[2] Open-Stack neutron / ODL agent
[3] libvirt
[4] user/kernel netlink API
[5] kernel ndo API
[6] ipoib
[7] kernel verbs API
[8] PF IB driver

with the approach presented here, we only need (yeah, simplicity
can turn out to be a critical criterion in engineering) a few kernel-only
patches that deal with layers 6-8, and we are ready for all sorts of
bring-ups, testing and even production!

For reasons for which I don't really see the practical / real-life use
case where it is a must to get them to work (but I will be happy to
hear of one), one can go & change the world, namely patch layers 5 ---> 1
too and deal with all sorts of dependencies for setting up a system.
But guess what, this can be perfectly done in parallel with this small
change.

> you would also have to fake the vlan/pkey mapping.  This just
> seems the wrong thing to do.

Repeating the above argument --- virt systems provision a 12-bit vlan-id
to be set for VM traffic, which can be nicely mapped to a 16-bit IB pkey
doing the same job.

I understand that you have a sort of desire to see IB à la the full spec
going into libvirt and from there up to the whole virtualization
management space, but this shouldn't serve as an argument against enabling
things that go in the right direction. The upstream kernel has supported SRIOV
for IB over mlx4 for 3 years now, but this can't work with libvirt as
is. Using these patches can make it work.

Couple of months ago, we both attended a call with the libvirt
developers / maintainers from red-hat and they really liked this
staged approach.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 1/3] IB/IPoIB: Support SRIOV standard configuration

On Thu, May 21, 2015, Jason Gunthorpe  wrote:

>> +static int ipoib_set_vf_mac(struct net_device *dev, int queue, u8 *mac)
>> +{
>> + char *raw_guid;
>> + u64 guid = 0;
>
> This doesn't seem right at all.
>
> It makes no sense that a IPoIB interface with a 20 byte LLADDR would
> accept an 8 byte LLADDR only for 'ip link set vf mac'
>
> The definition of the netlink struct seems to confirm this:
>
> struct ifla_vf_mac {
> __u32 vf;
> __u8 mac[32]; /* MAX_ADDR_LEN */
> };
>
> If it was really just ever a mac it would really only be 6 bytes.
> [Honestly, this whole feature seems very inconsistent with the rest of
>  the design of ip net link, so who knows]
>
> If the ifla_vf_mac had been variable-sized (like every other address
> related attribute) then sure, auto detect the length and do the right
> thing.
>
> But with this API, I think you have no choice, 'ip set vf mac LLADDR'
> can only be the 20 byte address.

You can't enforce 20 byte address on ipoib instance b/c the driver
can't dictate the QPN of their UD QP. Also, you don't need to force
that, b/c what the virtualization system want to provision relates to
a VM ID which is their mac (--> guid).

If the ifla_vf_mac had been variable-sized we could have the ipoib
set_vf_mac implementation to check if user-space provided 48 bits
(mac), 64bits (guid) or even 128bits (whole gid), but it doesn't and I
would like to either use the lowest common denominator (48bits) or
just take always 64bits which could have two zero bytes (e.g when
libvirt calls into that api through netlink).
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V1 00/11] Add completion timestamping support

2015-05-22 Thread Or Gerlitz

On Thu, May 21, 2015 at 5:56 PM, Or Gerlitz  wrote:
[...]
> In mlx4, allowing the user to read the core clock efficiently involves mapping
> this area of the hardware to user-space (being done by using a mmap command)
> and reading the clock from the correct offset of the page.
>
> This offset is returned in the vendor's specific data from mlx4's kernel 
> driver
> to the mlx4's user-space driver. query_device is modified in order to support
> passing this vendor specific data. A user-space application could use a new
> verb in order to read the hardware's clock.
>
> Translating the hardware's clock into ms could be done by dividing this
> value by hca_core_clock (which is returned by the extended version of
> query_device uverb).
>
> A user-space application could get the current HW's clock by executing
>
> ibv_query_values_ex(struct ibv_context *context, uint32_t q_values,
> struct ibv_values_ex *values)
>
> The function gets a mask of the values to query and return their values.
> Vendors could either implement this as a uverb command or use their
> user-space driver to return those values directly from the HW (the mlx4 way).
>
> Changes from V0:
> (1) Pass ib_cq_init_attr instead of cqe and comp_vector.
> (2) Fix unneeded indentation.
> (3) Change flags to u32.
> (4) Add const to create_cq's ib_cq_init_attr argument in vendor 
> implementation.


Hi Doug,

I believe the V1 post addressed all the reviewer comments on V0 except
for one open issue.

What's still open is the debate on mlx4 using the uverbs builtin udata
mechanism to transfer the clock offset in the mmaped page to user
space. I have replied there and am still waiting to get a response.

Or.

> Matan Barak (11):
>   IB/core: Change provider's API of create_cq to be extendible
>   IB/core: Change ib_create_cq to use struct ib_cq_init_attr
>   IB/core: Add CQ creation time-stamping flag
>   IB/core: Extend ib_uverbs_create_cq
>   IB/core: Add timestamp_mask and hca_core_clock to query_device
>   IB/core: Pass hardware specific data in query_device
>   IB/mlx4: Add mmap call to map the hardware clock
>   IB/mlx4: Support extended create_cq and query_device uverbs
>   IB/mlx4: Add support for timestamp in cq creation
>   IB/mlx4: Add timestamp_mask and hca_core_clock to query_device
>   IB/mlx4: Return hca core clock's offset in query_device vendor's data
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

2015-05-22 Thread Or Gerlitz

On Wed, May 20, 2015 at 8:53 PM, Or Gerlitz  wrote:
> On Wed, May 20, 2015 at 6:11 PM, Yann Droneaud  wrote:
>
>>> But this is whole purpose of the udata framework in uverbs, right? for
>>> each uverb command the vendor user-space library has a well defined
>>> channel to communicate directly with the low level vendor driver
>>> throughout the uverbs channels.
>
>> Uverbs convey information between kernel and userspace drivers to
>> implement verbs for userspace application. I don't think it's designed
>> to allow vendor to add random extensions in the best way with regard to
> backward/forward compatibility.
>
> Disagree that this is random extension. The people that designed this
> stack 10y ago (Roland and Co.) looked very nicely forward and realized
> that not all the HW are the same nor can be put 101% under the same
> API with no way out, and hence they came up with udata.
>
> Please state how you see the role of the uverbs udata mechanism.

Guys, still waiting to hear why you think it's wrong here to use the
mechanism which was built from day-1 for the purpose of allowing the
user-space driver library to communicate with the kernel driver and
pass values in both directions.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs

2015-05-25 Thread Or Gerlitz

On Fri, May 22, 2015 at 12:11 AM, Doug Ledford  wrote:
> On Thu, 2015-05-21 at 22:55 +0300, Or Gerlitz wrote:
>> On Thu, May 21, 2015 at 7:40 PM, Doug Ledford  wrote:
>>
>> > The MAC/GUID mapping isn't the only thing that has to be faked here
>>
>> Exactly nothing is faked here, Virtualization systems such as
>> open-stack provision unique 48 bit mac values to VMs, and it's
>> perfectly legitimate and viable to derive 64 bit guid value from that
>> mac.
>
> OK, faked wasn't the best use of words.  How's converted behind the
> software's back?  And if the management software set the MAC, then tried
> to check it via ARP after the guest is up and running, it would never
> find the guest.  I don't know if Open-Stack or any other controller
> would both A) attempt to set the MAC of the device in libvirt and start
> the guest and B) enter the MAC into a dhcp.conf file for static IP
> assignment, but they could, and this sort of manipulation would directly
> break that.

OK, so rewinding a bit, the IB VF [1] identity is their 8 bytes port
GUID, and as Jason noted the
user/kernel API allows to deliver up to 32 bytes between user and
kernel under the set_vf_mac flow
(do_setvfinfo() in net/core/rtnetlink.c). Trying it out through
**non-modified** ip tool and net/core/rtnetlink.c
things just work -  I can set eight bytes value to be the virtual port GUID :

# ip link set dev ib0 vf 1 mac aa:bb:cc:dd:ee:ff:11:22

# ibstat -d mlx4_2
CA 'mlx4_2'
CA type: MT4100
Number of ports: 1
Firmware version: 2.34.1260
Hardware version: 0
Node GUID: 0x001405003bca04bb
System image GUID: 0xf452140300117423
Port 1:
State: Active
Physical state: LinkUp
Rate: 56
Base lid: 7
LMC: 0
SM lid: 1
Capability mask: 0x02514868
Port GUID: 0x2211ffeeddccbbaa
Link layer: InfiniBand

Re DHCP: RFC 2131 adds a “client identifier” option that replaces the
client hardware address as the unique identifier of the client in its
subnet. DHCP over IB RFC 4390 [1] requires that IPoIB DHCP clients use
the client identifier (as they cannot fit their 20 byte MAC in the
client hardware address field). DHCP packages (e.g ISC) that support
IPoIB use client identifier which is based on the unique eight byte
port GUID, so with the modified patches that use 8 bytes, we're OK
DHCP wise.

[1] https://tools.ietf.org/html/rfc4390#section-2.1

[...]
> It's a workaround.  It comes with limitations, and if we get around to
> adding an ndo later for really setting the guid, then it would be
> possible to call the set_guid ndo with a complete guid that didn't use
> fffe in the middle 2 bytes, and then when we call get vf_info, we get a
> MAC back that removes those 2 bytes and generates an inconsistency
> between what we *think* our constructed guid should be and what the set
> guid actually is.

OK, as written above, I have managed to get away from this possible mess
which you described here by providing eight bytes from user to kernel through
the existing netlink API (which is used by the ip tool and libvirt).

>> Couple of months ago, we both attended a call with the libvirt
>> developers / maintainers from red-hat and they really liked this
>> staged approach.

> My recollection of that call was they said "Oh, you guys don't have an
> API for us to set the GUIDs yet.  Ok, we'll close all the bugs and wait
> until you do."  And they promptly closed the bugs and moved on.  But
> that didn't specify the API to use.  That's what we are doing here.  But
> I'm not finding this an entirely convincing solution.

So that was what was said; we were wrong, and with this small ipoib/verbs patch
we fully have the API to provision the vGUID of the VF.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs

2015-05-25 Thread Or Gerlitz

On Tue, May 26, 2015 at 12:14 AM, Jason Gunthorpe
 wrote:
> On Mon, May 25, 2015 at 11:04:41PM +0300, Or Gerlitz wrote:
>
>> OK, so rewinding a bit, the IB VF [1] identity is their 8 bytes port
>> GUID, and as Jason noted the user/kernel API allows to deliver up to
>> 32 bytes between user and kernel under the set_vf_mac flow
>> (do_setvfinfo() in net/core/rtnetlink.c). Trying it out through
>> **non-modified** ip tool and net/core/rtnetlink.c things just work -
>> I can set eight bytes value to be the virtual port GUID :
>
> Was I not perfectly clear? You have to use the 20 byte LLADDR format here:
>
>> # ip link set dev ib0 vf 1 mac aa:bb:cc:dd:ee:ff:11:22

Jason,

I am aiming to provision the VF IB end-node address == port GUID (vGUID)
in the same manner that VF Eth end-node address is their MAC, not
more, not less.

20 bytes are the lladdr of IPoIB devices, which isn't the VF IB
end-node address but
rather is made of flags (1B) + QPN (3B) + subnet prefix (8B) + VF GUID --
way more than
the virtualization system cares about or can provision.

>> Port GUID: 0x2211ffeeddccbbaa

> The byte order got screwed up someplace.

thx, will fix
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs

2015-05-25 Thread Or Gerlitz

On Tue, May 26, 2015 at 1:32 AM, Jason Gunthorpe
 wrote:
> On Tue, May 26, 2015 at 12:50:52AM +0300, Or Gerlitz wrote:
>> On Tue, May 26, 2015 at 12:14 AM, Jason Gunthorpe
>>  wrote:
>> > On Mon, May 25, 2015 at 11:04:41PM +0300, Or Gerlitz wrote:
>> >
>> >> OK, so rewinding a bit, the IB VF [1] identity is their 8 bytes port
>> >> GUID, and as Jason noted the user/kernel API allows to deliver up to
>> >> 32 bytes between user and kernel under the set_vf_mac flow
>> >> (do_setvfinfo() in net/core/rtnetlink.c). Trying it out through
>> >> **non-modified** ip tool and net/core/rtnetlink.c things just work -
>> >> I can set eight bytes value to be the virtual port GUID :
>> >
>> > Was I not perfectly clear? You have to use the 20 byte LLADDR format here:
>> >
>> >> # ip link set dev ib0 vf 1 mac aa:bb:cc:dd:ee:ff:11:22
>>
>> Jason,
>>
>> I am aiming to provision the VF IB end-node address == port GUID (vGUID)
>> in the same manner that VF Eth end-node address is their MAC, not
>> more, not less.
>
> I perfectly understand what you are trying to do.

Good, we should be doing things for a reason, and not just for the sake of doing
them 111% right by someone's possibly subjective judgement

> I care about the design and consistency of netlink - and that means
> there is one LLADDR definition for a net device, and every single netlink
> message that touches a LLADDR uses that definition - for IPoIB that is 20
> bytes.

> To violate that design invariant needs an incredibly strong argument,
> and you haven't made it.


> This allows generic code to work with the LLADDR - for instance 'ip
> link set vf mac' should have checked the size of the IFLA_ADDRESS and
> demanded that the address argument is the same number of bytes. It is
> very broken the command happily accepts an 8 byte and 6 byte argument
> for the same device.

OK, so per your view, the existing kernel code for this flow is broken
and you resist my attempt to use it as is, and

> Yes, it is ugly that the PF side's ndo_get_vf_config cannot return the
> same 20 byte address of the VF's ipoib interface, but I think that is
> less ugly than forcing a different address format just for the vf calls.

you claim that what I propose is uglier than the fact that the PF can
by no means return the VF's 20B IPoIB address, and that it's OK if I only let
the PF configure 20 bytes with part (say four) of them being arbitrary
and only have consistent 20B get/set by the PF.

Would you be happier if the ipoib ndo_set_vf_mac ndo be

1. getting 20B from user-space and treating 16 of them as the VF
subnet-prefix (8B) + vGUID (8B)

2. checking that the subnet-prefix  is correct

3. provision the vGUID through PF driver / the verb I proposed for the VF

???

on the way back, for the get_vf_config

1. read the VF vGUID from the PF IB driver through the verbs

2. add the port subnet prefix

3. return 20B to user-space

???

> If you have doubts then *ask netdev*. Ask them if ndo_set_vf_mac must
> follow the same address size and format as IFLA_ADDRESS, or if we can
> use something else.

> Such a netlink architecture choice is beyond the authority of linux-rdma.

I am not at the point of starting to change this specific netlink flow.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

2015-05-26 Thread Or Gerlitz

On Sat, May 23, 2015 at 7:26 AM, Or Gerlitz  wrote:
> On Wed, May 20, 2015 at 8:53 PM, Or Gerlitz  wrote:
>> On Wed, May 20, 2015 at 6:11 PM, Yann Droneaud  wrote:
>>
>>>> But this is whole purpose of the udata framework in uverbs, right? for
>>>> each uverb command the vendor user-space library has a well defined
>>>> channel to communicate directly with the low level vendor driver
>>>> throughout the uverbs channels.
>>
>>> Uverbs convey information between kernel and userspace drivers to
>>> implement verbs for userspace application. I don't think it's designed
>>> to allow vendor to add random extensions in the best way with regard to
>>> backward/forward compatibility.
>>
>> Disagree that this is random extension. The people that designed this
>> stack 10y ago (Roland and Co.) looked very nicely forward and realized
>> that not all the HW are the same nor can be put 101% under the same
>> API with no way out, and hence they came up with udata.
>>
>> Please state how you see the role of the uverbs udata mechanism.
>
> Guys, still waiting to hear why you think it's wrong here to use the
> mechanism which was built from day-1 for the purpose of allowing the
> user-space driver library to communicate with the kernel driver and
> pass values in both directions.

Jason, ping, it's fair to require that if you made a review argument against
the design done here and we've responded about a week ago, saying why
this design is valid (e.g goes along the 10y old IB stack udata mechanism and
such) -- you would comment on the response and not  leave it in the air.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

2015-05-26 Thread Or Gerlitz

On Tue, May 26, 2015 at 7:06 PM, Jason Gunthorpe
 wrote:
> On Tue, May 26, 2015 at 11:10:45AM +0300, Or Gerlitz wrote:
>
>> Jason, ping, it's fair to require that if you made a review argument against
>> the design done here and we've responded about a week ago, saying why
>> this design is valid (e.g goes along the 10y old IB stack udata mechanism and
>> such) -- you would comment on the response and not  leave it in the air.
>
> Was it not clear? Yann and I asked to see the user space side before
> reviewing this series further.

Jason, you (U2 BTW) play really, really hard - refusing to say **one**
word on your approach towards the built-in udata mechanism for uverbs
which I asked you to comment on.

On top of that, as happens **all** the **time** in netdev and possibly
other subsystems, user space facing kernel patches were reviewed and
accepted in this list over the last ten years with-out seeing their
user-space counter parts @ the time of the kernel submission. There's
no reason to impose this as hard requirement just b/c two reviewers
ask that. You don't own this place.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

2015-05-26 Thread Or Gerlitz

On Tue, May 26, 2015 at 9:53 PM, Jason Gunthorpe
 wrote:
> On Tue, May 26, 2015 at 09:33:18PM +0300, Or Gerlitz wrote:
>> On Tue, May 26, 2015 at 7:06 PM, Jason Gunthorpe
>>  wrote:
>> > On Tue, May 26, 2015 at 11:10:45AM +0300, Or Gerlitz wrote:
>> >
>> >> Jason, ping, it's fair to require that if you made a review argument 
>> >> against
>> >> the design done here and we've responded about a week ago, saying why
>> >> this design is valid (e.g goes along the 10y old IB stack udata mechanism 
>> >> and
>> >> such) -- you would comment on the response and not  leave it in the air.
>> >
>> > Was it not clear? Yann and I asked to see the user space side before
>> > reviewing this series further.
>>
>> Jason, you (U2 BTW) play really, really hard - refusing to say **one**
>> word on your approach towards the built-in udata mechanism for uverbs
>> which I asked you to comment on.

> And I asked to see the user space side and you have angrily refused
> every time.

AFAIR I never ever refused to show any piece of code which went under
my hands towards Linux to any-one.

> So I guess we are both playing hard.

I disagree, you act as sort of being the boss here, stating every now
and then your preferences and way of engineering things as the
ultimate guidelines for Linux and/or RDMA engineering.

> FWIW, your comments on udata seemed compelling

Good to hear

> but I want to see the whole solution before saying I'm OK with it.

go look, not the final cut but should be close to what we'll submit

https://github.com/matanb10/libibverbs timestamp-v0
https://github.com/matanb10/libmlx4 timestamp-v0
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: rdma kernel tree

Doug,

Do you have everything in place such that what you stage for upstream
is subject to linux-next auto merge tests and such?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

On Wed, May 27, 2015 at 1:07 AM, Jason Gunthorpe
 wrote:
> On Tue, May 26, 2015 at 11:39:04PM +0300, Or Gerlitz wrote:
>> >> Jason, you (U2 BTW) play really, really hard - refusing to say **one**
>> >> word on your approach towards the built-in udata mechanism for uverbs
>> >> which I asked you to comment on.
>>
>> > And I asked to see the user space side and you have angrily refused
>> > every time.
>>
>> AFAIR I never ever refused to show any piece of code which went under
>> my hands towards Linux to any-one.

> For future reference, when someone asks a question and you go off on
> a tangential rant and ignore the question, then that process repeats,
> still without answering the question - most english speakers would
> call that refusing to answer the question. It is not looked upon kindly.

Jason,

It's not that you asked to see the code ala "hey, do you happen to
have a git with
the user space code for people to inspect while doing the review on
the kernel part", but
rather U2 saying in a definitive manner that posting the user space
code should be
imposed as pre-requirement to acceptance of the kernel parts.

In parallel, U2 totally rejected our usage of udata on the spot and
when I mentioned
that it's a feature which was designed for that purpose exactly and
from day one, it took
me three reminders to get a "you know what, maybe that can fly"
comment from you.

So here I started to realize that there's something in the attitude
that goes beyond
the details, and I made the you're not the boss comment.


> I'm really confused why you didn't just post the github links last
> week, the patches are all a month old on there. Was it really so
> offensive to you that we wanted to review the kernel UAPI patches and
> verbs patches together?
>
>> > So I guess we are both playing hard.
>>
>> I disagree, you act as sort of being the boss here, stating every now
>> and then your preferences and way of engineering things as the
>> ultimate guidelines for Linux and/or RDMA engineering.
>
> Lets be clear Or, I have given you (and others) some very pointed
> comments and advice, privately and publicly. That is not 'being the
> boss' that is contributing to fix our community.
>
> When it comes to my patch comments, I give direction on what I want to
> see to provide my Reviewed-By.
>
> If you don't like it, then find someone else to review your code.
>
> I'm busy, and I don't work for you. If I don't want to review some
> patches because my questions have been ignored, then that is entirely
> my prerogative.
>
> Pinging me *three times this week* on this stupid timestamp thing, is
> somewhere between annoying and offensive.

See above, why I made these pings.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs


On 5/27/2015 12:11 AM, Jason Gunthorpe wrote:

On Tue, May 26, 2015 at 04:32:58PM -0400, Doug Ledford wrote:


  - ifcfg/udev/networkmanager: So what happens when I do
 ip link add link ib0 name ib0.1 type ipoib
And get two IPoIB interfaces with the same GUID? I doubt any sane
user would want to apply the same config to those two interfaces.

No, they probably don't want to apply the same rules to both interfaces.
I'm not entirely sure I agree with the argument though.  I fully
expected this to fail without a pkey argument on the ip command
line.

Does that matter to the above tools? Are they using PKey,GUID as their
key?


The net stack doesn't allow users to do the same thing with Ethernet
devices, so I'm not sure we shouldn't be disallowing this as opposed to
creating duplicate devices that are identical in all ways except name.

The netstack doesn't allow it for ethernet because it would create a
2nd identical LLADDR, and LLADDRs must be unique.

Because the QPN is part of the LLADDR IB can create two interfaces on
the same physical port that are completely separated by hardware. Read
Haggi's email, he explains how they plan to use this to create
interfaces that can be delegated to namespaces. It is not a bad idea
really..

So prepare for a world where each namespace has a child IPoIB
interface with a unique QPN, but the same Pkey and GUID as the
host. The breakage from assuming GUID == unique will become a problem.


Unbreaking it is a UAPI change, not impossible, but do we really care
enough about 8 or 20 to push for that?

In truth, at least right now, it's all moot.  Since we can't set the
subnet prefix, the qpn, or the flags, anything above 8 bytes is
immutable regardless of how many bytes we pass in.  So even if we say we
aren't going to change the UAPI and force everything to 20, the real world
result is that 8 works exactly the same and has no functional
difference.

Not quite, in the 20 byte format the 8 bytes of the GUID are in the
last 8/20 bytes, so the app would have to place 12 zeros and then the
GUID to follow the 20 byte format (or 4 zeros, the prefix, then the GUID)

This is why the question of 'what is IFLA_VF_MAC' is so important,
every option presented (MAC,GUID,LLADDR) are incompatible with each
other.


I agree with Doug that to be practical here, libvirt and Co. would 
really want to use rtnetlink based provisioning of IB VFs, at least in a 
similar manner done for Eth VFs.


So with this assumption at hand, my vote goes to having user-space to 
provide the eight bytes of vGUID through the ndo_set_vf_mac call into 
IPoIB.


I don't see the real value of user space providing the four zero bytes 
(19-16) and the 8 bytes of the subnet prefix provided by the SM.


My personal thinking is that the important thing to address is 
consistency between what the virtualization system provisions on the 
host (ndo_set_vf_mac) to the DHCP server scheme they build.


Do we have a go here?

Also few comments on DHCP:

If we're talking on different vlans/Eth or pkey/IB - it's totally OK for 
two entities (== IPoIB instances under IB) on the physical subnet to use 
the same identifier (IB/GUID, Eth/MAC) if they are on two different L2 
broadcast domains. The DHCP server is expected to have a different 
mapping scheme per such virtual L2 subnet.


For SRIOV, we don't expect two VFs on the network to use the same vGUID, 
so DHCP wise we should be OK. Today the Client-ID works fine for SRIOV 
schemes which are based on 8byte vGUIDs.


Re two IPoIB child devices using the same GUID and the same pkey, we can 
enhance the system and take advantage of IB Alias GUIDs which today are 
only used for SRIOV for Para-Virtual and other environments too, thanks 
for the heads up on the necessity of doing so.





What does get return? If we accept 8 or 20, then get must return 20.

The get has to return 20 regardless.  It's the only accepted means of
getting all 20 bytes of the LLADDR.

You are conflating IFLA_ADDRESS and IFLA_VF_MAC.

IFLA_VF_MAC could be 8 byte and IFLA_ADDRESS could be 20, I think that
makes no sense, but it wouldn't break existing stuff.



Just to make sure we're on the same page, this thread deals with using 
rtnetlink's IFLA_VF_MAC(== struct ifla_vf_mac) for provisioning vGUID 
for IB VFs, through the PF IPoIB interface, not attempting to use 
IFLA_ADDRESS.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: rdma kernel tree

On Wed, May 27, 2015 at 4:13 PM, Doug Ledford  wrote:
> On Wed, 2015-05-27 at 12:50 +0300, Or Gerlitz wrote:
>> Doug,
>>
>> Do you have everything in place such that what you stage for upstream
>> is subject to linux-next auto merge tests and such?
>>
>> Or.
>
> No, I have 0day in place, but not linux-next to my knowledge (unless
> Stephen already set it up, but I haven't asked him to yet).

Oh, please rush to make sure this is in place,

>
> Stephen, can you set it up so that the linux-next tag from
> git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git is added
> to the linux-next testing please?
>
> --
> Doug Ledford 
>   GPG KeyID: 0E572FDD
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

On Wed, May 27, 2015 at 9:48 PM, Jason Gunthorpe
 wrote:
> On Wed, May 27, 2015 at 02:54:12PM +0300, Or Gerlitz wrote:
>> On Wed, May 27, 2015 at 1:07 AM, Jason Gunthorpe
>>  wrote:
>> > On Tue, May 26, 2015 at 11:39:04PM +0300, Or Gerlitz wrote:
>> >> >> Jason, you (U2 BTW) play really, really hard - refusing to say **one**
>> >> >> word on your approach towards the built-in udata mechanism for uverbs
>> >> >> which I asked you to comment on.
>> >>
>> >> > And I asked to see the user space side and you have angrily refused
>> >> > every time.
>> >>
>> >> AFAIR I never ever refused to show any piece of code which went under
>> >> my hands towards Linux to any-one.
>>
>> > For future reference, when someone asks a question and you go off on
>> > a tangential rant and ignore the question, then that process repeats,
>> > still without answering the question - most english speakers would
>> > call that refusing to answer the question. It is not looked upon kindly.
>>
>> Jason,
>>
>> It's not that you asked to see the code ala "hey, do you happen to
>> have a git with the user space code for people to inspect while
>> doing the review on the kernel part", but rather U2 saying in a
>> definitive manner that posting the user space code should be imposed
>> as pre-requirement to acceptance of the kernel parts.
>
> I really didn't Or:
>
> First ask:
>  'We can't really look at the uapi changes here without also seeing the
>   verbs side changes.'
>  (I know others on the list feel the same, so I use 'we')
>
> Second ask:
>  'Lets see the verbs side and I'll let you know.'
> (.. to your questions based on my review comments ..)
>
> Third ask:
>  'I think we need to have the same policy.'
> (.. To Yann's point that other kernel communities have a mandatory
>   UAPI policy)
>
> Fourth (exasperated) ask:
>  'Was it not clear? Yann and I asked to see the user space side before
>   reviewing this series further.'
>
> I know you are ESL, and I cut you a lot of slack, but *come on* - that
> is incredibly soft language, and certainly not bossing and imposing in
> a definitive manner a blanket requirement on all patches.

Jason,

ESL indeed am I, and in that respect, this clarification, even if
being tedious to set or read, helps.

Still, I'd like to try further to show you where the bossing thing came from:

(1) "show me the user space code prior to acceptance of the kernel
part" never was a requirement on this community since the day we were
born (Q4/2004)

(2) instantly rejecting a usage of a mechanism existing just for that
use case since the first year of our life (2005)

is something perceived by me as two people (that's the U2) that come
and say, "game's over, the old, non-functioning boss is gone, new boss
(== we) in town and forget about everything you knew before".

So you say that it was a wrong perception; I hope so. Let's see how Doug
sees your feedback, namely either as community reviewer feedback or as
new rules being set overnight, waiting.

This series is (1) simple compared to other stuff being reviewed here
nowadays, and (2) has very nice value to latency sensitive
applications, so two wins, lets get it done.

> Advice: You would be well served to spend a bit more time on your
> emails. I have no idea what 'but rather U2 saying' means, for
> instance. Sometimes I just guess at what you are trying to say :|

point taken

> That is the price we pay for an inclusive international community, but
> everyone needs to be careful before starting a flame war based on
> perceived slight in the text and phrasing of a message. email is hard.

point taken. In this case (as you can see from my response above) I am
not convinced yet that this was false positive.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH V3 for-next 1/3] IB/uverbs: Enable device removal when there are active user space applications

On Wed, May 27, 2015 at 8:43 PM, Jason Gunthorpe
 wrote:
> On Wed, May 27, 2015 at 12:04:29PM -0400, Doug Ledford wrote:
>> On Mon, 2015-05-25 at 10:54 -0600, Jason Gunthorpe wrote:
>> > On Wed, May 13, 2015 at 02:10:36PM +0300, Yishai Hadas wrote:
>> >
>> > > + struct srcu_struct  disassociate_srcu;
>> >
>> > There is no need for rcu for this, use a rw sem.
>>
>> The rcu was used because it's on the hot path I assume.
>
> Perhaps, I looked at that a bit, it was used on syscall paths, but
> that wasn't even the big reason I made the comment..

Doug, what hot path do we have in uverbs?! IB's stack hot path goes from
user-space to the HW, right?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RFC 0/3] Support standard SRIOV configuration for IB VFs

On Wed, May 27, 2015 at 8:53 PM, Doug Ledford  wrote:

> On Wed, 2015-05-27 at 11:11 -0600, Jason Gunthorpe wrote:
> Well, let's just be clear: netlink/iproute2 screwed the pooch on their
> implementation of this stuff.  Any solution that doesn't include fixing
> this up in some way is not really a good solution.

>>  For instance iproute2 would need IB specific code to format
>> the 'ip link show' (review print_vfinfo in iproute2) and to length
>> check 'ip link set vf mac'

>> If we do use 8, then it would be ideal (and my strong preference) to
>> also fix the IFLA_VF_MAC message to have a working length. I think
>> that could be done compatibly with a bit of work. At least that way
>> iproute2 can be kept clean when it learns to do IB, and we could have
>> the option again of using 20 someday if we need.
>>

> Sounds like a reasonable plan.
> Or, this is your patch set, are you going to pick up these action items?

Let see

>> So to be clear, to go with the 8 byte option I suggest:

>>  - Engage netdev/iproute and confirm they are philosophically OK
>>with IFLA_VF_MAC != IFLA_ADDRESS

the last thing netdev are is having philosophical views, they are very
practical (a non-perfect, happy and lively community); the next-to-last
thing netdev does is care about the rdma subsystem. If
something has to change in their part directly, we should come with
patches.

>>  - Make a kernel patch to properly size the IFLA_VF_MAC message

You mean the below structure which is expected by the kernel after
they see the IFLA_VF_MAC NL attribute

struct ifla_vf_mac {
 __u32 vf;
 __u8 mac[32]; /* MAX_ADDR_LEN */
 };

How did you think to patch things such that the size of the address
provided by user-space will propagate into the kernel w/o breaking the
NL ABI here?

Why not just take the eight lower bytes and set them NL --> ipoib -->  PF driver

>>  - Make a iproute patch to use the IFLA_VF_MAC size in print_vfinfo
>>instead of hardcoded ETH_ALEN (using len == 32 mean len 6 for compat)

I was thinking to patch iproute to sense the link type: if eth print
six bytes, if ipoib print 8 bytes, simple.

>>  - Drop in the IB patch

sounds good.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next 09/10] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device


On 5/28/2015 1:21 AM, Jason Gunthorpe wrote:

exists to support a unique feature of a single hardware vendor that few 
understand the use case for


Responding in EIM (End In Mind) manner

The use case is very clear, low latency applications using UD or RAW 
PACKET QPs that needs to know the time it takes for different HW/SW 
layers to get their packets through. The verbs version of SO_TIMESTAMP 
and friends (Documentation/networking/timestamping*)-- Christoph, can 
you add some info on common use-cases for this?


I bet that > 20 upstream Eth NIC drivers supports time-stamping, so 
there's no reason that a modern HCA will not support it too.



It is 11 patches, long, introduces several UAPI changes,


 1  IB/core: Change provider's API of create_cq to be extendible
 2  IB/core: Change ib_create_cq to use struct ib_cq_init_attr
 3  IB/core: Add CQ creation time-stamping flag
 4  IB/core: Extend ib_uverbs_create_cq
 5  IB/core: Add timestamp_mask and hca_core_clock to query_device
 6  IB/core: Pass hardware specific data in query_device
 7  IB/mlx4: Add mmap call to map the hardware clock
 8  IB/mlx4: Support extended create_cq and query_device uverbs
 9  IB/mlx4: Add support for timestamp in cq creation
10  IB/mlx4: Add timestamp_mask and hca_core_clock to query_device
11  IB/mlx4: Return hca core clock's offset in query_device vendor's data


01-02  just cosmetics that don't add any new functionality
03     adding CQ creation flag to the kernel verbs
04     new uverbs API to extend CQ creation
05     extending uverbs query device to return two more values
06     small fix to missing udata mechanics in uverbs query device
07-11  mlx4 provider side of the CQ setup and clock mmapping to user-space

the core of the review should be around the 03-06 zone, and with experts 
such as
Yann (and you) the uverbs part shouldn't be too complex to review and 
fix if needed.



does not implement a standardized feature,


This is standard in Eth NIC, return the time-stamp of when the packet 
arrived/sent




adds new uses of latent kernel functions


ESL I am

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 00/14] IB/mad: Add support for OPA MAD processing.


On 5/20/2015 11:13 AM, ira.we...@intel.com wrote:

Ira Weiny (14):
   IB/mad: Clean up ib_find_send_mad
   IB/mad: Create an RMPP Base header
   IB/mad: Create handle_ib_smi
   IB/mad: Add helper function for smi_handle_dr_smp_send
   IB/mad: Add helper function for smi_handle_dr_smp_recv
   IB/mad: Add helper function for smi_check_forward_dr_smp
   IB/mad: Add base version to ib_create_send_mad
   IB/core: Add rdma_max_mad_size helper
   IB/mad: Convert allocations from kmem_cache to kmalloc
   IB/mad: Add MAD size parameters to process_mad
   IB/core: Add rdma_cap_opa_mad helper
   IB/mad: Add partial Intel OPA MAD support
   IB/mad: Add partial Intel OPA MAD support
   IB/mad: Add final OPA MAD processing


Ira,

so again... patch title can be much better that

"Add helper function name_of_function_x" isn't some
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 00/14] IB/mad: Add support for OPA MAD processing.


On 5/20/2015 11:13 AM, ira.we...@intel.com wrote:

Ira Weiny (14):
   IB/mad: Clean up ib_find_send_mad
   IB/mad: Create an RMPP Base header
   IB/mad: Create handle_ib_smi
   IB/mad: Add helper function for smi_handle_dr_smp_send
   IB/mad: Add helper function for smi_handle_dr_smp_recv
   IB/mad: Add helper function for smi_check_forward_dr_smp
   IB/mad: Add base version to ib_create_send_mad

Ira,

Again... you should do better w.r.t patch titles --

"Add helper function name_of_function_x"

isn't the way to go, try

"Add helper function for functionality X"

no for "Add base version to function_x"

yes for "Add base version when doing X"

no for "Create function_x"
yes for "Add functionality X"

etc

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 for-next 00/12] Add network namespace support in the RDMA-CM


On 5/28/2015 5:07 PM, Doug Ledford wrote:

You would think that, but sometimes important information comes from
totally different places.  See mine and Jason's comments back and forth
in the SRIOV thread started by Or.

Long story short:

ip link add dev ib0 name ib0.1 type ipoib

is totally broken on at least all Red Hat OSes.  It will require
reworking of the network scripts and NetworkManager assumptions to make
it work.  It will also break DHCP on the interface as pkey/guid are the
only items that uniquely identify DHCP clients.  The net result of our
talks was that it is likely that each interface on the same pkey will
require an alias GUID per child interface in order to keep things workable.



Doug,

Just to make sure we're on the same page, you're saying that the IPoIB 
DHCP scheme (client + server) used on RH product uses Client-ID which is 
eight byte long or 20 byte long the four upper bytes masked out (which 
of them?) and hence is broken when multiple entities use the same ID.


Anything else except for that (you said "reworking of the network 
scripts and NetworkManager assumptions to make it work")??


On the one hand, we realized that the implementation for same-PKEY IPoIB
children, which has existed for a while, is broken with the RH DHCP scheme
and should be enhanced.  On the other hand, these children can serve as
nice building blocks for IPoIB containers or a virtio-IPoIB scheme.


Note that out of the eleven patches that make the series, only ONE 
relates directly to IPoIB, the rest are either applicable to all the 
transport supported by the RDMA stack, or to IPoIB + RoCE.


Under some assumptions and changes people can test it with DHCP scheme 
different from RH or with non-DHCP based IP address assignment scheme.


So we have a very nice effort and work done by developers, to bring RDMA 
into containers, accompanied by reviewers providing lots of their brain 
power to make it robust.


I don't see why we should stop the whole RDMA containers support train 
just b/c we found out the IPoIB DHCP bug which was there for few years 
before this effort started.


How about letting this series go in after the rest of the reviewers'
comments are addressed, such that under IPoIB it will work on a small set
of environments, while with the macvlan-based RoCE support to be
introduced later it will work on a wider set of environments.


Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 for-next 07/14] IB/core: GID attribute should be returned from verbs API and cache API

On 5/28/2015 7:07 PM, Jason Gunthorpe wrote:

Patch 8 (the ndev part) is relevant. GID is now related to a ndev and
>we would like
>to expose this information to the user.
>In non rdma-cm applications, how would a user select the gid_index he wants?

I don't mean drop forever, I mean, concentrate on getting this clean
up done, then start discussing UAPI changes separately. Please don't
bury UAPI changes, new features, etc in a cleanup patch series.

Jason,

I agree. This series can be perfectly made without UAPI changes, Matan 
can drop patch #8 and have user-space to just work as they did before, 
for both librdmacm and libibverbs consumers.

As for the RoCE GID table itself, adding in properly net-devices in 
their native Linux kernel form, namely with if_index and name-space -- 
seems to me the correct way to go. This for itself goes just a bit 
beyond refactoring, doesn't add special complexity which wasn't there 
before and has the advantage of doing things right and solid.

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 for-next 00/12] Add network namespace support in the RDMA-CM

On Thu, May 28, 2015 at 9:22 PM, Doug Ledford  wrote:

>> I don't think that is what Doug said.

> Indeed.  There is no need to scrap things, but if the design as it
> stands, and the intended means of creating objects for use in
> containers, is going to result in an unworkable network, then we have to
> re-evaluate how the container constructs are created, and that then has
> possible consequences for how we would get from an incoming packet to
> the proper container.

To be precise, do we agree that the issue here isn't "in the design as
it stands" but rather in a problem we found in the intended way of
assigning IP addresses through DHCP for the containers?

> I'm not trying to stop the "support train" here, but at the same time,
> if the train is headed for a bridge that's out

So what's your concrete suggestion here? Where should we go from here?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V1 00/11] Add completion timestamping support

2015-05-29 Thread Or Gerlitz

On Fri, May 29, Steve Wise  wrote:
> On 5/21/2015 9:56 AM, Or Gerlitz wrote:

>> A user-space application could get the current HW's clock by executing
>>
>> ibv_query_values_ex(struct ibv_context *context, uint32_t q_values,
>>  struct ibv_values_ex *values)
>>
>> The function gets a mask of the values to query and return their values.
>> Vendors could either implement this as a uverb command or use their
>> user-space driver to return those values directly from the HW (the mlx4
>> way).


> I'm just reviewing this now, and I haven't looked at the user side, but it
> appears the CQE timestamp is available for all devices to support or not in
> a generic manner.  IE it is part of the extended CQ UAPI.  But the task of
> fetching the current timestamp value/mask seems to be device-specific.

Steve,

Since there are no kernel consumers at this point, the series only
implemented the IB core kernel parts which are needed for proper usage
@ user-space, namely: CQ creation with a time-stamping flag, and
extension of the query device verb which returns the clock mask and
frequency, etc.

In user-space, applications would work against libibverbs, all
standard. They would query the device, create their CQs, get the
completion time-stamp from the CQE (ibv_wc) and the current clock
sample from a new verb ibv_query_values_ex which was mentioned above,
the user-space code is still in internal review, but here's a ref to
an early drop I posted here a couple of days ago


https://github.com/matanb10/libibverbs timestamp-v0
https://github.com/matanb10/libmlx4 timestamp-v0


> Shouldn't that also be a standard operation that devices can choose to
> support or not?  IE an application can generically set up a CQ and get
> timestamps on CQEs, but it seems the application would have to have
> device-specific code to get the current timestamp/mask.  Perhaps I'm not
> understanding the design?

No, nothing device specific from the application POV

see the ibv_query_values_ex introduction

https://github.com/matanb10/libibverbs/commit/89c439e34ab234d5d58bd92804829a0aa6efb3ca
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V1 10/11] IB/mlx4: Add timestamp_mask and hca_core_clock to query_device

2015-05-30 Thread Or Gerlitz

On Fri, May 29, 2015 at 7:41 PM, Doug Ledford  wrote:
> On Thu, 2015-05-21 at 17:56 +0300, Or Gerlitz wrote:
>> From: Matan Barak 
>>
>> mlx4 needs to report the number of supported timestamp
>> bits (mask) and the hca_core_clock frequency.
>>
>> Signed-off-by: Matan Barak 
>> Signed-off-by: Or Gerlitz 
>> ---
>>  drivers/infiniband/hw/mlx4/main.c |2 ++
>>  1 files changed, 2 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/infiniband/hw/mlx4/main.c 
>> b/drivers/infiniband/hw/mlx4/main.c
>> index ef211c8..a5a90c4 100644
>> --- a/drivers/infiniband/hw/mlx4/main.c
>> +++ b/drivers/infiniband/hw/mlx4/main.c
>> @@ -233,6 +233,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
>>   props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
>>  props->max_mcast_grp;
>>   props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
>> + props->hca_core_clock = dev->dev->caps.hca_core_clock;
>> + props->timestamp_mask = 0xULL;
>>
>>  out:
>>   kfree(in_mad);
>
> Or, please squash 8, 9, and 10 to a single patch.  They all are "add
> support for timestamping to mlx4" but without squashing them together,
> the support they add is broken/partial.

Sure, will do.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V2 0/9] Add completion timestamping support

Hi Doug,

This patchset adds completion timestamping support for verbs consumers.

Reviewing the weekend threads, we've changed the flag name to reflect
that this is completion time-stamp and folded the mlx4 actual support 
into one patch.

Regarding the related user-space support, it's possible to add what you
were suggesting, ibv_get_raw_cqe_timestamp() -- returns ts in cycles and 
ibv_get_cqe_timestamp() -- returns ts in ns, this makes the value returned
by the poll cq verb an opaque one that must go through one of the converters.

We would like to go for one helper ibv_get_timestamp(uint64_t raw_time, flag) which 
could get the raw time-stamp and one of the following flags: RAW_TIME, 
RAW_NS_TIME.

We think this would address the reviewer comments for the kernel submission.

The user-space code is in (still uses IB_CQ_FLAGS_TIMESTAMP and misses the 
conversion functions) 

 https://github.com/matanb10/libibverbs timestamp-v1
 https://github.com/matanb10/libmlx4 timestamp-v1

Timestamping is used by applications in order to know when a WQE was 
received/transmitted by the HW. The value is given in HCA hardware cycles,
but could be easily converted as the hardware's core clock frequency is 
available through extension of query device. 

Moreover, we add an ability to read the HCA's current clock. This could be 
useful in order to synchronize events to the wall clock.

This functionality is achieved by adding/extending the following verbs:

create_cq - create_cq is extended in order to allow passing creation flags
to the CQ creation function. We change IB/core --> vendors API
to be easily extendible by passing a struct which contains
comp_vectors, cqe and the new flags parameter. In order to create
CQ which supports timestamping, IB_CQ_FLAGS_TIMESTAMP_COMPLETION should be 
given.

query_device - We extend query_device uverb further by giving the hardware's
clock frequency and the timestamp mask (the number of timestamp
bits which are supported). If timestamp isn't supported, 0 is returned.

In order to read the timestamp in the WQE, the user needs to query the device 
for support, create an appropriate CQ (using the extended uverb with
IB_CQ_FLAGS_TIMESTAMP_COMPLETION) and poll the CQ with an extended poll_cq verb 
(currently,
only implemented in user-space).

In mlx4, allowing the user to read the core clock efficiently involves mapping
this area of the hardware to user-space (being done by using a mmap command)
and reading the clock from the correct offset of the page. 

This offset is returned in the vendor's specific data from mlx4's kernel driver 
to the mlx4's user-space driver. query_device is modified in order to support
passing this vendor specific data. A user-space application could use a new
verb in order to read the hardware's clock.

Translating the hardware's clock into ms could be done by dividing this
value by hca_core_clock (which is returned by the extended version of
query_device uverb).

A user-space application could get the current HW's clock by executing

ibv_query_values_ex(struct ibv_context *context, uint32_t q_values,
struct ibv_values_ex *values)

The function gets a mask of the values to query and return their values.
Vendors could either implement this as a uverb command or use their 
user-space driver to return those values directly from the HW (the mlx4 way).

Matan and Or.

Changes from V1:
 (1) fixed lustre IB's code build
 (2) squashed mlx4 V1 9-11 patches into one
 (3) changed IB_CQ_FLAGS_TIMESTAMP --> IB_CQ_FLAGS_TIMESTAMP_COMPLETION

Changes from V0:
(1) Pass ib_cq_init_attr instead of cqe and comp_vector.
(2) Fix unneeded indentation.
(3) Change flags to u32.
(4) Add const to create_cq's ib_cq_init_attr argument in vendor implementation.

Matan Barak (9):
  IB/core: Change provider's API of create_cq to be extendible
  IB/core: Change ib_create_cq to use struct ib_cq_init_attr
  IB/core: Add CQ creation time-stamping flag
  IB/core: Extend ib_uverbs_create_cq
  IB/core: Add timestamp_mask and hca_core_clock to query_device
  IB/core: Pass hardware specific data in query_device
  IB/mlx4: Add mmap call to map the hardware clock
  IB/mlx4: Support extended create_cq and query_device uverbs
  IB/mlx4: Add support for CQ time-stamping

 drivers/infiniband/core/device.c   |6 +-
 drivers/infiniband/core/mad.c  |5 +-
 drivers/infiniband/core/uverbs.h   |1 +
 drivers/infiniband/core/uverbs_cmd.c   |  188 
 drivers/infiniband/core/uverbs_main.c  |1 +
 drivers/infiniband/core/verbs.c|4 +-
 drivers/infiniband/hw/amso1100/c2_provider.c   |   14 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c|   19 ++-
 drivers/infiniband/hw/cxgb4/cq.c   |9 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |8 +-
 drivers/infiniband/hw/cxgb4/provider.c |8 +-
 drivers/infiniband/hw/ehca/ehca

[PATCH for-next V2 2/9] IB/core: Change ib_create_cq to use struct ib_cq_init_attr

From: Matan Barak 

Currently, ib_create_cq uses cqe and comp_vector instead
of the extendible ib_cq_init_attr struct.

Earlier patches already changed the vendors to work with
ib_cq_init_attr. This patch changes the consumers too.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/mad.c  |5 -
 drivers/infiniband/core/verbs.c|5 ++---
 drivers/infiniband/hw/ehca/ehca_main.c |6 +-
 drivers/infiniband/hw/mlx4/mad.c   |5 -
 drivers/infiniband/hw/mlx4/main.c  |5 -
 drivers/infiniband/hw/mlx5/main.c  |7 +--
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |9 +++--
 drivers/infiniband/ulp/iser/iser_verbs.c   |6 +-
 drivers/infiniband/ulp/isert/ib_isert.c|6 +-
 drivers/infiniband/ulp/srp/ib_srp.c|   10 --
 drivers/infiniband/ulp/srpt/ib_srpt.c  |5 -
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c|7 +--
 include/rdma/ib_verbs.h|6 ++
 net/9p/trans_rdma.c|5 -
 net/rds/ib_cm.c|8 ++--
 net/rds/iw_cm.c|8 ++--
 net/sunrpc/xprtrdma/svc_rdma_transport.c   |   10 ++
 net/sunrpc/xprtrdma/verbs.c|   10 ++
 18 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 600af26..ad3f729 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2923,6 +2923,7 @@ static int ib_mad_port_open(struct ib_device *device,
unsigned long flags;
char name[sizeof "ib_mad123"];
int has_smi;
+   struct ib_cq_init_attr cq_attr;
 
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
@@ -2943,9 +2944,11 @@ static int ib_mad_port_open(struct ib_device *device,
if (has_smi)
cq_size *= 2;
 
+   memset(&cq_attr, 0, sizeof(cq_attr));
+   cq_attr.cqe = cq_size;
port_priv->cq = ib_create_cq(port_priv->device,
 ib_mad_thread_completion_handler,
-NULL, port_priv, cq_size, 0);
+NULL, port_priv, &cq_attr);
if (IS_ERR(port_priv->cq)) {
dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f7615d4..a716ae2 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1075,12 +1075,11 @@ EXPORT_SYMBOL(ib_destroy_qp);
 struct ib_cq *ib_create_cq(struct ib_device *device,
   ib_comp_handler comp_handler,
   void (*event_handler)(struct ib_event *, void *),
-  void *cq_context, int cqe, int comp_vector)
+  void *cq_context, struct ib_cq_init_attr *cq_attr)
 {
struct ib_cq *cq;
-   struct ib_cq_init_attr attr = {.cqe = cqe, .comp_vector = comp_vector};
 
-   cq = device->create_cq(device, &attr, NULL, NULL);
+   cq = device->create_cq(device, cq_attr, NULL, NULL);
 
if (!IS_ERR(cq)) {
cq->device= device;
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c 
b/drivers/infiniband/hw/ehca/ehca_main.c
index 5e30b72..7727556 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -552,6 +552,7 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 
port)
struct ib_cq *ibcq;
struct ib_qp *ibqp;
struct ib_qp_init_attr qp_init_attr;
+   struct ib_cq_init_attr cq_attr;
int ret;
 
if (sport->ibcq_aqp1) {
@@ -559,7 +560,10 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 
port)
return -EPERM;
}
 
-   ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0);
+   memset(&cq_attr, 0, sizeof(cq_attr));
+   cq_attr.cqe = 10;
+   ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
+   &cq_attr);
if (IS_ERR(ibcq)) {
ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
return PTR_ERR(ibcq);
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 9cd2b00..462e728 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1773,6 +1773,7 @@ static int create_pv_resources(struct ib_device *ibdev, 
int slave, int port,
   int create_tun, struct m

[PATCH for-next V2 1/9] IB/core: Change provider's API of create_cq to be extendible

From: Matan Barak 

Add a new ib_cq_init_attr structure which contains the
previous cqe (minimum number of CQ entries) and comp_vector
(completion vector) in addition to a new flags field.
All vendors' create_cq callbacks are changed in order
to work with the new API.

This commit does not change any functionality.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/uverbs_cmd.c |6 --
 drivers/infiniband/core/verbs.c  |3 ++-
 drivers/infiniband/hw/amso1100/c2_provider.c |7 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |   11 ---
 drivers/infiniband/hw/cxgb4/cq.c |9 +++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h   |8 
 drivers/infiniband/hw/ehca/ehca_cq.c |7 ++-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |3 ++-
 drivers/infiniband/hw/ipath/ipath_cq.c   |9 +++--
 drivers/infiniband/hw/ipath/ipath_verbs.h|3 ++-
 drivers/infiniband/hw/mlx4/cq.c  |8 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |3 ++-
 drivers/infiniband/hw/mlx5/cq.c  |   10 --
 drivers/infiniband/hw/mlx5/main.c|3 ++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |5 +++--
 drivers/infiniband/hw/mthca/mthca_provider.c |8 ++--
 drivers/infiniband/hw/nes/nes_verbs.c|   11 ---
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |7 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |6 --
 drivers/infiniband/hw/qib/qib_cq.c   |   11 ---
 drivers/infiniband/hw/qib/qib_verbs.h|5 +++--
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |   10 +++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |7 ---
 include/rdma/ib_verbs.h  |   10 --
 24 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index a9f0489..1954ebb 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1341,6 +1341,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
struct ib_uverbs_event_file*ev_file = NULL;
struct ib_cq   *cq;
int ret;
+   struct ib_cq_init_attr attr = {};
 
if (out_len < sizeof resp)
return -ENOSPC;
@@ -1376,8 +1377,9 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
 
-   cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
-cmd.comp_vector,
+   attr.cqe = cmd.cqe;
+   attr.comp_vector = cmd.comp_vector;
+   cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
 file->ucontext, &udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 685a362..f7615d4 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1078,8 +1078,9 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
   void *cq_context, int cqe, int comp_vector)
 {
struct ib_cq *cq;
+   struct ib_cq_init_attr attr = {.cqe = cqe, .comp_vector = comp_vector};
 
-   cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+   cq = device->create_cq(device, &attr, NULL, NULL);
 
if (!IS_ERR(cq)) {
cq->device= device;
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c 
b/drivers/infiniband/hw/amso1100/c2_provider.c
index d396c39..a43e022 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -286,13 +286,18 @@ static int c2_destroy_qp(struct ib_qp *ib_qp)
return 0;
 }
 
-static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int 
vector,
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
  struct ib_ucontext *context,
  struct ib_udata *udata)
 {
+   int entries = attr->cqe;
struct c2_cq *cq;
int err;
 
+   if (attr->flags)
+   return ERR_PTR(-EINVAL);
+
cq = kmalloc(sizeof(*cq), GFP_KERNEL);
if (!cq) {
pr_debug("%s: Unable to allocate CQ\n", __func__);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 061ef08..2eaf7e8 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -138,10 +138,12 @@ static int iwch_destroy_cq(struct ib_cq

[PATCH for-next V2 4/9] IB/core: Extend ib_uverbs_create_cq

From: Matan Barak 

ib_uverbs_ex_create_cq follows the extension verbs
mechanism. New features (for example, CQ creation flags
field which is added in a downstream patch) could be used
via user-space libraries without breaking the ABI.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |  170 ++---
 drivers/infiniband/core/uverbs_main.c |1 +
 include/uapi/rdma/ib_user_verbs.h |   17 
 4 files changed, 154 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b716b08..ba365b6 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -259,5 +259,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 IB_UVERBS_DECLARE_EX_CMD(create_flow);
 IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
 IB_UVERBS_DECLARE_EX_CMD(query_device);
+IB_UVERBS_DECLARE_EX_CMD(create_cq);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 1954ebb..51311b1 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1330,41 +1330,37 @@ ssize_t ib_uverbs_create_comp_channel(struct 
ib_uverbs_file *file,
return in_len;
 }
 
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
-   const char __user *buf, int in_len,
-   int out_len)
+static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+  struct ib_udata *ucore,
+  struct ib_udata *uhw,
+  struct ib_uverbs_ex_create_cq *cmd,
+  size_t cmd_sz,
+  int (*cb)(struct ib_uverbs_file *file,
+struct ib_ucq_object *obj,
+struct 
ib_uverbs_ex_create_cq_resp *resp,
+struct ib_udata *udata,
+void *context),
+  void *context)
 {
-   struct ib_uverbs_create_cq  cmd;
-   struct ib_uverbs_create_cq_resp resp;
-   struct ib_udata udata;
struct ib_ucq_object   *obj;
struct ib_uverbs_event_file*ev_file = NULL;
struct ib_cq   *cq;
int ret;
+   struct ib_uverbs_ex_create_cq_resp resp;
struct ib_cq_init_attr attr = {};
 
-   if (out_len < sizeof resp)
-   return -ENOSPC;
-
-   if (copy_from_user(&cmd, buf, sizeof cmd))
-   return -EFAULT;
-
-   INIT_UDATA(&udata, buf + sizeof cmd,
-  (unsigned long) cmd.response + sizeof resp,
-  in_len - sizeof cmd, out_len - sizeof resp);
-
-   if (cmd.comp_vector >= file->device->num_comp_vectors)
-   return -EINVAL;
+   if (cmd->comp_vector >= file->device->num_comp_vectors)
+   return ERR_PTR(-EINVAL);
 
obj = kmalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
-   return -ENOMEM;
+   return ERR_PTR(-ENOMEM);
 
-   init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, 
&cq_lock_class);
+   init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, 
&cq_lock_class);
down_write(&obj->uobject.mutex);
 
-   if (cmd.comp_channel >= 0) {
-   ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+   if (cmd->comp_channel >= 0) {
+   ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel);
if (!ev_file) {
ret = -EINVAL;
goto err;
@@ -1377,10 +1373,14 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
 
-   attr.cqe = cmd.cqe;
-   attr.comp_vector = cmd.comp_vector;
+   attr.cqe = cmd->cqe;
+   attr.comp_vector = cmd->comp_vector;
+
+   if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
+   attr.flags = cmd->flags;
+
cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
-file->ucontext, &udata);
+file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
@@ -1399,14 +1399,15 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
goto err_free;
 
memset(&resp, 0, sizeof resp);
-   resp.cq_handle = obj->uobject.id;
-   resp.cqe   = cq-&g

[PATCH for-next V2 5/9] IB/core: Add timestamp_mask and hca_core_clock to query_device

From: Matan Barak 

In order to expose timestamp we need to expose two attributes
in query_device:
timestamp_mask - how many bits are valid in the timestamp.
 timestamp values can be at most 64 bits wide.
hca_core_clock - timestamp is given in HW cycles, hca_core_clock
 is the frequency of the HCA and is necessary in
 order to convert cycles to seconds.

This is added both to ib_query_device and its respective uverbs
counterpart.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/device.c |2 ++
 drivers/infiniband/core/uverbs_cmd.c |   14 ++
 include/rdma/ib_verbs.h  |2 ++
 include/uapi/rdma/ib_user_verbs.h|2 ++
 4 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8d07c12..568cb41 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -539,6 +539,8 @@ EXPORT_SYMBOL(ib_dispatch_event);
 int ib_query_device(struct ib_device *device,
struct ib_device_attr *device_attr)
 {
+   memset(device_attr, 0, sizeof(*device_attr));
+
return device->query_device(device, device_attr);
 }
 EXPORT_SYMBOL(ib_query_device);
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 51311b1..11ee298 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3426,6 +3426,8 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
if (ucore->outlen < resp.response_length)
return -ENOSPC;
 
+   memset(&attr, 0, sizeof(attr));
+
err = device->query_device(device, &attr);
if (err)
return err;
@@ -3450,6 +3452,18 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file 
*file,
 #endif
resp.response_length += sizeof(resp.odp_caps);
 
+   if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
+   goto end;
+
+   resp.timestamp_mask = attr.timestamp_mask;
+   resp.response_length += sizeof(resp.timestamp_mask);
+
+   if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
+   goto end;
+
+   resp.hca_core_clock = attr.hca_core_clock;
+   resp.response_length += sizeof(resp.hca_core_clock);
+
 end:
err = ib_copy_to_udata(ucore, &resp, resp.response_length);
if (err)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 62e88e1..a45f674 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -227,6 +227,8 @@ struct ib_device_attr {
int sig_prot_cap;
int sig_guard_cap;
struct ib_odp_caps  odp_caps;
+   uint64_ttimestamp_mask;
+   uint64_thca_core_clock;
 };
 
 enum ib_mtu {
diff --git a/include/uapi/rdma/ib_user_verbs.h 
b/include/uapi/rdma/ib_user_verbs.h
index 19f0256..f9b8843 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -223,6 +223,8 @@ struct ib_uverbs_ex_query_device_resp {
__u32 comp_mask;
__u32 response_length;
struct ib_uverbs_odp_caps odp_caps;
+   __u64 timestamp_mask;
+   __u64 hca_core_clock;
 };
 
 struct ib_uverbs_query_port {
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V2 9/9] IB/mlx4: Add support for CQ time-stamping

From: Matan Barak 

This includes:

* support allocation of CQ with the TIMESTAMP_COMPLETION creation flag.

* add timestamp_mask and hca_core_clock to query_device, reporting the
  number of supported timestamp bits (mask) and the hca_core_clock frequency.

* return hca core clock's offset in query_device vendor's data,
  this is needed in order to read the HCA's core clock.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/cq.c  |9 +--
 drivers/infiniband/hw/mlx4/main.c|   38 -
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   16 ++
 3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 8e44aaa..36eb3d0 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -166,6 +166,7 @@ err_buf:
return err;
 }
 
+#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_FLAGS_TIMESTAMP_COMPLETION
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
@@ -178,10 +179,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
struct mlx4_uar *uar;
int err;
 
-   if (attr->flags)
+   if (entries < 1 || entries > dev->dev->caps.max_cqes)
return ERR_PTR(-EINVAL);
 
-   if (entries < 1 || entries > dev->dev->caps.max_cqes)
+   if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
return ERR_PTR(-EINVAL);
 
cq = kmalloc(sizeof *cq, GFP_KERNEL);
@@ -194,6 +195,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
spin_lock_init(&cq->lock);
cq->resize_buf = NULL;
cq->resize_umem = NULL;
+   cq->create_flags = attr->flags;
INIT_LIST_HEAD(&cq->send_qp_list);
INIT_LIST_HEAD(&cq->recv_qp_list);
 
@@ -237,7 +239,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
vector = dev->eq_table[vector % ibdev->num_comp_vectors];
 
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-   cq->db.dma, &cq->mcq, vector, 0, 0);
+   cq->db.dma, &cq->mcq, vector, 0,
+   !!(cq->create_flags & 
IB_CQ_FLAGS_TIMESTAMP_COMPLETION));
if (err)
goto err_dbmap;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 832d571..c642bf0 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -140,10 +140,27 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_smp *out_mad = NULL;
int err = -ENOMEM;
int have_ib_ports;
+   struct mlx4_uverbs_ex_query_device cmd;
+   struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0};
+   struct mlx4_clock_params clock_params;
 
-   if (uhw->inlen || uhw->outlen)
-   return -EINVAL;
+   if (uhw->inlen) {
+   if (uhw->inlen < sizeof(cmd))
+   return -EINVAL;
+
+   err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd));
+   if (err)
+   return err;
+
+   if (cmd.comp_mask)
+   return -EINVAL;
+
+   if (cmd.reserved)
+   return -EINVAL;
+   }
 
+   resp.response_length = offsetof(typeof(resp), response_length) +
+   sizeof(resp.response_length);
in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
if (!in_mad || !out_mad)
@@ -233,7 +250,24 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
   props->max_mcast_grp;
props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
+   props->hca_core_clock = dev->dev->caps.hca_core_clock;
+   props->timestamp_mask = 0xULL;
 
+   err = mlx4_get_internal_clock_params(dev->dev, &clock_params);
+   if (err)
+   goto out;
+
+   if (uhw->outlen >= resp.response_length + 
sizeof(resp.hca_core_clock_offset)) {
+   resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE;
+   resp.response_length += sizeof(resp.hca_core_clock_offset);
+   resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP;
+   }
+
+   if (uhw->outlen) {
+   err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+   if (err)
+   goto out;
+   }
 out:
kfree(in_mad);
kfree(out_mad);
diff --git a/drivers/infiniband/hw/mlx

[PATCH for-next V2 6/9] IB/core: Pass hardware specific data in query_device

From: Matan Barak 

Vendors should be able to pass vendor specific data to/from
user-space via query_device uverb. In order to do this,
we need to pass the vendors' specific udata.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/device.c |4 +++-
 drivers/infiniband/core/uverbs_cmd.c |2 +-
 drivers/infiniband/hw/amso1100/c2_provider.c |7 +--
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |8 ++--
 drivers/infiniband/hw/cxgb4/provider.c   |8 ++--
 drivers/infiniband/hw/ehca/ehca_hca.c|6 +-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |3 ++-
 drivers/infiniband/hw/ipath/ipath_verbs.c|7 +--
 drivers/infiniband/hw/mlx4/main.c|6 +-
 drivers/infiniband/hw/mlx5/main.c|9 +++--
 drivers/infiniband/hw/mthca/mthca_provider.c |7 +--
 drivers/infiniband/hw/nes/nes_verbs.c|6 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |6 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |3 ++-
 drivers/infiniband/hw/qib/qib_verbs.c|6 --
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |6 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |3 ++-
 include/rdma/ib_verbs.h  |3 ++-
 18 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 568cb41..694bd66 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -539,9 +539,11 @@ EXPORT_SYMBOL(ib_dispatch_event);
 int ib_query_device(struct ib_device *device,
struct ib_device_attr *device_attr)
 {
+   struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+
memset(device_attr, 0, sizeof(*device_attr));
 
-   return device->query_device(device, device_attr);
+   return device->query_device(device, device_attr, &uhw);
 }
 EXPORT_SYMBOL(ib_query_device);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 11ee298..bbb02ff 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3428,7 +3428,7 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 
memset(&attr, 0, sizeof(attr));
 
-   err = device->query_device(device, &attr);
+   err = device->query_device(device, &attr, uhw);
if (err)
return err;
 
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c 
b/drivers/infiniband/hw/amso1100/c2_provider.c
index a43e022..382f109 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -63,13 +63,16 @@
 #include "c2_provider.h"
 #include "c2_user.h"
 
-static int c2_query_device(struct ib_device *ibdev,
-  struct ib_device_attr *props)
+static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+  struct ib_udata *uhw)
 {
struct c2_dev *c2dev = to_c2dev(ibdev);
 
pr_debug("%s:%u\n", __func__, __LINE__);
 
+   if (uhw->inlen || uhw->outlen)
+   return -EINVAL;
+
*props = c2dev->props;
return 0;
 }
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 2eaf7e8..c4b5936 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1150,13 +1150,17 @@ static u64 fw_vers_string_to_u64(struct iwch_dev 
*iwch_dev)
   (fw_mic & 0x);
 }
 
-static int iwch_query_device(struct ib_device *ibdev,
-struct ib_device_attr *props)
+static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+struct ib_udata *uhw)
 {
 
struct iwch_dev *dev;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
 
+   if (uhw->inlen || uhw->outlen)
+   return -EINVAL;
+
dev = to_iwch_dev(ibdev);
memset(props, 0, sizeof *props);
memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c 
b/drivers/infiniband/hw/cxgb4/provider.c
index ef08a9f..05a96a5 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -301,13 +301,17 @@ static int c4iw_query_gid(struct ib_device *ibdev, u8 
port, int index,
return 0;
 }
 
-static int c4iw_query_device(struct ib_device *ibdev,
-struct ib_device_attr *props)
+static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+struct ib_udata *uhw)
 {
 
struct c4iw_dev *dev;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
 
+   if (uhw->inlen |

[PATCH for-next V2 8/9] IB/mlx4: Support extended create_cq and query_device uverbs

From: Matan Barak 

Add support for ib_uverbs_ex_create_cq and ib_uverbs_ex_query_device
by setting the appropriate bit in uverbs_ex_cmd_mask.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 3992349..832d571 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -2323,6 +2323,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
}
 
+   ibdev->ib_dev.uverbs_ex_cmd_mask |=
+   (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) |
+   (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ);
+
mlx4_ib_alloc_eqs(dev, ibdev);
 
spin_lock_init(&iboe->lock);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V2 3/9] IB/core: Add CQ creation time-stamping flag

From: Matan Barak 

Add CQ creation flag which dictates that the created CQ will report
completion time-stamp value in the WC.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 include/rdma/ib_verbs.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 321a0b3..62e88e1 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -173,6 +173,10 @@ struct ib_odp_caps {
} per_transport_caps;
 };
 
+enum ib_cq_creation_flags {
+   IB_CQ_FLAGS_TIMESTAMP_COMPLETION   = 1 << 0,
+};
+
 struct ib_cq_init_attr {
unsigned intcqe;
int comp_vector;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V2 7/9] IB/mlx4: Add mmap call to map the hardware clock

From: Matan Barak 

In order to read the HCA's cycle counter efficiently in
user space, we need to map the HCA's register.
This is done through mmap call.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c |   18 +-
 drivers/net/ethernet/mellanox/mlx4/main.c |   19 +++
 include/linux/mlx4/device.h   |9 +
 3 files changed, 45 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 914beae..3992349 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -716,8 +716,24 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, 
struct vm_area_struct *vma)
   dev->dev->caps.num_uars,
   PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
-   } else
+   } else if (vma->vm_pgoff == 3) {
+   struct mlx4_clock_params params;
+   int ret = mlx4_get_internal_clock_params(dev->dev, &params);
+
+   if (ret)
+   return ret;
+
+   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+   if (io_remap_pfn_range(vma, vma->vm_start,
+  
(pci_resource_start(dev->dev->persist->pdev,
+  params.bar) +
+   params.offset)
+  >> PAGE_SHIFT,
+  PAGE_SIZE, vma->vm_page_prot))
+   return -EAGAIN;
+   } else {
return -EINVAL;
+   }
 
return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index ced5eca..70de39c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1674,6 +1674,25 @@ static int map_internal_clock(struct mlx4_dev *dev)
return 0;
 }
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+  struct mlx4_clock_params *params)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+
+   if (mlx4_is_slave(dev))
+   return -ENOTSUPP;
+
+   if (!params)
+   return -EINVAL;
+
+   params->bar = priv->fw.clock_bar;
+   params->offset = priv->fw.clock_offset;
+   params->size = MLX4_CLOCK_SIZE;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params);
+
 static void unmap_internal_clock(struct mlx4_dev *dev)
 {
struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab..f94984f 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -829,6 +829,12 @@ struct mlx4_dev {
struct mlx4_vf_dev *dev_vfs;
 };
 
+struct mlx4_clock_params {
+   u64 offset;
+   u8 bar;
+   u8 size;
+};
+
 struct mlx4_eqe {
u8  reserved1;
u8  type;
@@ -1485,4 +1491,7 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
 enum mlx4_access_reg_method method,
 struct mlx4_ptys_reg *ptys_reg);
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+  struct mlx4_clock_params *params);
+
 #endif /* MLX4_DEVICE_H */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 0/9] Add completion timestamping support


On 5/31/2015 3:14 PM, Or Gerlitz wrote:

We would to go for one helper


Would like to go for one helper
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 0/9] Add completion timestamping support

2015-06-02 Thread Or Gerlitz


On 6/2/2015 5:35 PM, Doug Ledford wrote:

On Mon, 2015-06-01 at 10:43 -0600, Jason Gunthorpe wrote:

On Mon, Jun 01, 2015 at 07:25:04AM -0400, Doug Ledford wrote:


attempted abstraction of ibverbs.  Passing in the wc struct allows the
driver to internally allocate a wc struct with extra private elements
and pass that back to the user, when the user passes it back to
ibv_get_timestamp the elements are there in the private portion of the
struct.

wc structures are allocated by the caller, there is no option for the
driver to create private elements.

Well, they *are* using an extended work completion structure.  Unlike
what I mentioned, where they create a larger one themselves, you have to
allocate a struct ibv_wc_ex instead of a struct ibv_wc and then you have
to call poll_cq_ex, which expects a struct ibv_wc_ex.

So, just so everyone is clear on this point: the current user space
implementation of this feature creates an unversioned, newly named
ibv_wc_ex struct that is ibv_wc with a 64bit timestamp tacked on at the
end (not 64bit aligned either).  If we ever wanted to have a different
extension to our ibv_wc struct, there is no good way to do that.  If, at
some point, we had multiple extension and the user was able to select
which they wanted to utilize, this structure extension is not flexible
enough to deal with that.  At a minimum, if we are going to have a one
shot extension to the wc struct like this, I would prefer to see it
called struct ibv_wc_timestamp and there be a ibv_poll_cq_timestamp.  At
least that way people would not use the generic _ex and assume this is
the one and only _ex that we will ever need for work completions.

Jason, when the XRC and flow steering extensions were added to
libibverbs, you complained loudly that they were not added in the agreed
upon format and cited a previous on list discussion.  Do you have a link
to that discussion?


Doug,

Do we agree that this part of the discussion (and also the below point) 
are related to the libibverbs API to applications and not to the kernel 
-> user API to support time-stamping?


Or.




AFAIK, Christoph's use case is essentially the only meaningful use
case for this feature, generalizing too much may destroy the
performance that is valuable here.

There is actually room in a 64byte cacheline for two 64bit timestamps
and another 2 bytes of padding or something else.





--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 8/9] IB/mlx4: Support extended create_cq and query_device uverbs

On Wed, Jun 3, 2015 at 7:31 PM, Jason Gunthorpe
 wrote:
> On Wed, Jun 03, 2015 at 11:57:12AM +0300, Matan Barak wrote:
>> That's a general comment regarding the extension mechanism.
>
> Yes, but it is also a specific comment about patch #4 which adds,
> ib_uverbs_ex_create_cq.
>
> Based on the implementation of create_cq, it is pretty clear that
> every driver supports ib_uverbs_ex_create_cq, so patch #4 should just
> force the flag in the device register function so it is globally enabled.

But the other drivers currently do not support any CQ creation flag
and hence no extended functionality, I don't see the point signaling
towards user-space that the verb is supported, please elaborate.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 for-next 00/12] Add network namespace support in the RDMA-CM

On Wed, Jun 3, 2015 at 7:14 PM, Jason Gunthorpe
 wrote:
> On Wed, Jun 03, 2015 at 01:03:01PM +0300, Haggai Eran wrote:
>> > Then I'm inclined to say that we should map for namespaces using device,
>> > port, guid/gid, pkey.  And in this situation, since a unique guid/gid on
>> > any given pkey maps to a unique dhcp identifier and a unique ipv6
>> > lladdr, this becomes freely interchangeable with device, port, pkey,
>> > address mappings that this patchset was built around.
>>
>> What if we change the namespaces patches to map (device, port, GID,
>> P_Key, IP) to netdev / namespace? That is, to use both the GID and the
>> IP address.
>
> As I keep saying, you are not supposed to use the IP address as a key
> to find the netdev, that is the wrong way to use the Linux netdev
> model.
>
> Requiring unique GID/PKey allows the implementation to avoid this
> wrongness, which would be simplifying and more correct.
>
> That is the appeal to blocking this scenario when children are created.

Jason,

The IPoIB RTNL childs were added around release 3.6/7 of the upstream
kernel and are part of the kernel UAPI. They are perfectly used in
bunch of schemes:

1.  when static IP address assignment is used

2. under PV scheme, when the guest has para-virtual Eth NIC and the
host does routing between the back-end (e.g tap or alike) and  the
IPoIB child. Or when the host does tunneling (vxlan) and alike and
sends down the encapsulated packet through a host IP address assigned
to the IPoIB child

3. etc few more

Indeed the DHCP story isn't working there and to get DHCP work
something has to be done. But this issue can't serve for blocking the
existing UAPI and introduce regression to working systems.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 8/9] IB/mlx4: Support extended create_cq and query_device uverbs

On Wed, Jun 3, 2015 at 10:16 PM, Jason Gunthorpe
 wrote:
> On Wed, Jun 03, 2015 at 09:58:25PM +0300, Or Gerlitz wrote:
>> On Wed, Jun 3, 2015 at 7:31 PM, Jason Gunthorpe
>>  wrote:
>> > On Wed, Jun 03, 2015 at 11:57:12AM +0300, Matan Barak wrote:
>> >> That's a general comment regarding the extension mechanism.
>> >
>> > Yes, but it is also a specific comment about patch #4 which adds,
>> > ib_uverbs_ex_create_cq.
>> >
>> > Based on the implementation of create_cq, it is pretty clear that
>> > every driver supports ib_uverbs_ex_create_cq, so patch #4 should just
>> > force the flag in the device register function so it is globally enabled.

>> But the other drivers currently do not support any CQ creation flag
>> and hence no extended functionality, I don't see the point signaling
>> towards user-space that the verb is supported, please elaborate.

> They support the base functionality, the flags = 0 case.

which doesn't let consumers use any new functionality.

> There is no reason to block access to the base functionality via the
> extended api. That just creates hassles for userspace.
> If userspace detects the extended API is present, it can just
> switch unconditionally all usage to that API.

This is user-space run time story, they don't have the knowledge that
all the LL drivers supports the extended api for CQ creation. We had
to check the flag and in all LL drivers since the in-kernel IB stack
has no (and need not to have any) notion of extended calls.

> This is how most new kernel syscalls are introduced (glibc
> does this transparently).

That's an interesting comment. And you know what, basically we can add
auto support for that call in uverbs.

But the point here is a bit different: I somehow have the feeling that
unless ~each and every one of your review comments are accepted to the
letter, no inclusion.

You are not the maintainer here, and even maintainers prefer not to
force each of their detailed comments on submitters.

This specific comment relates to a TINY in-kernel thing that can be changed later.

If from ten comments you give me I accept as is five, with the other
five I am trying to argue, on two of them we agree to my side, on two
we go your side and on the last one we let the maintainer to cut, this
is a healthy process that makes sense.

Currently it feels like either accepting 98% of the comments you
give or no acceptance.

> Detecting what flags a driver supports (if any) is any entirely
> different and orthogonal issue to introducing comp_mask/etc.

I didn't say that the which flags are supported detection relates to
exposing that extended uverbs call. I don't understand the "is any
entirely different" part of the sentence, is that as of me being
EMS-er?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 0/9] Add completion timestamping support

On Tue, Jun 2, 2015 at 9:08 PM, Jason Gunthorpe
 wrote:

> Or, the question in my mind based on looking at the UAPI patches is
> what things should be driver private and what should be general.
>
> Broadly my thoughts:
>  - Should the frequency and mask be general, or driver private? If the
>cycles->ns conversion is a function they should be driver private.
>Even if they are general at libibverbs, they don't *have* to be in
>the kernel's general query response.

If they are general in libibverbs, what's the point not to put them in
the kernel's general query response?

>  - Should frequency even be frequency? Most clocks are expressed
>accurately as a period in picoseconds. Frequency is more often
>imprecise. (eg ethernet is 3200 ps or 312.5MHz)
>However FDR/EDR is fractional for both (4693.3333 ps vs
>213.0681818181818 MHz)
>Precision is very important for time conversions, so a
>multiply-divide scheme would be ideal.

From Christoph's response I got the impression that our proposal of
exposing frequency and mask combined with raw time stamps excellently
fits typical user needs, so I thought we're good. Doug made a comment
that things look OK to him and the rest of the work would be when we
come to review the user-space patches.

>This is suggesting to me these details really are not general.
>  - There should be much better definition on what all this stuff is,
>units for frequency? When is the timestamp applied?

The timestamp is applied when the WC is generated, as Doug asked, we
changed the flag name to reflect that. I guess that the units for
frequency are MHz but I will check that and we can document it in the
kernel IB core patch and later in man pages.

>  - Should an app even be exposed to mask? This is very difficult
>to use correctly in the general case. Only cases where an app is
>restarted more often than a wrap period are trivial to use properly.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4 for-next 00/12] Add network namespace support in the RDMA-CM

On Wed, Jun 3, 2015 at 10:53 PM, Jason Gunthorpe
 wrote:
> On Wed, Jun 03, 2015 at 10:05:34PM +0300, Or Gerlitz wrote:
>
>> Indeed the DHCP story isn't working there and to get DHCP work
>> something has to be done. But this issue can't serve for blocking the
>> existing UAPI and introduce regression to working systems.
>
> It is not DHCP that concerns me, it is the fact we can't combine net
> namespaces, RDMA-CM and duplicate GUID IPoIB children together without
> adding hacks to the kernel. Searching netdevs by IP is a hack.
>
> I'm mostly fine with it as an optional capability, similar to macvlan,
> I just don't see how to cleanly integrate it with RDMA CM and
> namespaces. And I don't see what RDMA CM is supposed to do when
> it hits this case.
>
> So, any ideas that don't involve the searching for IP hack??
>
> [And yes, as discussed with Haggie, it is not the worst hack in the
>  world, and maybe we can live with it, but lets understand the trade
>  offs carefully]

As Haggai wrote, if we let the using IP address thing to fly up, we have
support for RDMA in containers using the RDMA-CM at IPoIB environments.
This will let people test, use, experiment, fix, interact (and even
production-it when static IP address assignment scheme is used).

Later, usage of alias GUIDs for IPoIB RTNL childs would allow to
remove the IP thing.

Later, the next stage/s in Matan's work on the RoCE GID table would
allow to support MACVLAN and hence RoCE too.

This is how the Linux kernel has evolved since the 2.5 failure to
come up with giant releases -- doing things in relatively small steps.

> Also, now that this has been brought up, I think you need to make a
> patch to fix the IPv6 SLAAC breakage this caused. It looks trivial to
> modify addrconf_ifid_infiniband to return error if the IPoIB child is
> sharing a guid. It was not good at all to push the child patches
> forward to 3.6/3.7 if you knew that IPv6 SLAAC was broken by them.

Till the alias GUID thing is introduced, maybe we can patch
addrconf_ifid_infiniband to use the QPN value from the device HW
address to come up with unique IPv6 link local address, agree? where
you think we can place the 24 bits QPN?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 0/9] Add completion timestamping support

2015-06-06 Thread Or Gerlitz

On Sat, Jun 6, 2015 at 6:45 PM, Doug Ledford  wrote:

>>> So no, I disagree that rough is fine for anything.

>> I am sorry but the practical issues that we are dealing with in
>> timekeeping today shows just the opposite. For a true comparison of clocks
>> with nanosecond accuracy you would need time corrected values and that is
>> a challenge due to the variances of the clocks that we see.

> Jason's point, and one that isn't addressed yet, is that this might not
> be variance in the clocks and instead might be a design flaw in the API
> you are using and the way the clock speeds are passed to user space.
> Changing from int MHz to int KHz might solve your problem.

OK, so if we have the UAPI to pass the clock frequency in KHz that
would put us in a better place? seems very much doable.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] IB/core: Don't advertise SA in RoCE port capabilities

2015-06-10 Thread Or Gerlitz


On 6/10/2015 8:35 AM, Moni Shoua wrote:

The Subnet Administrator (SA) is not a component of the RoCE spec.
Therefore, it should not be a capability of a RoCE port.

Change-Id: Iadfaa56bdc9f6e28f46d009064c2d15969293cf7


Please remove the internal Gerrit IDs we use prior to sending patches out


Signed-off-by: Moni Shoua


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V2 0/9] Add completion timestamping support

2015-06-10 Thread Or Gerlitz

On 6/10/2015 4:26 AM, Christoph Lameter wrote:

>I have no problem with a bare metal interface exposing this.  But pretending
>that it's generic and that this is the one and only way that this could be
>implemented doesn't make it so.

This is a way it was implemented and its usable. Shooting for pie in the
sky does not bring us anything. Nor ideas of requirements from a new
experimental API that does not support the basic features that we need
and seems to be on its way to mess up the latencies of access to RDMA 
operations.

Doug,

What's your maintainer say here?

The current proposal has:

1. raw HCA clock completion generation time-stamp for CQEs
2. HCA clock frequency in KHZ
3. mask telling how many bits are relevant from the 64bit time-stamp

This is fairly simple, practical and very much usable to FSI 
applications and users, and can be extended later if someone comes up 
with better/other combination of the frequency/mask. Have a GO?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH for-next V5 00/12] Move RoCE GID management to IB/Core

2015-06-10 Thread Or Gerlitz


On 6/9/2015 10:27 AM, Matan Barak wrote:



On 6/9/2015 12:37 AM, Hefty, Sean wrote:
Previously, every vendor implemented its net device notifiers in its 
own

driver. This introduces a huge code duplication as figuring




  28 files changed, 2253 insertions(+), 860 deletions(-)


How does adding 1400 lines of code help reduce code duplication?

Can you please explain and justify why this change is actually needed?



Let's look at this change from several perspectives:

(1) Each vendor lost ~250 lines of GID management code just by this 
change. In the future it's very probable that more vendor drivers will 
implement RoCE. This removes the burden and code duplication required 
by them to implement a full RoCE support and is a lot more scalable 
than the current approach.


(2) All vendors are now aligned. For example, mlx4 driver had bonding 
support but ocrdma didn't have such support. The user expects the same 
behavior regardless the vendor's driver.


(3) When making something more general it usually requires more lines 
of code as it introduces API and doesn't cut corners assuming anything 
on the vendor's driver.


(4) This is a prerequisite to the RoCE V2 series. I'm sure you 
remember we first submitted this patch-set as a part of the RoCE V2 
series. Adding more features to the RoCE GID management will make the 
code duplication a lot worse than just ~250 lines. I don't think it's 
fair playing "lets divide the RoCE V2 patch-set to several patch-sets" 
and then say "why do we need this  at all". Let alone, the 
other three reasons are more than enough IMHO.




Sean, this change is needed b/c two drivers (mlx4 and ocrdma) have, and 
two more to come soon (mlx5 and soft-RoCE) would have, the very same 
logic of constructing the port GID table according to netdev events and 
such, no point in repeating this logic/code over and over.


Matan explained why we don't have 2 x Y deletions and 1 x Y insertions.

Jason, can you ack that this post addressed your comments?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [patch -next] net/mlx5_core: fix an error code

On Thu, Jun 11, 2015 at 11:50 AM, Dan Carpenter
 wrote:
> We return success if mlx5e_alloc_sq_db() fails but we should return an
> error code.
>
> Fixes: f62b8bb8f2d3 ('net/mlx5: Extend mlx5_core to support ConnectX-4 
> Ethernet functionality')
> Signed-off-by: Dan Carpenter 
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> index 7348c51..075e517 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
> @@ -525,7 +525,8 @@ static int mlx5e_create_sq(struct mlx5e_channel *c,
> sq->uar_map = sq->uar.map;
> sq->bf_buf_size = (1 << MLX5_CAP_GEN(mdev, log_bf_reg_size)) / 2;
>
> -   if (mlx5e_alloc_sq_db(sq, cpu_to_node(c->cpu)))
> +   err = mlx5e_alloc_sq_db(sq, cpu_to_node(c->cpu));
> +   if (err)
> goto err_sq_wq_destroy;
>
> sq->txq = netdev_get_tx_queue(priv->netdev,

Dan, nice catch, the team here just handed me the same fix, but you
submitted 1st...

Acked-by: Or Gerlitz 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V3 6/8] IB/core: Pass hardware specific data in query_device

From: Matan Barak 

Vendors should be able to pass vendor specific data to/from
user-space via query_device uverb. In order to do this,
we need to pass the vendors' specific udata.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/device.c |4 +++-
 drivers/infiniband/core/uverbs_cmd.c |2 +-
 drivers/infiniband/hw/amso1100/c2_provider.c |7 +--
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |8 ++--
 drivers/infiniband/hw/cxgb4/provider.c   |8 ++--
 drivers/infiniband/hw/ehca/ehca_hca.c|6 +-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |3 ++-
 drivers/infiniband/hw/ipath/ipath_verbs.c|7 +--
 drivers/infiniband/hw/mlx4/main.c|6 +-
 drivers/infiniband/hw/mlx5/main.c|9 +++--
 drivers/infiniband/hw/mthca/mthca_provider.c |7 +--
 drivers/infiniband/hw/nes/nes_verbs.c|6 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |6 +-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |3 ++-
 drivers/infiniband/hw/qib/qib_verbs.c|6 --
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |6 +-
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |3 ++-
 include/rdma/ib_verbs.h  |3 ++-
 18 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 568cb41..694bd66 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -539,9 +539,11 @@ EXPORT_SYMBOL(ib_dispatch_event);
 int ib_query_device(struct ib_device *device,
struct ib_device_attr *device_attr)
 {
+   struct ib_udata uhw = {.outlen = 0, .inlen = 0};
+
memset(device_attr, 0, sizeof(*device_attr));
 
-   return device->query_device(device, device_attr);
+   return device->query_device(device, device_attr, &uhw);
 }
 EXPORT_SYMBOL(ib_query_device);
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 11ee298..bbb02ff 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3428,7 +3428,7 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
 
memset(&attr, 0, sizeof(attr));
 
-   err = device->query_device(device, &attr);
+   err = device->query_device(device, &attr, uhw);
if (err)
return err;
 
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c 
b/drivers/infiniband/hw/amso1100/c2_provider.c
index a43e022..382f109 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -63,13 +63,16 @@
 #include "c2_provider.h"
 #include "c2_user.h"
 
-static int c2_query_device(struct ib_device *ibdev,
-  struct ib_device_attr *props)
+static int c2_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+  struct ib_udata *uhw)
 {
struct c2_dev *c2dev = to_c2dev(ibdev);
 
pr_debug("%s:%u\n", __func__, __LINE__);
 
+   if (uhw->inlen || uhw->outlen)
+   return -EINVAL;
+
*props = c2dev->props;
return 0;
 }
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 2eaf7e8..c4b5936 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -1150,13 +1150,17 @@ static u64 fw_vers_string_to_u64(struct iwch_dev 
*iwch_dev)
   (fw_mic & 0xffff);
 }
 
-static int iwch_query_device(struct ib_device *ibdev,
-struct ib_device_attr *props)
+static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+struct ib_udata *uhw)
 {
 
struct iwch_dev *dev;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
 
+   if (uhw->inlen || uhw->outlen)
+   return -EINVAL;
+
dev = to_iwch_dev(ibdev);
memset(props, 0, sizeof *props);
memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c 
b/drivers/infiniband/hw/cxgb4/provider.c
index ef08a9f..05a96a5 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -301,13 +301,17 @@ static int c4iw_query_gid(struct ib_device *ibdev, u8 
port, int index,
return 0;
 }
 
-static int c4iw_query_device(struct ib_device *ibdev,
-struct ib_device_attr *props)
+static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr 
*props,
+struct ib_udata *uhw)
 {
 
struct c4iw_dev *dev;
+
PDBG("%s ibdev %p\n", __func__, ibdev);
 
+   if (uhw->inlen |

[PATCH for-next V3 4/8] IB/core: Extend ib_uverbs_create_cq

From: Matan Barak 

ib_uverbs_ex_create_cq follows the extension verbs
mechanism. New features (for example, CQ creation flags
field which is added in a downstream patch) could be used
via user-space libraries without breaking the ABI.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/uverbs.h  |1 +
 drivers/infiniband/core/uverbs_cmd.c  |  170 ++---
 drivers/infiniband/core/uverbs_main.c |1 +
 include/uapi/rdma/ib_user_verbs.h |   17 
 4 files changed, 154 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index b716b08..ba365b6 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -259,5 +259,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
 IB_UVERBS_DECLARE_EX_CMD(create_flow);
 IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
 IB_UVERBS_DECLARE_EX_CMD(query_device);
+IB_UVERBS_DECLARE_EX_CMD(create_cq);
 
 #endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 1954ebb..51311b1 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1330,41 +1330,37 @@ ssize_t ib_uverbs_create_comp_channel(struct 
ib_uverbs_file *file,
return in_len;
 }
 
-ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
-   const char __user *buf, int in_len,
-   int out_len)
+static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+  struct ib_udata *ucore,
+  struct ib_udata *uhw,
+  struct ib_uverbs_ex_create_cq *cmd,
+  size_t cmd_sz,
+  int (*cb)(struct ib_uverbs_file *file,
+struct ib_ucq_object *obj,
+struct 
ib_uverbs_ex_create_cq_resp *resp,
+struct ib_udata *udata,
+void *context),
+  void *context)
 {
-   struct ib_uverbs_create_cq  cmd;
-   struct ib_uverbs_create_cq_resp resp;
-   struct ib_udata udata;
struct ib_ucq_object   *obj;
struct ib_uverbs_event_file*ev_file = NULL;
struct ib_cq   *cq;
int ret;
+   struct ib_uverbs_ex_create_cq_resp resp;
struct ib_cq_init_attr attr = {};
 
-   if (out_len < sizeof resp)
-   return -ENOSPC;
-
-   if (copy_from_user(&cmd, buf, sizeof cmd))
-   return -EFAULT;
-
-   INIT_UDATA(&udata, buf + sizeof cmd,
-  (unsigned long) cmd.response + sizeof resp,
-  in_len - sizeof cmd, out_len - sizeof resp);
-
-   if (cmd.comp_vector >= file->device->num_comp_vectors)
-   return -EINVAL;
+   if (cmd->comp_vector >= file->device->num_comp_vectors)
+   return ERR_PTR(-EINVAL);
 
obj = kmalloc(sizeof *obj, GFP_KERNEL);
if (!obj)
-   return -ENOMEM;
+   return ERR_PTR(-ENOMEM);
 
-   init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, 
&cq_lock_class);
+   init_uobj(&obj->uobject, cmd->user_handle, file->ucontext, 
&cq_lock_class);
down_write(&obj->uobject.mutex);
 
-   if (cmd.comp_channel >= 0) {
-   ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+   if (cmd->comp_channel >= 0) {
+   ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel);
if (!ev_file) {
ret = -EINVAL;
goto err;
@@ -1377,10 +1373,14 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
 
-   attr.cqe = cmd.cqe;
-   attr.comp_vector = cmd.comp_vector;
+   attr.cqe = cmd->cqe;
+   attr.comp_vector = cmd->comp_vector;
+
+   if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
+   attr.flags = cmd->flags;
+
cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
-file->ucontext, &udata);
+file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
@@ -1399,14 +1399,15 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
goto err_free;
 
memset(&resp, 0, sizeof resp);
-   resp.cq_handle = obj->uobject.id;
-   resp.cqe   = cq-&g

[PATCH for-next V3 3/8] IB/core: Add CQ creation time-stamping flag

From: Matan Barak 

Add CQ creation flag which dictates that the created CQ will report
completion time-stamp value in the WC.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 include/rdma/ib_verbs.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index e2f9eac..0cb7a0d 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -173,6 +173,10 @@ struct ib_odp_caps {
} per_transport_caps;
 };
 
+enum ib_cq_creation_flags {
+   IB_CQ_FLAGS_TIMESTAMP_COMPLETION   = 1 << 0,
+};
+
 struct ib_cq_init_attr {
unsigned intcqe;
int comp_vector;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V3 2/8] IB/core: Change ib_create_cq to use struct ib_cq_init_attr

From: Matan Barak 

Currently, ib_create_cq uses cqe and comp_vector instead
of the extendible ib_cq_init_attr struct.

Earlier patches already changed the vendors to work with
ib_cq_init_attr. This patch changes the consumers too.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/mad.c  |4 +++-
 drivers/infiniband/core/verbs.c|6 +++---
 drivers/infiniband/hw/ehca/ehca_main.c |5 -
 drivers/infiniband/hw/mlx4/mad.c   |4 +++-
 drivers/infiniband/hw/mlx4/main.c  |4 +++-
 drivers/infiniband/hw/mlx5/main.c  |6 --
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |8 ++--
 drivers/infiniband/ulp/iser/iser_verbs.c   |5 -
 drivers/infiniband/ulp/isert/ib_isert.c|5 -
 drivers/infiniband/ulp/srp/ib_srp.c|9 +++--
 drivers/infiniband/ulp/srpt/ib_srpt.c  |4 +++-
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c|6 --
 include/rdma/ib_verbs.h|7 +++
 net/9p/trans_rdma.c|4 +++-
 net/rds/ib_cm.c|7 +--
 net/rds/iw_cm.c|7 +--
 net/sunrpc/xprtrdma/svc_rdma_transport.c   |9 +
 net/sunrpc/xprtrdma/verbs.c|9 +
 18 files changed, 74 insertions(+), 35 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 600af26..533c0b2 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -2923,6 +2923,7 @@ static int ib_mad_port_open(struct ib_device *device,
unsigned long flags;
char name[sizeof "ib_mad123"];
int has_smi;
+   struct ib_cq_init_attr cq_attr = {};
 
/* Create new device info */
port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
@@ -2943,9 +2944,10 @@ static int ib_mad_port_open(struct ib_device *device,
if (has_smi)
cq_size *= 2;
 
+   cq_attr.cqe = cq_size;
port_priv->cq = ib_create_cq(port_priv->device,
 ib_mad_thread_completion_handler,
-NULL, port_priv, cq_size, 0);
+NULL, port_priv, &cq_attr);
if (IS_ERR(port_priv->cq)) {
dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
ret = PTR_ERR(port_priv->cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index f7615d4..c5eafde 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1075,12 +1075,12 @@ EXPORT_SYMBOL(ib_destroy_qp);
 struct ib_cq *ib_create_cq(struct ib_device *device,
   ib_comp_handler comp_handler,
   void (*event_handler)(struct ib_event *, void *),
-  void *cq_context, int cqe, int comp_vector)
+  void *cq_context,
+  const struct ib_cq_init_attr *cq_attr)
 {
struct ib_cq *cq;
-   struct ib_cq_init_attr attr = {.cqe = cqe, .comp_vector = comp_vector};
 
-   cq = device->create_cq(device, &attr, NULL, NULL);
+   cq = device->create_cq(device, cq_attr, NULL, NULL);
 
if (!IS_ERR(cq)) {
cq->device= device;
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c 
b/drivers/infiniband/hw/ehca/ehca_main.c
index 5e30b72..c0e45a4 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -552,6 +552,7 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 
port)
struct ib_cq *ibcq;
struct ib_qp *ibqp;
struct ib_qp_init_attr qp_init_attr;
+   struct ib_cq_init_attr cq_attr = {};
int ret;
 
if (sport->ibcq_aqp1) {
@@ -559,7 +560,9 @@ static int ehca_create_aqp1(struct ehca_shca *shca, u32 
port)
return -EPERM;
}
 
-   ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0);
+   cq_attr.cqe = 10;
+   ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1),
+   &cq_attr);
if (IS_ERR(ibcq)) {
ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
return PTR_ERR(ibcq);
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 9cd2b00..5fc2232 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1773,6 +1773,7 @@ static int create_pv_resources(struct ib_device *ibdev, 
int slave, int port,
   int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
 {
int ret, cq_size;
+   struct ib

[PATCH for-next V3 0/8] Add completion timestamping support

Hi Doug,

This patchset adds the kernel control path for completion timestamping 
support by user-space verbs consumers. 

Timestamping is used by applications in order to know when a WQE was 
received/transmitted by the HW. The value is given in HCA hardware cycles,
but could be easily converted as the hardware's core clock frequency is 
available through extension of query device. 

Moreover, we add an ability to read the HCA's current clock. This could be 
useful in order to synchronize events to the wall clock.

This functionality is achieved by adding/extending the following verbs:

create_cq - create_cq is extended in order to allow passing creation flags
to the CQ creation function. We change IB/core --> vendors API
to be easily extendible by passing a struct which contains
comp_vectors, cqe and the new flags parameter. In order to create
CQ which supports timestamping, IB_CQ_FLAGS_TIMESTAMP_COMPLETION should be 
given.

query_device - We extend query_device uverb further by giving the hardware's
clock frequency and the timestamp mask (the number of timestamp
bits which are supported). If timestamp isn't supported, 0 is returned.

In order to read the timestamp in the WQE, the user needs to query the device 
for support, create an appropriate CQ (using the extended uverb with
IB_CQ_FLAGS_TIMESTAMP_COMPLETION) and poll the CQ with an extended poll_cq verb 
(currently,
only implemented in user-space).

In mlx4, allowing the user to read the core clock efficiently involves mapping
this area of the hardware to user-space (being done by using a mmap command)
and reading the clock from the correct offset of the page. 

This offset is returned in the vendor's specific data from mlx4's kernel driver 
to the mlx4's user-space driver. query_device is modified in order to support
passing this vendor specific data. A user-space application could use a new
verb in order to read the hardware's clock.

Translating the hardware's clock into ms could be done by dividing this
value by hca_core_clock (which is returned by the extended version of
query_device uverb).

The below V2 --> V3 changes address the review comments on the kernel bits. 

Jason had another comment calling for re-thinking / questioning the 
need for a dedicated uverbs_ex_cmd_mask in the IB device. This goes 
beyond the scope of this specific series. 

Matan and Or.

Changes from V2:
 (1) Use KHZ for hca_core_clock
 (2) ib_create_cq gets const cq_attr
 (3) use {} initialization instead of memset
 (4) squashed last two mlx4 patches

Changes from V1:
 (1) fixed lustre IB's code build
 (2) squashed mlx4 V1 9-11 patches into one
 (3) changed IB_CQ_FLAGS_TIMESTAMP --> IB_CQ_FLAGS_TIMESTAMP_COMPLETION

Changes from V0:
(1) Pass ib_cq_init_attr instead of cqe and comp_vector.
(2) Fix unneeded indentation.
(3) Change flags to u32.
(4) Add const to create_cq's ib_cq_init_attr argument in vendor implementation.

Matan Barak (9):
  IB/core: Change provider's API of create_cq to be extendible
  IB/core: Change ib_create_cq to use struct ib_cq_init_attr
  IB/core: Add CQ creation time-stamping flag
  IB/core: Extend ib_uverbs_create_cq
  IB/core: Add timestamp_mask and hca_core_clock to query_device
  IB/core: Pass hardware specific data in query_device
  IB/mlx4: Add mmap call to map the hardware clock
  IB/mlx4: Support extended create_cq and query_device uverbs
  IB/mlx4: Add support for CQ time-stamping

 drivers/infiniband/core/device.c   |6 +-
 drivers/infiniband/core/mad.c  |5 +-
 drivers/infiniband/core/uverbs.h   |1 +
 drivers/infiniband/core/uverbs_cmd.c   |  188 
 drivers/infiniband/core/uverbs_main.c  |1 +
 drivers/infiniband/core/verbs.c|4 +-
 drivers/infiniband/hw/amso1100/c2_provider.c   |   14 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c|   19 ++-
 drivers/infiniband/hw/cxgb4/cq.c   |9 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |8 +-
 drivers/infiniband/hw/cxgb4/provider.c |8 +-
 drivers/infiniband/hw/ehca/ehca_cq.c   |7 +-
 drivers/infiniband/hw/ehca/ehca_hca.c  |6 +-
 drivers/infiniband/hw/ehca/ehca_iverbs.h   |6 +-
 drivers/infiniband/hw/ehca/ehca_main.c |6 +-
 drivers/infiniband/hw/ipath/ipath_cq.c |9 +-
 drivers/infiniband/hw/ipath/ipath_verbs.c  |7 +-
 drivers/infiniband/hw/ipath/ipath_verbs.h  |3 +-
 drivers/infiniband/hw/mlx4/cq.c|   13 ++-
 drivers/infiniband/hw/mlx4/mad.c   |5 +-
 drivers/infiniband/hw/mlx4/main.c  |   67 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h   |   19 ++-
 drivers/infiniband/hw/mlx5/cq.c|   10 +-
 drivers/infiniband/hw/mlx5/main.c  |   19 ++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h   |5 +-
 drive

[PATCH for-next V3 8/8] IB/mlx4: Add support for CQ time-stamping

From: Matan Barak 

This includes:

* support allocation of CQ with the TIMESTAMP_COMPLETION creation flag.

* add timestamp_mask and hca_core_clock to query_device, reporting the
  number of supported timestamp bits (mask) and the hca_core_clock frequency.

* return hca core clock's offset in query_device vendor's data,
  this is needed in order to read the HCA's core clock.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/cq.c  |9 +--
 drivers/infiniband/hw/mlx4/main.c|   42 -
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   16 +
 3 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 8e44aaa..36eb3d0 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -166,6 +166,7 @@ err_buf:
return err;
 }
 
+#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_FLAGS_TIMESTAMP_COMPLETION
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
const struct ib_cq_init_attr *attr,
struct ib_ucontext *context,
@@ -178,10 +179,10 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
struct mlx4_uar *uar;
int err;
 
-   if (attr->flags)
+   if (entries < 1 || entries > dev->dev->caps.max_cqes)
return ERR_PTR(-EINVAL);
 
-   if (entries < 1 || entries > dev->dev->caps.max_cqes)
+   if (attr->flags & ~CQ_CREATE_FLAGS_SUPPORTED)
return ERR_PTR(-EINVAL);
 
cq = kmalloc(sizeof *cq, GFP_KERNEL);
@@ -194,6 +195,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
spin_lock_init(&cq->lock);
cq->resize_buf = NULL;
cq->resize_umem = NULL;
+   cq->create_flags = attr->flags;
INIT_LIST_HEAD(&cq->send_qp_list);
INIT_LIST_HEAD(&cq->recv_qp_list);
 
@@ -237,7 +239,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
vector = dev->eq_table[vector % ibdev->num_comp_vectors];
 
err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-   cq->db.dma, &cq->mcq, vector, 0, 0);
+   cq->db.dma, &cq->mcq, vector, 0,
+   !!(cq->create_flags & 
IB_CQ_FLAGS_TIMESTAMP_COMPLETION));
if (err)
goto err_dbmap;
 
diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index a4abebf..37c2660 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -140,10 +140,27 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
struct ib_smp *out_mad = NULL;
int err = -ENOMEM;
int have_ib_ports;
+   struct mlx4_uverbs_ex_query_device cmd;
+   struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0};
+   struct mlx4_clock_params clock_params;
 
-   if (uhw->inlen || uhw->outlen)
-   return -EINVAL;
+   if (uhw->inlen) {
+   if (uhw->inlen < sizeof(cmd))
+   return -EINVAL;
+
+   err = ib_copy_from_udata(&cmd, uhw, sizeof(cmd));
+   if (err)
+   return err;
+
+   if (cmd.comp_mask)
+   return -EINVAL;
+
+   if (cmd.reserved)
+   return -EINVAL;
+   }
 
+   resp.response_length = offsetof(typeof(resp), response_length) +
+   sizeof(resp.response_length);
in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
if (!in_mad || !out_mad)
@@ -233,7 +250,24 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
   props->max_mcast_grp;
props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
+   props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL;
+   props->timestamp_mask = 0xULL;
 
+   err = mlx4_get_internal_clock_params(dev->dev, &clock_params);
+   if (err)
+   goto out;
+
+   if (uhw->outlen >= resp.response_length + 
sizeof(resp.hca_core_clock_offset)) {
+   resp.hca_core_clock_offset = clock_params.offset % PAGE_SIZE;
+   resp.response_length += sizeof(resp.hca_core_clock_offset);
+   resp.comp_mask |= QUERY_DEVICE_RESP_MASK_TIMESTAMP;
+   }
+
+   if (uhw->outlen) {
+   err = ib_copy_to_udata(uhw, &resp, resp.response_length);
+   if (err)
+   goto out;
+   }
 out:
kfree(in_mad);
kfree(out_mad);
@@ -2322,6 +2356,10 @@

[PATCH for-next V3 5/8] IB/core: Add timestamp_mask and hca_core_clock to query_device

From: Matan Barak 

In order to expose timestamp we need to expose two new attributes in
query_device to be used for CQ completion time-stamping:

timestamp_mask - how many bits are valid in the timestamp, where timestamp
values could be 64 bits at most.

hca_core_clock - timestamp is given in HW cycles, the frequency in KHZ units
of the HCA, necessary in order to convert cycles to seconds.

This is added both to ib_query_device and its respective uverbs counterpart.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/device.c |2 ++
 drivers/infiniband/core/uverbs_cmd.c |   14 ++
 include/rdma/ib_verbs.h  |2 ++
 include/uapi/rdma/ib_user_verbs.h|2 ++
 4 files changed, 20 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 8d07c12..568cb41 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -539,6 +539,8 @@ EXPORT_SYMBOL(ib_dispatch_event);
 int ib_query_device(struct ib_device *device,
struct ib_device_attr *device_attr)
 {
+   memset(device_attr, 0, sizeof(*device_attr));
+
return device->query_device(device, device_attr);
 }
 EXPORT_SYMBOL(ib_query_device);
diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index 51311b1..11ee298 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -3426,6 +3426,8 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
if (ucore->outlen < resp.response_length)
return -ENOSPC;
 
+   memset(&attr, 0, sizeof(attr));
+
err = device->query_device(device, &attr);
if (err)
return err;
@@ -3450,6 +3452,18 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file 
*file,
 #endif
resp.response_length += sizeof(resp.odp_caps);
 
+   if (ucore->outlen < resp.response_length + sizeof(resp.timestamp_mask))
+   goto end;
+
+   resp.timestamp_mask = attr.timestamp_mask;
+   resp.response_length += sizeof(resp.timestamp_mask);
+
+   if (ucore->outlen < resp.response_length + sizeof(resp.hca_core_clock))
+   goto end;
+
+   resp.hca_core_clock = attr.hca_core_clock;
+   resp.response_length += sizeof(resp.hca_core_clock);
+
 end:
err = ib_copy_to_udata(ucore, &resp, resp.response_length);
if (err)
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 0cb7a0d..96142b5 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -227,6 +227,8 @@ struct ib_device_attr {
int sig_prot_cap;
int sig_guard_cap;
struct ib_odp_caps  odp_caps;
+   uint64_ttimestamp_mask;
+   uint64_thca_core_clock; /* in KHZ */
 };
 
 enum ib_mtu {
diff --git a/include/uapi/rdma/ib_user_verbs.h 
b/include/uapi/rdma/ib_user_verbs.h
index 19f0256..978841e 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -223,6 +223,8 @@ struct ib_uverbs_ex_query_device_resp {
__u32 comp_mask;
__u32 response_length;
struct ib_uverbs_odp_caps odp_caps;
+   __u64 timestamp_mask;
+   __u64 hca_core_clock; /* in KHZ */
 };
 
 struct ib_uverbs_query_port {
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V3 7/8] IB/mlx4: Add mmap call to map the hardware clock

From: Matan Barak 

In order to read the HCA's cycle counter efficiently in
user space, we need to map the HCA's register.
This is done through mmap call.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/hw/mlx4/main.c |   18 +-
 drivers/net/ethernet/mellanox/mlx4/main.c |   19 +++
 include/linux/mlx4/device.h   |9 +
 3 files changed, 45 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/main.c 
b/drivers/infiniband/hw/mlx4/main.c
index 61febb8..a4abebf 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -716,8 +716,24 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, 
struct vm_area_struct *vma)
   dev->dev->caps.num_uars,
   PAGE_SIZE, vma->vm_page_prot))
return -EAGAIN;
-   } else
+   } else if (vma->vm_pgoff == 3) {
+   struct mlx4_clock_params params;
+   int ret = mlx4_get_internal_clock_params(dev->dev, &params);
+
+   if (ret)
+   return ret;
+
+   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+   if (io_remap_pfn_range(vma, vma->vm_start,
+  
(pci_resource_start(dev->dev->persist->pdev,
+  params.bar) +
+   params.offset)
+  >> PAGE_SHIFT,
+  PAGE_SIZE, vma->vm_page_prot))
+   return -EAGAIN;
+   } else {
return -EINVAL;
+   }
 
return 0;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c 
b/drivers/net/ethernet/mellanox/mlx4/main.c
index ced5eca..70de39c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -1674,6 +1674,25 @@ static int map_internal_clock(struct mlx4_dev *dev)
return 0;
 }
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+  struct mlx4_clock_params *params)
+{
+   struct mlx4_priv *priv = mlx4_priv(dev);
+
+   if (mlx4_is_slave(dev))
+   return -ENOTSUPP;
+
+   if (!params)
+   return -EINVAL;
+
+   params->bar = priv->fw.clock_bar;
+   params->offset = priv->fw.clock_offset;
+   params->size = MLX4_CLOCK_SIZE;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(mlx4_get_internal_clock_params);
+
 static void unmap_internal_clock(struct mlx4_dev *dev)
 {
struct mlx4_priv *priv = mlx4_priv(dev);
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index 83e80ab..f94984f 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -829,6 +829,12 @@ struct mlx4_dev {
struct mlx4_vf_dev *dev_vfs;
 };
 
+struct mlx4_clock_params {
+   u64 offset;
+   u8 bar;
+   u8 size;
+};
+
 struct mlx4_eqe {
u8  reserved1;
u8  type;
@@ -1485,4 +1491,7 @@ int mlx4_ACCESS_PTYS_REG(struct mlx4_dev *dev,
 enum mlx4_access_reg_method method,
 struct mlx4_ptys_reg *ptys_reg);
 
+int mlx4_get_internal_clock_params(struct mlx4_dev *dev,
+  struct mlx4_clock_params *params);
+
 #endif /* MLX4_DEVICE_H */
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH for-next V3 1/8] IB/core: Change provider's API of create_cq to be extendible

From: Matan Barak 

Add a new ib_cq_init_attr structure which contains the
previous cqe (minimum number of CQ entries) and comp_vector
(completion vector) in addition to a new flags field.
All vendors' create_cq callbacks are changed in order
to work with the new API.

This commit does not change any functionality.

Signed-off-by: Matan Barak 
Signed-off-by: Or Gerlitz 
---
 drivers/infiniband/core/uverbs_cmd.c |6 --
 drivers/infiniband/core/verbs.c  |3 ++-
 drivers/infiniband/hw/amso1100/c2_provider.c |7 ++-
 drivers/infiniband/hw/cxgb3/iwch_provider.c  |   11 ---
 drivers/infiniband/hw/cxgb4/cq.c |9 +++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h   |8 
 drivers/infiniband/hw/ehca/ehca_cq.c |7 ++-
 drivers/infiniband/hw/ehca/ehca_iverbs.h |3 ++-
 drivers/infiniband/hw/ipath/ipath_cq.c   |9 +++--
 drivers/infiniband/hw/ipath/ipath_verbs.h|3 ++-
 drivers/infiniband/hw/mlx4/cq.c  |8 +++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |3 ++-
 drivers/infiniband/hw/mlx5/cq.c  |   10 --
 drivers/infiniband/hw/mlx5/main.c|3 ++-
 drivers/infiniband/hw/mlx5/mlx5_ib.h |5 +++--
 drivers/infiniband/hw/mthca/mthca_provider.c |8 ++--
 drivers/infiniband/hw/nes/nes_verbs.c|   11 ---
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.c  |7 ++-
 drivers/infiniband/hw/ocrdma/ocrdma_verbs.h  |6 --
 drivers/infiniband/hw/qib/qib_cq.c   |   11 ---
 drivers/infiniband/hw/qib/qib_verbs.h|5 +++--
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c |   10 +++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.h |7 ---
 include/rdma/ib_verbs.h  |   10 --
 24 files changed, 124 insertions(+), 46 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_cmd.c 
b/drivers/infiniband/core/uverbs_cmd.c
index a9f0489..1954ebb 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1341,6 +1341,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
struct ib_uverbs_event_file*ev_file = NULL;
struct ib_cq   *cq;
int ret;
+   struct ib_cq_init_attr attr = {};
 
if (out_len < sizeof resp)
return -ENOSPC;
@@ -1376,8 +1377,9 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&obj->comp_list);
INIT_LIST_HEAD(&obj->async_list);
 
-   cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
-cmd.comp_vector,
+   attr.cqe = cmd.cqe;
+   attr.comp_vector = cmd.comp_vector;
+   cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
 file->ucontext, &udata);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 685a362..f7615d4 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1078,8 +1078,9 @@ struct ib_cq *ib_create_cq(struct ib_device *device,
   void *cq_context, int cqe, int comp_vector)
 {
struct ib_cq *cq;
+   struct ib_cq_init_attr attr = {.cqe = cqe, .comp_vector = comp_vector};
 
-   cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+   cq = device->create_cq(device, &attr, NULL, NULL);
 
if (!IS_ERR(cq)) {
cq->device= device;
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c 
b/drivers/infiniband/hw/amso1100/c2_provider.c
index d396c39..a43e022 100644
--- a/drivers/infiniband/hw/amso1100/c2_provider.c
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -286,13 +286,18 @@ static int c2_destroy_qp(struct ib_qp *ib_qp)
return 0;
 }
 
-static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int 
vector,
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev,
+ const struct ib_cq_init_attr *attr,
  struct ib_ucontext *context,
  struct ib_udata *udata)
 {
+   int entries = attr->cqe;
struct c2_cq *cq;
int err;
 
+   if (attr->flags)
+   return ERR_PTR(-EINVAL);
+
cq = kmalloc(sizeof(*cq), GFP_KERNEL);
if (!cq) {
pr_debug("%s: Unable to allocate CQ\n", __func__);
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c 
b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 061ef08..2eaf7e8 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -138,10 +138,12 @@ static int iwch_destroy_cq(struct ib_cq

Re: [PATCH] opensm: Add initial support for optimized SLtoVLMappingTable programming


Hal Rosenstock wrote:

On Thu, Oct 29, 2009 at 10:23 PM, Sasha Khapyorsky  wrote:

Implementation description would be very useful. What does "initial support" 
mean?

It means there's more to come in terms of using 
OptimizedSLtoVLMappingProgramming. This is the simplest use/introduction of 
this optional feature.
You can't just send people to read specs; your change log should explain 
what the patch is about. If this is a big change to opensm, maybe even 
RFC it with a detailed writeup.


Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RDMA] Fixup IPv6 support and IPv4 routing corner cases for RDMA CM


Jason Gunthorpe wrote:

On Wed, Oct 28, 2009 at 10:05:19AM -0700, Sean Hefty wrote:

A UD endpoint can communicate using multicast and to other UD endpoints.  A 
user could resolve a UD endpoint before joining a multicast group.


So the IP world analog would be:
fd = socket(AF_INET,SOCK_DGRAM);
connect(fd,'Some Unicast Address');
setsockopt(fd,IP_MULITCAST_ADD_MEMBERSHIP,'Some Multicast Address');
sendto(fd,...,'Some Multicast Address');

IP multicast senders don't call IP_ADD_MEMBERSHIP, only receivers

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH RESEND] ib/iser: re-write SG handling for rdma logic

After dma-mapping an SG list provided by the SCSI midlayer, iser has
to make sure the mapped SG is "aligned for RDMA" in the sense that it's
possible to produce one mapping in the HCA IOMMU which represents the
whole SG. Next, the mapped SG is formatted for registration with the HCA.

This patch re-writes the logic that does the above, to make it clearer
and simpler. It also fixes a bug in the being aligned for rdma checks,
where a "start" check wasn't done but rather only "end" check.

Signed-off-by: Alexander Nezhinsky 
Signed-off-by: Or Gerlitz 

Index: linux-2.6.32-rc5/drivers/infiniband/ulp/iser/iser_memory.c
===
--- linux-2.6.32-rc5.orig/drivers/infiniband/ulp/iser/iser_memory.c
+++ linux-2.6.32-rc5/drivers/infiniband/ulp/iser/iser_memory.c
@@ -209,6 +209,8 @@ void iser_finalize_rdma_unaligned_sg(str
mem_copy->copy_buf = NULL;
 }

+#define IS_4K_ALIGNED(addr)unsigned long)addr) & ~MASK_4K) == 0)
+
 /**
  * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
  * and returns the length of resulting physical address array (may be less than
@@ -221,62 +223,52 @@ void iser_finalize_rdma_unaligned_sg(str
  * where --few fragments of the same page-- are present in the SG as
  * consecutive elements. Also, it handles one entry SG.
  */
+
 static int iser_sg_to_page_vec(struct iser_data_buf *data,
   struct iser_page_vec *page_vec,
   struct ib_device *ibdev)
 {
-   struct scatterlist *sgl = (struct scatterlist *)data->buf;
-   struct scatterlist *sg;
-   u64 first_addr, last_addr, page;
-   int end_aligned;
-   unsigned int cur_page = 0;
+   struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf;
+   u64 start_addr, end_addr, page, chunk_start = 0;
unsigned long total_sz = 0;
-   int i;
+   unsigned int dma_len;
+   int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;

/* compute the offset of first element */
page_vec->offset = (u64) sgl[0].offset & ~MASK_4K;

+   new_chunk = 1;
+   cur_page  = 0;
for_each_sg(sgl, sg, data->dma_nents, i) {
-   unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
-
+   start_addr = ib_sg_dma_address(ibdev, sg);
+   if (new_chunk)
+   chunk_start = start_addr;
+   dma_len = ib_sg_dma_len(ibdev, sg);
+   end_addr = start_addr + dma_len;
total_sz += dma_len;

-   first_addr = ib_sg_dma_address(ibdev, sg);
-   last_addr  = first_addr + dma_len;
-
-   end_aligned   = !(last_addr  & ~MASK_4K);
-
-   /* continue to collect page fragments till aligned or SG ends */
-   while (!end_aligned && (i + 1 < data->dma_nents)) {
-   sg = sg_next(sg);
-   i++;
-   dma_len = ib_sg_dma_len(ibdev, sg);
-   total_sz += dma_len;
-   last_addr = ib_sg_dma_address(ibdev, sg) + dma_len;
-   end_aligned = !(last_addr  & ~MASK_4K);
+   /* collect page fragments until aligned or end of SG list */
+   if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
+   new_chunk = 0;
+   continue;
}
+   new_chunk = 1;

-   /* handle the 1st page in the 1st DMA element */
-   if (cur_page == 0) {
-   page = first_addr & MASK_4K;
-   page_vec->pages[cur_page] = page;
-   cur_page++;
+   /* address of the first page in the contiguous chunk;
+  masking relevant for the very first SG entry,
+  which might be unaligned */
+   page = chunk_start & MASK_4K;
+   do {
+   page_vec->pages[cur_page++] = page;
page += SIZE_4K;
-   } else
-   page = first_addr;
-
-   for (; page < last_addr; page += SIZE_4K) {
-   page_vec->pages[cur_page] = page;
-   cur_page++;
-   }
-
+   } while (page < end_addr);
}
+
page_vec->data_size = total_sz;
iser_dbg("page_vec->data_size:%d cur_page %d\n", 
page_vec->data_size,cur_page);
return cur_page;
 }

-#define IS_4K_ALIGNED(addr)unsigned long)addr) & ~MASK_4K) == 0)

 /**
  * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned
@@ -284,42 +276,40 @@ static int iser_sg_to_page_vec(struct is
  * the number of entries which are aligned correctly. Supports the case where
  * consecutive

Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path


Sean Hefty wrote:

Jason and Or, does this seem ready to queue for 2.6.33?
Roland, I have missed your email last week, anyway, as I wrote Sean 
earlier, I'm totally fine with this patch of allowing user space to set 
a path record for the kernel.


Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v4] rdma/cm: support option to allow manually setting IB path

2009-11-01 Thread Or Gerlitz


Sean Hefty wrote:

Future changes to the rdma cm can expand on this framework to support the full 
range of features allowed by the IB CM, such as separate forward and reverse 
paths and APM

Sean,

Before enhancing the rdma-cm to support the full feature set of the IB 
CM, something which I personally don't see the actual need for (but I 
will be happy to get educated what applications will or can migrate to 
rdma-cm once this is implemented), how about trying to allow for a reduced 
QoS scheme also when the entity that resolved this path didn't 
consult with the SA?


IB QoS is based on the query providing the  
tuple and the SA returning a  QoS tuple. Now 
I'd like to see how can we let the application / querying middleware to 
take advantage of the knowledge on what partition it runs and use the SL 
associated with the IPv4 (e.g AF_INET rdma-cm ID's) IPoIB broadcast 
group. This way, one can still program a QoS scheme at the SA which is 
based on partitions.


Looking on mckey, the user space code (e.g ACM), could just do rdma_bind 
to an IP address of an IPoIB NIC that uses this partition and then 
rdma_join to an unmapped multicast address which corresponds to the 
broadcast group, take the SL and leave the group, makes sense?


Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH] librdmacm/mckey: enforce local binding for unmapped multicast addresses

2009-11-01 Thread Or Gerlitz

enforce local binding is specified for unmapped multicast addresses, otherwise 
mckey
crashes when attempting to use the cma_id->verbs pointer in the port query verb.

Signed-off-by: Or Gerlitz 

Sean, using unmapped multicast addresses I see that a different broadcast group 
is
created by the SM such that mckey doesn't manage to join the ipv4 broadcast 
group

$ ./mckey -M ff12:401b::0:0:0:: -b 10.10.5.62 -p 0x2

mckey: joined dgid: ff12:401b::: mlid c00b sl 0

looking in the SA, I see that the MGID used by the rdma-cm is a bit different
from the one used by IPoIB, since the former uses/sets only the lower 28 bits 
where
the latter sets the lower 32 bits for this mgid, any idea what can be done 
here?

$ saquery $THIS_NODE_LID

MCMemberRecord group dump:
MGIDff12:401b::::
Mlid0xC000
Mtu.0x84
pkey0x
Rate0x83
SL..0x0


MCMemberRecord group dump:
MGIDff12:401b:::fff:
Mlid0xC00B
Mtu.0x84
pkey0x
Rate0x83
SL..0x0


Index: librdmacm/examples/mckey.c
===
--- librdmacm.orig/examples/mckey.c
+++ librdmacm/examples/mckey.c
@@ -273,7 +273,7 @@ static int join_handler(struct cmatest_n
char buf[40];

inet_ntop(AF_INET6, param->ah_attr.grh.dgid.raw, buf, 40);
-   printf("mckey: joined dgid: %s\n", buf);
+   printf("mckey: joined dgid: %s mlid %x sl %d\n", buf, 
param->ah_attr.dlid, param->ah_attr.sl);

node->remote_qpn = param->qp_num;
node->remote_qkey = param->qkey;
@@ -556,6 +556,11 @@ int main(int argc, char **argv)
}
}

+   if (unmapped_addr && !src_addr) {
+   printf("unmapped multicast address requires binding to source 
address\n");
+   exit(1);
+   }
+
test.dst_addr = (struct sockaddr *) &test.dst_in;
test.connects_left = connections;

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Crash in bonding

2009-11-02 Thread Or Gerlitz


Pradeep Satyanarayana wrote:
This crash was originally reported against Rhel5.4. However, one can recreate this crash quite easily in OFED-1.5 too. 
I understand that you get the crash when working with the RHEL5.4 
bonding driver, correct? Does it happen only with IPoIB devices acting 
as the bonding slaves or also with Ethernet devices? Please note that 
with RHEL 5.4 there's no need to use the ofed provided bonding module; 
moreover, I believe that the distro provided one is more stable and 
up to date in this case. Moving forward, ofed bonding support for newish 
distributions is to be removed. Moni, any reason to support bonding/EL 
5.4 in ofed?


Or.


The steps to recreate the crash are as follows:
1. Run traffic (I used ping) on the IB interfaces through the bond master
2. ifdown ib0
3. ifdown ib1
4. modprobe -r ib_ipoib


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] librdmacm/mckey: enforce local binding for unmapped multicast addresses

2009-11-03 Thread Or Gerlitz


Sean Hefty wrote:

Unmapped multicast groups only support the case where the SA has created the
group with the MGID undefined.  The MGID must be in this format: 0xff1 scope 
0xA01B (see figure 196 on page 928 of the spec).  The kernel checks for this 
specific address format to see if it needs to convert the address or not [...] 
wanted the ability to create a group and get back a unique group ID
I am still not sure I follow you. My basic thought was that unmapped 
multicast addresses are MGIDs specified by the application such that 
rdma-cm doesn't treat them as IPv6 multicast addresses and no mapping is 
applied on them. From the spec location you have pointed me to I understand 
that the intention is for a request to the SA to generate a unique MGID:


1. "if SA receives a request to create a multicast group with the MGID 
undefined"

2.  "the MGID that it creates shall be of the following format"

So there are two parts here: 1st, request the SA to create a new group and 
assign it an MGID (what about joining this node/port to the group?); 2nd, 
getting back the MGID created by the SA. Looking at the rdma-cm kernel 
code, I don't see where/how it specifies to the SA that the MGID is 
undefined. Shouldn't it not set the MGID bit in the component mask in 
this case? Next, I don't see where the MGID created by the SA is given 
back to the application. I guess I am still missing something here, can you 
clarify? Thanks.


Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 19/25] mlx4: Randomizing mac addresses for slaves

2009-11-04 Thread Or Gerlitz

On Wed, Nov 4, 2009 at 10:04 PM, Roland Dreier  wrote:
>> +#define MLX4_MAC_HEAD               0x2c900ULL

> Is this a good idea?  You're basically choosing 24 random bits within your 
> OUI...
> seems the chance of collision with another MAC used on the same network is
> high enough that it could easily happen in practice on a moderately big 
> network.

Yes, this has been brought up by Stephen and others on this list back on
September 11th, this year @
http://marc.info/?l=linux-netdev&m=125263488409128

> Can you pick a reserved range or something?

Using a different OUI for the VF device wouldn't help either I think,
since the #VF becomes fairly big even on a modest size cluster with
(say) a VM consuming a VF per 1-2 cores.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] librdmacm/mckey: enforce local binding for unmapped multicast addresses

2009-11-07 Thread Or Gerlitz


Sean Hefty wrote:

I merged this with your other patch to mckey and applied them to my tree
  
I don't see this @ 
http://www.openfabrics.org/git/?p=~shefty/librdmacm.git, were you 
referring to a local clone?


Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: QoS in local SA entity

2009-11-07 Thread Or Gerlitz


Sean Hefty wrote:

I wasn't trying to limit how the SA could 'distribute' QoS information to the 
end nodes.  ACM will obtain QoS information from the SA when it joins its
multicast groups
Excellent... still, this is dependent on how the ACM MGIDs are 
constructed, I'll take a look at the code.



ACM is intended to be a service that's used by the librdmacm to resolve address 
mappings and routes.  Trying to have ACM use the librdmacm ends up with a 
circular dependency.  That's the part I'm trying to avoid.


Fair enough, I believe that my suggestion is doable also without a 
circular dependency, e.g. as you indicated below or with a fairly small 
enhancement of librdmacm, see next




ACM uses address mappings as defined in an address configuration file (IP ->
device, port, pkey).  The address file can be created using the provided 
ib_acme utility, which uses the current system configuration (in an ugly way, 
but it works).  I think this provides QoS behavior similar to what you're 
describing
I assume you are referring to an IP local to the system where ACM runs, 
correct? This would work well for applications calling rdma_bind 
and/or rdma_resolve_addr while specifying a source address. To 
support also the case of applications which do neither of these two, that 
is, call rdma_resolve_addr with a dest address only, I suggest to enhance the 
librdmacm-calling-ACM flow and resolve the source address using route 
lookup from user space; next, librdmacm can issue rdma_bind on behalf 
of this ID and you have the <device, port, pkey> triplet at hand, so 
now the ACM call can be made from librdmacm. Writing this, I realized 
that this had better (should) be done also for apps calling rdma_resolve_addr 
with src ip specified. This way you have a unified flow for the ACM use in 
librdmacm for any of apps A, B, C below


A.1 rdma_bind(src=X)
A.2 rdma_resolve_addr(src=null, dst=Y)

B.1 rdma_resolve_addr(src=null, dst=Y)

C.1 rdma_resolve_addr(src=X, dst=Y)

where librdmacm calling-ACM flow is

L1. compute source address
L2. issue kernel rdma_bind to source address and resolve <device, port, pkey>
L3. issue ACM address (DGID) resolution call using (<device, port, pkey>, dest-ip)


Makes sense? If yes, what's the need for the address configuration file?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: QoS in local SA entity

2009-11-08 Thread Or Gerlitz


Jason Gunthorpe wrote:

The entire point of the rdma_getaddrinfo + AF_IB is to avoid hacking up 
librdmacm for every address lookup/cache scheme someone invents
The entire simple point I am trying to make is that rdma_getaddrinfo + 
AF_INET is doable, is simple and is needed to keep up the essence of the 
rdma-cm. I don't see how AF_IB buys anything for anyone, but if you 
want to push it, then as long as AF_INET is the first and most 
supported/interoperable option, present and future, go and add your bits. As 
you indicated, the route lookup I was mentioning could be done in 
rdma_addrinfo, sure, with &res including both source and destination 
addresses. No rdma_resolve_addr2 is needed; the one that exists now has 
source addresses specified. I don't see that extra info is needed for 
AF_INET that was resolved with rdma_getaddrinfo; is this AF_IB specific?


I don't see why the app should bother on calling rdma_getaddrinfo, it 
can be done by librdmacm with rdma_getaddrinfo having multiple modules 
as you suggested. I am in favor of the approach suggested by Sean of 
librdmacm either doing its native flow or under environment variable 
doing an alternative flow, where your suggestion not to have the 2nd 
flow being tightly coupled with ACM, e.g through using get_addrinfo 
abstraction and friends makes sense (yes!)


Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH RESEND] ib/iser: re-write SG handling for rdma logic




This patch re-writes the logic that does the above, to make it clearer and simpler. It also fixes a 
bug in the aligned-for-rdma checks, where a "start" check wasn't done but rather 
only an "end" check.
  
Roland, I don't see this patch in your for-next branch, any reason not 
to merge this?


Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: QoS in local SA entity


Sean Hefty wrote:

[...] The current implementation of ACM converts this to:
** Source sends a multicast request to destination IP
** Destination sends a response with IP to DGID mapping
- Path record is constructed from multicast group information   
ACM needs to know what the local addresses are, so it can respond to requests
for those addresses
okay got it. Still, how do you see my suggestion on the unified/modified 
librdmacm flow (L1/L2/L3 in my email) which would be taken when working 
against a "DGID/Route" provider such as ACM?


Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: QoS in local SA entity

Jason Gunthorpe wrote:

The extra info in rdma_resolve_addr2 carries the IB specific path information
from the rdma_getaddrinfo module to the kernel for the address pair. The entire
purpose of AF_IB is to let user space tell the kernel it does not want a kernel
side ND and PR query, instead user space will provide all the information.
The kernel patches posted by Sean replace the ND/PR flow with a two
step process, first specifying a DGID to the kernel, next specifying a
PATH. My suggestion is to have a librdmacm initiated bind before
sending the DGID to the kernel; this way AF_INET would be supported
perfectly, under the slight limitation that the source address
<device, port, pkey> tuple would be chosen by route lookup and not by the
neigh->dev that was resolved by the kernel ND. This is only when the
modified flow of librdmacm is taken (e.g. under user specification with
environment variable etc).

--If-- on top of that you want to add AF_IB, we may be able to do that,
but I don't see why the whole thing should be made for AF_IB only.

Think of it this way, ACM takes over the entire process of what AF_INET does in
the kernel. AF_INET talks directly to the IB CM module in the kernel. Thus, it
also makes sense that ACM would need to talk to IB CM directly as well. AF_IB
is that direct connection.

I don't agree we must state it this way. I see ACM as an alternative way
for AF_INET to resolve ND/PR.

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Re: LID reconfiguration

> One more question;  I saw librdmacm which looked nice but it does not
> support multi-path connections.  It would eliminate a lot of code if we
> could use this

what are your needs?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: LID reconfiguration