RE: [PATCH v4 19/19] IB/mad: Implement Intel Omni-Path Architecture MAD processing

2015-04-03 Thread Hefty, Sean
> diff --git a/drivers/infiniband/core/agent.c
> b/drivers/infiniband/core/agent.c
> index b6bd305..18275a5 100644
> --- a/drivers/infiniband/core/agent.c
> +++ b/drivers/infiniband/core/agent.c
> @@ -80,13 +80,17 @@ ib_get_agent_port(struct ib_device *device, int
> port_num)
> 
>  void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
>struct ib_wc *wc, struct ib_device *device,
> -  int port_num, int qpn)
> +  int port_num, int qpn, u32 resp_mad_len,
> +  int opa)

Can't OPA support be determined by looking at the device structure?

>  {
>   struct ib_agent_port_private *port_priv;
>   struct ib_mad_agent *agent;
>   struct ib_mad_send_buf *send_buf;
>   struct ib_ah *ah;
> + size_t data_len;
> + size_t hdr_len;
>   struct ib_mad_send_wr_private *mad_send_wr;
> + u8 base_version;
> 
>   if (device->node_type == RDMA_NODE_IB_SWITCH)
>   port_priv = ib_get_agent_port(device, 0);
> @@ -106,16 +110,29 @@ void agent_send_response(struct ib_mad *mad, struct
> ib_grh *grh,
>   return;
>   }
> 
> + /* base version determines MAD size */
> + base_version = mad->mad_hdr.base_version;
> + if (opa && base_version == OPA_MGMT_BASE_VERSION) {
> + data_len = resp_mad_len - JUMBO_MGMT_MAD_HDR;
> + hdr_len = JUMBO_MGMT_MAD_HDR;
> + } else {
> + data_len = IB_MGMT_MAD_DATA;
> + hdr_len = IB_MGMT_MAD_HDR;
> + }

I _think_ this can be simplified to:

hdr_len = IB_MGMT_MAD_HDR;
data_len = resp_mad_len - hdr_len;

IB should set resp_mad_len = 256 in all cases.

> +
>   send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
> -   IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
> -   GFP_KERNEL,
> -   IB_MGMT_BASE_VERSION);
> +   hdr_len, data_len, GFP_KERNEL,
> +   base_version);
>   if (IS_ERR(send_buf)) {
>   dev_err(&device->dev, "ib_create_send_mad error\n");
>   goto err1;
>   }
> 
> - memcpy(send_buf->mad, mad, sizeof *mad);
> + if (opa && base_version == OPA_MGMT_BASE_VERSION)
> + memcpy(send_buf->mad, mad, JUMBO_MGMT_MAD_HDR + data_len);
> + else
> + memcpy(send_buf->mad, mad, sizeof(*mad));

And this may be able to be simplified to:

memcpy(send_buf->mad, mad, resp_mad_len);

> +
>   send_buf->ah = ah;
> 
>   if (device->node_type == RDMA_NODE_IB_SWITCH) {
> diff --git a/drivers/infiniband/core/agent.h
> b/drivers/infiniband/core/agent.h
> index 6669287..1dee837 100644
> --- a/drivers/infiniband/core/agent.h
> +++ b/drivers/infiniband/core/agent.h
> @@ -46,6 +46,7 @@ extern int ib_agent_port_close(struct ib_device *device,
> int port_num);
> 
>  extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
>   struct ib_wc *wc, struct ib_device *device,
> - int port_num, int qpn);
> + int port_num, int qpn, u32 resp_mad_len,
> + int opa);
> 
>  #endif   /* __AGENT_H_ */
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index 5aefe4c..9b7dc36 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -3,6 +3,7 @@
>   * Copyright (c) 2005 Intel Corporation.  All rights reserved.
>   * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
>   * Copyright (c) 2009 HNR Consulting. All rights reserved.
> + * Copyright (c) 2014 Intel Corporation.  All rights reserved.
>   *
>   * This software is available to you under a choice of one of two
>   * licenses.  You may choose to be licensed under the terms of the GNU
> @@ -44,6 +45,7 @@
>  #include "mad_priv.h"
>  #include "mad_rmpp.h"
>  #include "smi.h"
> +#include "opa_smi.h"
>  #include "agent.h"
> 
>  MODULE_LICENSE("Dual BSD/GPL");
> @@ -733,6 +735,7 @@ static int handle_outgoing_dr_smp(struct
> ib_mad_agent_private *mad_agent_priv,
>  {
>   int ret = 0;
>   struct ib_smp *smp = mad_send_wr->send_buf.mad;
> + struct opa_smp *opa_smp = (struct opa_smp *)smp;
>   unsigned long flags;
>   struct ib_mad_local_private *local;
>   struct ib_mad_private *mad_priv;
> @@ -744,6 +747,9 @@ static int handle_outgoing_dr_smp(struct
> ib_mad_agent_private *mad_agent_priv,
>   struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
>   size_t in_mad_size = mad_agent_priv->agent.device-
> >cached_dev_attrs.max_mad_size;
>   size_t out_mad_size;
> + u16 drslid;
> + int opa = mad_agent_priv->qp_info->qp->device-
> >cached_dev_attrs.device_cap_flags2 &
> +   IB_DEVICE_OPA_MAD_SUPPORT;
> 
>   if (device->node_type == RDMA_NODE_IB_SWITCH &&
>   smp->mgmt_

RE: [PATCH v4 15/19] IB/mad: Create jumbo_mad data structures

2015-04-03 Thread Hefty, Sean
> > diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
> > index 00a5e51..80e7cf4 100644
> > --- a/include/rdma/ib_mad.h
> > +++ b/include/rdma/ib_mad.h
> > @@ -136,6 +136,11 @@ enum {
> > IB_MGMT_DEVICE_HDR = 64,
> > IB_MGMT_DEVICE_DATA = 192,
> > IB_MGMT_MAD_SIZE = IB_MGMT_MAD_HDR + IB_MGMT_MAD_DATA,
> > +   JUMBO_MGMT_MAD_HDR = IB_MGMT_MAD_HDR,
> > +   JUMBO_MGMT_MAD_DATA = 2024,
> > +   JUMBO_MGMT_RMPP_HDR = IB_MGMT_RMPP_HDR,

Examining a later patch in this series highlighted that JUMBO_MGMT_MAD_HDR and 
JUMBO_MGMT_RMPP_HDR are the same size as the corresponding IB_MGMT_* defines.  
This seems unnecessary.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 18/19] IB/mad: Implement Intel Omni-Path Architecture SMP processing

2015-04-03 Thread Hefty, Sean
> @@ -236,6 +252,24 @@ enum smi_action smi_handle_dr_smp_recv(struct ib_smp
> *smp, u8 node_type,
>   smp->dr_slid == IB_LID_PERMISSIVE);
>  }
> 
> +/*
> + * Adjust information for a received SMP
> + * Return 0 if the SMP should be dropped

The function returns an enum.  The comment about returning 0 is misleading.  The 
entire comment seems unnecessary. 

> + */
> +enum smi_action opa_smi_handle_dr_smp_recv(struct opa_smp *smp, u8
> node_type,
> +int port_num, int phys_port_cnt)
> +{
> + return __smi_handle_dr_smp_recv(node_type, port_num, phys_port_cnt,
> + &smp->hop_ptr, smp->hop_cnt,
> + smp->route.dr.initial_path,
> + smp->route.dr.return_path,
> + opa_get_smp_direction(smp),
> + smp->route.dr.dr_dlid ==
> + OPA_LID_PERMISSIVE,
> + smp->route.dr.dr_slid ==
> + OPA_LID_PERMISSIVE);
> +}
> +
>  static inline
>  enum smi_forward_action __smi_check_forward_dr_smp(u8 hop_ptr, u8
> hop_cnt,
>  u8 direction,
> @@ -277,6 +311,16 @@ enum smi_forward_action
> smi_check_forward_dr_smp(struct ib_smp *smp)
> smp->dr_slid != IB_LID_PERMISSIVE);
>  }
> 
> +enum smi_forward_action opa_smi_check_forward_dr_smp(struct opa_smp *smp)
> +{
> + return __smi_check_forward_dr_smp(smp->hop_ptr, smp->hop_cnt,
> +   opa_get_smp_direction(smp),
> +   smp->route.dr.dr_dlid ==
> +   OPA_LID_PERMISSIVE,
> +   smp->route.dr.dr_slid ==
> +   OPA_LID_PERMISSIVE);
> +}
> +
>  /*
>   * Return the forwarding port number from initial_path for outgoing SMP
> and
>   * from return_path for returning SMP
> @@ -286,3 +330,13 @@ int smi_get_fwd_port(struct ib_smp *smp)
>   return (!ib_get_smp_direction(smp) ? smp->initial_path[smp-
> >hop_ptr+1] :
>   smp->return_path[smp->hop_ptr-1]);
>  }
> +
> +/*
> + * Return the forwarding port number from initial_path for outgoing SMP
> and
> + * from return_path for returning SMP
> + */
> +int opa_smi_get_fwd_port(struct opa_smp *smp)
> +{
> + return !opa_get_smp_direction(smp) ? smp->route.dr.initial_path[smp-
> >hop_ptr+1] :
> + smp->route.dr.return_path[smp->hop_ptr-1];
> +}
> diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
> index aff96ba..e95c537 100644
> --- a/drivers/infiniband/core/smi.h
> +++ b/drivers/infiniband/core/smi.h
> @@ -62,6 +62,9 @@ extern enum smi_action smi_handle_dr_smp_send(struct
> ib_smp *smp,
>   * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
>   * via process_mad
>   */
> +/* NOTE: This is called on opa_smp's don't check fields which are not
> common
> + * between ib_smp and opa_smp
> + */

This comment suggests that the function is not correct for OPA.  Is that the case?

>  static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
> struct ib_device *device)
>  {
> @@ -77,6 +80,9 @@ static inline enum smi_action smi_check_local_smp(struct
> ib_smp *smp,
>   * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
>   * via process_mad
>   */
> +/* NOTE: This is called on opa_smp's don't check fields which are not
> common
> + * between ib_smp and opa_smp
> + */

Same comment

>  static inline enum smi_action smi_check_local_returning_smp(struct ib_smp
> *smp,
>  struct ib_device *device)
>  {
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 17/19] IB/mad: Implement support for Intel Omni-Path Architecture base version MADs in ib_create_send_mad

2015-04-03 Thread Hefty, Sean
> @@ -937,20 +937,31 @@ struct ib_mad_send_buf * ib_create_send_mad(struct
> ib_mad_agent *mad_agent,
>   struct ib_mad_send_wr_private *mad_send_wr;
>   int pad, message_size, ret, size;
>   void *buf;
> + size_t mad_size;
> + int opa;
> 
>   mad_agent_priv = container_of(mad_agent, struct
> ib_mad_agent_private,
> agent);
> - pad = get_pad_size(hdr_len, data_len);
> +
> + opa = mad_agent_priv->agent.device-
> >cached_dev_attrs.device_cap_flags2 &
> +   IB_DEVICE_OPA_MAD_SUPPORT;
> +
> + if (opa && base_version == OPA_MGMT_BASE_VERSION)
> + mad_size = sizeof(struct jumbo_mad);
> + else
> + mad_size = sizeof(struct ib_mad);

Didn't an earlier patch make it possible to read the mad_size directly from the 
device?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 16/19] IB/mad: Add Intel Omni-Path Architecture defines

2015-04-03 Thread Hefty, Sean
>  /* Registration table sizes */
>  #define MAX_MGMT_CLASS   80
> -#define MAX_MGMT_VERSION 8
> +#define MAX_MGMT_VERSION 0x83

It's unfortunate that this results in a big jump in used versions.  Mad_priv.h 
defines this:

struct ib_mad_port_private {
...
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];

struct ib_mad_mgmt_version_table {
struct ib_mad_mgmt_class_table *class;
struct ib_mad_mgmt_vendor_class_table *vendor;
};

This ends up allocating about 2K of data per port of NULL pointers.  Not a huge 
deal, but still.

I don't have a great fix here.  Maybe the version[] array can be the necessary 
size, with some sort of simple mapping function from version to the index?
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 15/19] IB/mad: Create jumbo_mad data structures

2015-04-03 Thread Hefty, Sean
> Define jumbo_mad and jumbo_rmpp_mad.

I would just use 'opa_mad' in place of 'jumbo_mad'.  Jumbo sounds like a 
marketing term or elephant name.
 
> Jumbo MAD structures are 2K versions of ib_mad and ib_rmpp_mad structures.
> Currently only OPA base version MADs are of this type.
> 
> Create an RMPP Base header to share between ib_rmpp_mad and jumbo_rmpp_mad
> 
> Update existing code to use the new structures.
> 
> Signed-off-by: Ira Weiny 
> 
> ---
>  drivers/infiniband/core/mad.c  |  18 +++---
>  drivers/infiniband/core/mad_priv.h |   2 +
>  drivers/infiniband/core/mad_rmpp.c | 120 ++--
> -
>  drivers/infiniband/core/user_mad.c |  16 ++---
>  include/rdma/ib_mad.h  |  26 +++-
>  5 files changed, 103 insertions(+), 79 deletions(-)
> 
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index 2145294..316b4b2 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -883,7 +883,7 @@ static int alloc_send_rmpp_list(struct
> ib_mad_send_wr_private *send_wr,
>   gfp_t gfp_mask)
>  {
>   struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
> - struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
> + struct ib_rmpp_base *rmpp_base = send_buf->mad;
>   struct ib_rmpp_segment *seg = NULL;
>   int left, seg_size, pad;
> 
> @@ -909,10 +909,10 @@ static int alloc_send_rmpp_list(struct
> ib_mad_send_wr_private *send_wr,
>   if (pad)
>   memset(seg->data + seg_size - pad, 0, pad);
> 
> - rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
> + rmpp_base->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
> agent.rmpp_version;
> - rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
> - ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
> + rmpp_base->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
> + ib_set_rmpp_flags(&rmpp_base->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
> 
>   send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
>   struct ib_rmpp_segment, list);
> @@ -1748,14 +1748,14 @@ out:
>  static int is_rmpp_data_mad(struct ib_mad_agent_private *mad_agent_priv,
>  struct ib_mad_hdr *mad_hdr)
>  {
> - struct ib_rmpp_mad *rmpp_mad;
> + struct ib_rmpp_base *rmpp_base;
> 
> - rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
> + rmpp_base = (struct ib_rmpp_base *)mad_hdr;
>   return !mad_agent_priv->agent.rmpp_version ||
>   !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
> - !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
> + !(ib_get_rmpp_flags(&rmpp_base->rmpp_hdr) &
>   IB_MGMT_RMPP_FLAG_ACTIVE) ||
> - (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
> + (rmpp_base->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
>  }
> 
>  static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
> @@ -1897,7 +1897,7 @@ static void ib_mad_complete_recv(struct
> ib_mad_agent_private *mad_agent_priv,
>   spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>   if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
>  && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad-
> >mad_hdr.mgmt_class)
> -&& (ib_get_rmpp_flags(&((struct ib_rmpp_mad
> *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
> +&& (ib_get_rmpp_flags(&((struct ib_rmpp_base
> *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
>   & IB_MGMT_RMPP_FLAG_ACTIVE)) {
>   /* user rmpp is in effect
>* and this is an active RMPP MAD
> diff --git a/drivers/infiniband/core/mad_priv.h
> b/drivers/infiniband/core/mad_priv.h
> index d1a0b0e..d71ddcc 100644
> --- a/drivers/infiniband/core/mad_priv.h
> +++ b/drivers/infiniband/core/mad_priv.h
> @@ -80,6 +80,8 @@ struct ib_mad_private {
>   struct ib_mad mad;
>   struct ib_rmpp_mad rmpp_mad;
>   struct ib_smp smp;
> + struct jumbo_mad jumbo_mad;
> + struct jumbo_rmpp_mad jumbo_rmpp_mad;
>   } mad;
>  } __attribute__ ((packed));
> 
> diff --git a/drivers/infiniband/core/mad_rmpp.c
> b/drivers/infiniband/core/mad_rmpp.c
> index 2379e2d..7184530 100644
> --- a/drivers/infiniband/core/mad_rmpp.c
> +++ b/drivers/infiniband/core/mad_rmpp.c
> @@ -111,10 +111,10 @@ void ib_cancel_rmpp_recvs(struct
> ib_mad_agent_private *agent)
>  }
> 
>  static void format_ack(struct ib_mad_send_buf *msg,
> -struct ib_rmpp_mad *data,
> +struct ib_rmpp_base *data,
>  struct mad_rmpp_recv *rmpp_recv)
>  {
> - struct ib_rmpp_mad *ack = msg->mad;
> + struct ib_rmpp_base *ack = msg->mad;
>   unsigned long flags;
> 
>

RE: [PATCH v4 12/19] IB/mad: Add MAD size parameters to process_mad

2015-04-03 Thread Hefty, Sean
> -static struct ib_mad_private *alloc_mad_priv(struct ib_device *dev)
> +static struct ib_mad_private *alloc_mad_priv(struct ib_device *dev,
> +  size_t *mad_size)
>  {
> + *mad_size = dev->cached_dev_attrs.max_mad_size;

Why does this function return the value that the caller can just read from the 
device?

Actually, it's odd for an alloc() call to return how much it allocated, rather 
than taking that as input.

>   return (kmalloc(sizeof(struct ib_mad_private_header) +
> - sizeof(struct ib_grh) +
> - dev->cached_dev_attrs.max_mad_size, GFP_ATOMIC));
> + sizeof(struct ib_grh) + *mad_size, GFP_ATOMIC));
>  }
> 
>  /*
> @@ -741,6 +742,8 @@ static int handle_outgoing_dr_smp(struct
> ib_mad_agent_private *mad_agent_priv,
>   u8 port_num;
>   struct ib_wc mad_wc;
>   struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
> + size_t in_mad_size = mad_agent_priv->agent.device-
> >cached_dev_attrs.max_mad_size;
> + size_t out_mad_size;
> 
>   if (device->node_type == RDMA_NODE_IB_SWITCH &&
>   smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
> @@ -777,7 +780,7 @@ static int handle_outgoing_dr_smp(struct
> ib_mad_agent_private *mad_agent_priv,
>   local->mad_priv = NULL;
>   local->recv_mad_agent = NULL;
> 
> - mad_priv = alloc_mad_priv(mad_agent_priv->agent.device);
> + mad_priv = alloc_mad_priv(mad_agent_priv->agent.device,
> &out_mad_size);
>   if (!mad_priv) {
>   ret = -ENOMEM;
>   dev_err(&device->dev, "No memory for local response MAD\n");
> @@ -792,8 +795,9 @@ static int handle_outgoing_dr_smp(struct
> ib_mad_agent_private *mad_agent_priv,
> 
>   /* No GRH for DR SMP */
>   ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
> -   (struct ib_mad *)smp,
> -   (struct ib_mad *)&mad_priv->mad);
> +   (struct ib_mad_hdr *)smp, in_mad_size,
> +   (struct ib_mad_hdr *)&mad_priv->mad,
> +   &out_mad_size);

Rather than calling device->process_mad() directly, would it be better to call 
a common function?  So we can avoid adding:

> + struct ib_mad *in_mad = (struct ib_mad *)in;
> + struct ib_mad *out_mad = (struct ib_mad *)out;
> +
> + if (in_mad_size != sizeof(*in_mad) || *out_mad_size !=
> sizeof(*out_mad))
> + return IB_MAD_RESULT_FAILURE;

to existing drivers?


>   switch (ret)
>   {
>   case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
> @@ -2011,6 +2015,7 @@ static void ib_mad_recv_done_handler(struct
> ib_mad_port_private *port_priv,
>   struct ib_mad_agent_private *mad_agent;
>   int port_num;
>   int ret = IB_MAD_RESULT_SUCCESS;
> + size_t resp_mad_size;
> 
>   mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
>   qp_info = mad_list->mad_queue->qp_info;
> @@ -2038,7 +2043,7 @@ static void ib_mad_recv_done_handler(struct
> ib_mad_port_private *port_priv,
>   if (!validate_mad(&recv->mad.mad.mad_hdr, qp_info->qp->qp_num))
>   goto out;
> 
> - response = alloc_mad_priv(port_priv->device);
> + response = alloc_mad_priv(port_priv->device, &resp_mad_size);
>   if (!response) {
>   dev_err(&port_priv->device->dev,
>   "ib_mad_recv_done_handler no memory for response
> buffer\n");
> @@ -2063,8 +2068,10 @@ static void ib_mad_recv_done_handler(struct
> ib_mad_port_private *port_priv,
>   ret = port_priv->device->process_mad(port_priv->device, 0,
>port_priv->port_num,
>wc, &recv->grh,
> -  &recv->mad.mad,
> -  &response->mad.mad);
> +  (struct ib_mad_hdr *)&recv-
> >mad.mad,
> +  port_priv->device-
> >cached_dev_attrs.max_mad_size,

This is the size of the allocated buffer.  Something based on wc.byte_len seems 
like a better option.


> +  (struct ib_mad_hdr 
> *)&response-
> >mad.mad,
> +  &resp_mad_size);
>   if (ret & IB_MAD_RESULT_SUCCESS) {
>   if (ret & IB_MAD_RESULT_CONSUMED)
>   goto out;
> @@ -2687,7 +2694,10 @@ static int ib_mad_post_receive_mads(struct
> ib_mad_qp_info *qp_info,
>   mad_priv = mad;
>   mad = NULL;
>   } else {
> - mad_priv = alloc_mad_priv(qp_info->port_priv->device);
> + size_t mad_size;
> +
> + mad_priv = alloc_mad_priv(qp_info->port_priv->device,

RE: [PATCH v4 08/19] IB/mad: Add helper function for smi_handle_dr_smp_send

2015-04-03 Thread Hefty, Sean
Reviewed-by: Sean Hefty 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 04/19] IB/mad: Change ib_response_mad signature to take ib_mad_hdr rather than ib_mad

2015-04-03 Thread Hefty, Sean
Reviewed-by: Sean Hefty 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 03/19] IB/mad: Change validate_mad signature to take ib_mad_hdr rather than ib_mad

2015-04-03 Thread Hefty, Sean
Reviewed-by: Sean Hefty 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v4 02/19] IB/core: Cache device attributes for use by upper level drivers

2015-04-03 Thread Hefty, Sean
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
> index 0d74f1d..0116e4b 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -1675,6 +1675,7 @@ struct ib_device {
>   u32  local_dma_lkey;
>   u8   node_type;
>   u8   phys_port_cnt;
> + struct ib_device_attrcached_dev_attrs;
>  };

Looking at the device attributes, I think all of the values are static for a 
given device.  If this is indeed the case, then I would just remove the word 
'cached' from the field name.  Cached makes me think of the values dynamically 
changing, and if that's the case, then this patch isn't sufficient.

Alternatively, if there's only a few values that ULPs need, maybe just store 
those.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [oss-security] RE: CVE-2014-8159 kernel: infiniband: uverbs: unprotected physical memory access

2015-04-03 Thread Dominique Martinet
Hi,

Shachar Raindel wrote on Fri, Apr 03, 2015 at 11:49:13AM +:
> > couldn't get it to work - ibv_reg_mr would return EINVAL on an address
> > obtained by mmap.
> 
> Were you mmaping a normal disk file, or was the mmap targeting an MMIO of
> another hardware device? mmap of a normal disk file should work also with
> normal memory registration, assuming you are providing the proper length.

On a proper file.
It was a year or two ago, I actually tried again now and it does seem
to work alright even on older kernels... I wonder what I did wrong back
then!

> mmap of the MMIO area of another hardware device (i.e. interfacing an FPGA,
> NVRAM, or similar things) requires some code changes on both sides. The
> current kernel code in the infiniband side is using get_user_pages, which
> does not support MMIO pages. The proposed PeerDirect patches [1] allows peer
> device to declare ownership of virtual address ranges, and enable such
> registration. However, these patches are have not yet been merged upstream.

Interesting, I don't need this right now but it's good to know.

> > Conceptually as well I'm not sure how it's supposed to work, mmap should
> > only actually issue reads when memory access issues page faults (give or
> > take suitable readahead logic), but I don't think direct memory access
> > from the IB/RDMA adapter would issue such page faults ?
> 
> You are correct. RDMA adapters without ODP support do not issue page faults.
> Instead, during memory registration, the ib_umem code calls get_user_pages,
> which ensures all relevant pages are in memory, and pins them as needed.
> 
> > Likewise on writes, would need the kernel to notice memory has been
> > written and pages are dirty/needs flushing.
> > 
> 
> Similarly, when deregistering a writable memory region, the kernel driver
> marks all pages as dirty before unpinning them. You can see the code doing
> this in [2].

Ok this makes sense, thanks for clearing it up.

> Liran Liss gave a presentation about ODP at OFA [3]. The technology is
> available for ConnectIB devices using the most recent firmware and kernel
> versions above 3.19.

I don't have any recent kernel around, but I'll give it a shot next
week.
(I'm working on a userspace file server, nfs-ganesha, so being able to
mmap a file and use it directly for I/O is very promising for me!)


> [1] http://www.spinics.net/lists/linux-rdma/msg21770.html
> [2] http://lxr.free-electrons.com/source/drivers/infiniband/core/umem.c#L62
> [3] 
> https://www.openfabrics.org/images/Workshops_2014/DevWorkshop/presos/Tuesday/pdf/09.30_2014_OFA_Workshop_ODP_update_final.pdf
>  and https://www.youtube.com/watch?v=KbrlsXQbHCw


Thanks for the detailed reply, I'll dig a bit further and come back
straight to the list if need to.

-- 
Dominique Martinet
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: CVE-2014-8159 kernel: infiniband: uverbs: unprotected physical memory access

2015-04-03 Thread Yann Droneaud
Hi,

Le vendredi 03 avril 2015 à 08:39 +, Haggai Eran a écrit :
> On Thursday, April 2, 2015 11:40 PM, Yann Droneaud  
> wrote:
> > Le jeudi 02 avril 2015 à 16:44 +, Shachar Raindel a écrit :
> >> > -Original Message-
> >> > From: Yann Droneaud [mailto:ydrone...@opteya.com]
> >> > Sent: Thursday, April 02, 2015 7:35 PM
> > 
> >> > Another related question: as the large memory range could be registered
> >> > by user space with ibv_reg_mr(pd, base, size, IB_ACCESS_ON_DEMAND),
> >> > what's prevent the kernel to map a file as the result of mmap(0, ...)
> >> > in this  region, making it available remotely through IBV_WR_RDMA_READ /
> >> > IBV_WR_RDMA_WRITE ?
> >> >
> >>
> >> This is not a bug. This is a feature.
> >>
> >> Exposing a file through RDMA, using ODP, can be done exactly like this.
> >> Given that the application explicitly requested this behavior, I don't
> >> see why it is a problem.
> > 
> > If the application cannot choose what will end up in the region it has
> > registered, it's an issue !
> > 
> > What might happen if one library in a program call mmap(0, size, ...) to
> > load a file storing a secret (a private key), and that file ends up
> > being mapped in an registered but otherwise free region (afaict, the
> > kernel is allowed to do it) ?
> > What might happen if one library in a program call call mmap(0,
> > size, ..., MAP_ANONYMOUS,...) to allocate memory, call mlock(), then
> > write in this location a secret (a passphrase), and that area ends up
> > in the memory region registered for on demand paging ?
> > 
> > The application haven't choose to disclose these confidential piece of
> > information, but they are available for reading/writing by remote
> > through RDMA given it knows the rkey of the memory region (which is a
> > 32bits value).
> > 
> > I hope I'm missing something, because I'm not feeling confident such
> > behavor is a feature.
> 
> What we are aiming for is the possibility to register the entire process' 
> address 
> space for RDMA operations (if the process chooses to use this feature).
> This is similar to multiple threads accessing the same address space. I'm 
> sure 
> you wouldn't be complaining about the ability of one thread to access the 
> secret 
> passphrase mmapped by another thread in your example.
> 
> > I'm trying to understand how the application can choose what is exposed
> > through RDMA if it registers a very large memory region for later use
> > (but do not actually explicitly map something there yet): what's the
> > consequences ?
> > 
> >void *start = sbrk(0);
> >size_t size = ULONG_MAX - (unsigned long)start;
> > 
> >ibv_reg_mr(pd, start, size, IB_ACCESS_ON_DEMAND)
> 
> The consequences are exactly as you wrote. Just as giving a non-ODP rkey 
> to a remote node allows the node to access the registered memory behind that 
> rkey, giving an ODP rkey to a remote node allows that node to access the 
> virtual address space behind that rkey.
> 

There's a difference: it's impossible to give a valid non-ODP rkey that
point to a memory region not already mapped (backed by a file for 
example), so the application *choose* the content of the memory to be
made accessible remotely before making it accessible.

As I understand the last explanation regarding ODP, at creation time,
an ODP rkey can point to a free, unused, unallocated memory portion.
At this point the kernel can happily map anything the application
(and its libraries) want to map at a (almost) *random* address that
could be in (or partially in) the ODP memory region.

And I have a problem with such random behavior. Allowing this seems
dangerous and should be done with care.

I believe the application must keep control of what ends up in its 
ODP registered memory region.

Especially for multi thread program: imagine one thread creating a large
memory region for its future purposes, then send the rkey to a remote 
peer and wait for some work to be done.
In the mean time another call mmap(0, ...) to map a file at a kernel 
chosen address, and that address happen to be in the memory region 
registered by the other thread:

1) the first thread is amputated from a portion of memory it was 
willing to use;
2) the data used by the second thread is accessible to the remote 
peer(s) while not expected.

Speculatively registering memory seems dangerous for any use case I
could think of.

Regards.

-- 
Yann Droneaud
OPTEYA


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [oss-security] RE: CVE-2014-8159 kernel: infiniband: uverbs: unprotected physical memory access

2015-04-03 Thread Shachar Raindel
Hi Dominique,

> -Original Message-
> From: Dominique Martinet [mailto:dominique.marti...@cea.fr]
> Sent: Thursday, April 02, 2015 8:44 PM
> To: Shachar Raindel
> Subject: Re: [oss-security] RE: CVE-2014-8159 kernel: infiniband:
> uverbs: unprotected physical memory access
> 
> Hi,
> 
> Shachar Raindel wrote on Thu, Apr 02, 2015:
> > From: Yann Droneaud [mailto:ydrone...@opteya.com]
> > > Another related question: as the large memory range could be
> registered
> > > by user space with ibv_reg_mr(pd, base, size, IB_ACCESS_ON_DEMAND),
> > > what's prevent the kernel to map a file as the result of mmap(0,
> ...)
> > > in this  region, making it available remotely through
> IBV_WR_RDMA_READ /
> > > IBV_WR_RDMA_WRITE ?
> > >
> >
> > This is not a bug. This is a feature.
> >
> > Exposing a file through RDMA, using ODP, can be done exactly like
> this.
> > Given that the application explicitly requested this behavior, I don't
> > see why it is a problem. Actually, some of our tests use such flows.
> > The mmu notifiers mechanism allow us to do this safely. When the page
> is
> > written back to disk, it is removed from the ODP mapping. When it is
> > accessed by the HCA, it is brought back to RAM.
> 
> Forgive the private reply, but I've actually tried that a while ago and

No problem, better than e-mailing the entire world on RDMA specific topics.
As this discussion is relevant to the Linux RDMA users, I'm adding back
the linux-rdma mailing list.

> couldn't get it to work - ibv_reg_mr would return EINVAL on an address
> obtained by mmap.

Were you mmaping a normal disk file, or was the mmap targeting an MMIO of
another hardware device? mmap of a normal disk file should work also with
normal memory registration, assuming you are providing the proper length.

mmap of the MMIO area of another hardware device (i.e. interfacing an FPGA,
NVRAM, or similar things) requires some code changes on both sides. The
current kernel code in the infiniband side is using get_user_pages, which
does not support MMIO pages. The proposed PeerDirect patches [1] allows peer
device to declare ownership of virtual address ranges, and enable such
registration. However, these patches are have not yet been merged upstream.

> 
> Conceptually as well I'm not sure how it's supposed to work, mmap should
> only actually issue reads when memory access issues page faults (give or
> take suitable readahead logic), but I don't think direct memory access
> from the IB/RDMA adapter would issue such page faults ?

You are correct. RDMA adapters without ODP support do not issue page faults.
Instead, during memory registration, the ib_umem code calls get_user_pages,
which ensures all relevant pages are in memory, and pins them as needed.

> Likewise on writes, would need the kernel to notice memory has been
> written and pages are dirty/needs flushing.
> 

Similarly, when deregistering a writable memory region, the kernel driver
marks all pages as dirty before unpinning them. You can see the code doing
this in [2].

> So, what am I missing? I'd wager it's that "ODP" you're talking about,
> do you have any documentation I could skim through ?
> 

Liran Liss gave a presentation about ODP at OFA [3]. The technology is
available for ConnectIB devices using the most recent firmware and kernel
versions above 3.19.

Thanks,
--Shachar

[1] http://www.spinics.net/lists/linux-rdma/msg21770.html
[2] http://lxr.free-electrons.com/source/drivers/infiniband/core/umem.c#L62
[3] 
https://www.openfabrics.org/images/Workshops_2014/DevWorkshop/presos/Tuesday/pdf/09.30_2014_OFA_Workshop_ODP_update_final.pdf
 and https://www.youtube.com/watch?v=KbrlsXQbHCw




Re: CVE-2014-8159 kernel: infiniband: uverbs: unprotected physical memory access

2015-04-03 Thread Haggai Eran
On Thursday, April 2, 2015 11:40 PM, Yann Droneaud  wrote:
> Le jeudi 02 avril 2015 à 16:44 +, Shachar Raindel a écrit :
>> > -Original Message-
>> > From: Yann Droneaud [mailto:ydrone...@opteya.com]
>> > Sent: Thursday, April 02, 2015 7:35 PM
> 
>> > Another related question: as the large memory range could be registered
>> > by user space with ibv_reg_mr(pd, base, size, IB_ACCESS_ON_DEMAND),
>> > what's prevent the kernel to map a file as the result of mmap(0, ...)
>> > in this  region, making it available remotely through IBV_WR_RDMA_READ /
>> > IBV_WR_RDMA_WRITE ?
>> >
>>
>> This is not a bug. This is a feature.
>>
>> Exposing a file through RDMA, using ODP, can be done exactly like this.
>> Given that the application explicitly requested this behavior, I don't
>> see why it is a problem.
> 
> If the application cannot choose what will end up in the region it has
> registered, it's an issue !
> 
> What might happen if one library in a program calls mmap(0, size, ...) to
> load a file storing a secret (a private key), and that file ends up
> being mapped in a registered but otherwise free region (afaict, the
> kernel is allowed to do it) ?
> What might happen if one library in a program calls mmap(0,
> size, ..., MAP_ANONYMOUS,...) to allocate memory, call mlock(), then
> write in this location a secret (a passphrase), and that area ends up
> in the memory region registered for on demand paging ?
> 
> The application hasn't chosen to disclose these confidential pieces of
> information, but they are available for reading/writing by remote
> through RDMA given it knows the rkey of the memory region (which is a
> 32bits value).
> 
> I hope I'm missing something, because I'm not feeling confident such
> behavior is a feature.

What we are aiming for is the possibility to register the entire process' 
address 
space for RDMA operations (if the process chooses to use this feature).
This is similar to multiple threads accessing the same address space. I'm sure 
you wouldn't be complaining about the ability of one thread to access the 
secret 
passphrase mmapped by another thread in your example.

> I'm trying to understand how the application can choose what is exposed
> through RDMA if it registers a very large memory region for later use
> (but does not actually explicitly map something there yet): what's the
> consequences ?
> 
>void *start = sbrk(0);
>size_t size = ULONG_MAX - (unsigned long)start;
> 
>ibv_reg_mr(pd, start, size, IB_ACCESS_ON_DEMAND)

The consequences are exactly as you wrote. Just as giving a non-ODP rkey 
to a remote node allows the node to access the registered memory behind that 
rkey, giving an ODP rkey to a remote node allows that node to access the 
virtual address space behind that rkey.

Regards,
Haggai--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html