Re: [PATCH] ib_ipoib: Scatter-Gather support in connected mode

2015-01-28 Thread Yann Droneaud
Hi,

Le mardi 27 janvier 2015 à 03:21 -0800, Yuval Shaia a écrit :
> With this fix Scatter-Gather will be supported also in connected mode
> 

Please explain the issue with current kernel and the advantage
of the proposed fix in the commit message. Perhaps benchmarks
against various HCAs would be welcome.

> Signed-off-by: Yuval Shaia 
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h  |8 +--
>  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 
> +++--
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |3 +-
>  drivers/infiniband/ulp/ipoib/ipoib_main.c |3 +-
>  4 files changed, 91 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h 
> b/drivers/infiniband/ulp/ipoib/ipoib.h
> index d7562be..fb42834 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -170,11 +170,6 @@ struct ipoib_tx_buf {
>   u64 mapping[MAX_SKB_FRAGS + 1];
>  };
>  
> -struct ipoib_cm_tx_buf {
> - struct sk_buff *skb;
> - u64 mapping;
> -};
> -
>  struct ib_cm_id;
>  
>  struct ipoib_cm_data {
> @@ -233,7 +228,7 @@ struct ipoib_cm_tx {
>   struct net_device   *dev;
>   struct ipoib_neigh  *neigh;
>   struct ipoib_path   *path;
> - struct ipoib_cm_tx_buf *tx_ring;
> + struct ipoib_tx_buf *tx_ring;
>   unsigned tx_head;
>   unsigned tx_tail;
>   unsigned longflags;
> @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int 
> flush);
>  
>  void ipoib_mcast_dev_down(struct net_device *dev);
>  void ipoib_mcast_dev_flush(struct net_device *dev);
> +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
>  
>  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
>  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index 933efce..056e4d2 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv 
> *priv, int frags,
>   ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, 
> DMA_FROM_DEVICE);
>  }
>  
> +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
> +   struct ipoib_tx_buf *tx_req)
> +{
> + struct sk_buff *skb;
> + int i, offs;
> +
> + skb = tx_req->skb;
> + if (skb_shinfo(skb)->nr_frags) {
> + offs = 0;
> + if (skb_headlen(skb)) {
> + ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
> + skb_headlen(skb), DMA_TO_DEVICE);
> + offs = 1;
> + }
> + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
> + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +
> + ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs],
> + skb_frag_size(frag), DMA_TO_DEVICE);
> + }
> + } else
> + ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
> + DMA_TO_DEVICE);
> +}
> +
>  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
>  {
>   struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
>   return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
>  }
>  
> +static inline int post_send_sg(struct ipoib_dev_priv *priv,
> +struct ipoib_cm_tx *tx,
> +unsigned int wr_id,
> +struct sk_buff *skb,
> +u64 mapping[MAX_SKB_FRAGS + 1])
> +{
> + struct ib_send_wr *bad_wr;
> + int i, off;
> + skb_frag_t *frags = skb_shinfo(skb)->frags;
> + int nr_frags = skb_shinfo(skb)->nr_frags;
> +
> + if (skb_headlen(skb)) {
> + priv->tx_sge[0].addr = mapping[0];
> + priv->tx_sge[0].length = skb_headlen(skb);
> + off = 1;
> + } else
> + off = 0;
> +
> + for (i = 0; i < nr_frags; ++i) {
> + priv->tx_sge[i + off].addr = mapping[i + off];
> + priv->tx_sge[i + off].length = frags[i].size;
> + }
> + priv->tx_wr.num_sge = nr_frags + off;
> + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
> +
> + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> +}
> +
>  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct 
> ipoib_cm_tx *tx)
>  {
>   struct ipoib_dev_priv *priv = netdev_priv(dev);
> - struct ipoib_cm_tx_buf *tx_req;
> - u64 addr;
> + struct ipoib_tx_buf *tx_req;
> + u64 addr = 0;
>   int rc;
>  
>   if (unlikely(skb->len > tx->mtu)) {
> @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct 
> sk_bu

Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask

2015-01-28 Thread Yann Droneaud
Hi,

Le mardi 27 janvier 2015 à 08:50 +0200, Haggai Eran a écrit :
> On 26/01/2015 13:17, Yann Droneaud wrote:
> > ...
> > Le dimanche 25 janvier 2015 à 17:23 +0200, Haggai Eran a écrit :
> >> On 22/01/2015 15:28, Yann Droneaud wrote:
> >>> This patch ensures the extended QUERY_DEVICE uverbs request's
> >>> comp_mask has only known values. If userspace returns unknown
> >>> features, -EINVAL will be returned, allowing to probe/discover
> >>> which features are currently supported by the kernel.
> >>
> >> This probing process will be much more cumbersome than it needs to be
> >> because userspace will have to call QUERY_DEVICE repetitively with
> >> different comp_mask fields until it finds out the exact set of supported
> >> bits.
> >>
> > 
> > O(log2(N))
> 
> I don't think user space developers will be happy having to do trial and 
> error to determine what features the kernel driver supports. It might be 
> even more then O(log2(N)). If my understanding of comp_mask bits usage is 
> correct it would O(N). But it's not the time complexity I'm worried about,
> it's the fact that it requires user-space developers to go through hoops in
> order to get information that can be much more easily exported.
> 

But there's currently *NO* such mean that could give a hint to userspace
developer whether one bit of request's comp_mask is currently effective
in extended QUERY_DEVICE uverbs. While we have such mean in CREATE_FLOW
and DESTROY_FLOW: unsupported bits trigger -EINVAL.

In QUERY_DEVICE, userspace developer is allowed to set request's 
comp_mask to -1: it won't hurt it on current kernel, so why not using 
this value or any other random value ? The program will run: it's "Works
For Me".

But the same program (either binary or source code) might fail on
newer kernel where some bits in comp_mask gain a meaning not supported
by the program.

> > Or you had to add a different interface, dedicated to retrieve the exact
> > supported feature mask.
> > 
> >>> Moreover, it also ensure the requested features set in comp_mask
> >>> are sequentially set, not skipping intermediate features, eg. the
> >>> "highest" requested feature also request all the "lower" ones.
> >>> This way each new feature will have to be stacked on top of the
> >>> existing ones: this is especially important for the request and
> >>> response data structures where fields are added after the
> >>> current ones when expanded to support a new feature.
> >>
> >> I think it is perfectly acceptable that not all drivers will implement
> >> all available features, and so you shouldn't enforce this requirement.
> > 
> > With regard to QUERY_DEVICE: the data structure layout depends on the
> > comp_mask value. So either you propose a way to express multipart data
> > structure (see CMSG or NETLINK), or we have to ensure the data structure
> > is ever-growing, with each new chunck stacked over the existing ones:
> > that's the purpose of :
> > 
> > if (cmd.comp_mask & (cmd.comp_mask + 1))
> > return -EINVAL;
> > 
> >> Also, it makes the comp_mask nothing more than a wasteful version number
> >> between 0 and 63.
> > 
> > That's what I've already explained earlier in "Re: [PATCH v3 06/17]
> > IB/core: Add support for extended query device caps":
> > 
> > http://mid.gmane.org/1421844612.13543.40.ca...@opteya.com
> 
> Yes, you wrote there:
> > Regarding comp_mask (not for this particular verb):
> > 
> > It's not clear whether request's comp_mask describe the request or the
> > response, as such I'm puzzled.
> > 
> > How would the kernel and the userspace be able to parse the request and
> > the response if they ignore unknown bits ?
> > 
> > How would they be able to skip the unrecognized chunk of the request and
> > response buffer ?
> > 
> > How one bit in a comp_mask is related to a chunk in the request or
> > response ?
> > 
> > It's likely the kernel or userspace would have to skip the remaining
> > comp_mask's bits after encountering an unknown bit as the size of the
> > corresponding chunk in request or response would be unknown, making
> > impossible to locate the corresponding chunk for the next bit set in
> > comp_mask. Having said that, comp_mask is just a complicated way of
> > expressing a version, which is turn describe a size (ever growing).
> 
> It is my understanding that each comp_mask bit marks a set of fields in 
> the command or in the response struct as valid, so the struct format 
> remains the same and the kernel and userspace don't need to make 
> difficult calculation as to where each field is, but you can still pass 
> a high bit set in comp_mask with one of the lower bits cleared.
> 

How can the struct format remain the same, if some fields are added
to implement newer feature ?

> I couldn't find this explicit detail in the mailing list, but I did found
> a presentation that was presented in OFA International Developer 
> Workshop 2013 [1], that gave an example of of an verb where each 
> comp_mask bit marked a dif

[PATCH infiniband-diags] Align infiniband-diags MLNX device IDs with OpenSM

2015-01-28 Thread Hal Rosenstock

Same device IDs as in opensm/osm_subnet.c:is_mlnx_ext_port_info_supported

Switch-IB is device ID 0xcb20 rather than 0xcb84

Signed-off-by: Hal Rosenstock 
---
diff --git a/libibnetdisc/src/ibnetdisc.c b/libibnetdisc/src/ibnetdisc.c
index f3c6000..e346905 100644
--- a/libibnetdisc/src/ibnetdisc.c
+++ b/libibnetdisc/src/ibnetdisc.c
@@ -203,7 +203,7 @@ static int is_mlnx_ext_port_info_supported(ibnd_port_t * 
port)
 {
uint16_t devid = (uint16_t) mad_get_field(port->node->info, 0, 
IB_NODE_DEVID_F);
 
-   if (devid == 0xc738 || devid == 0xcb84)
+   if ((devid >= 0xc738 && devid <= 0xc73b) || devid == 0xcb20)
return 1;
if (devid >= 0x1003 && devid <= 0x1016)
return 1;
diff --git a/src/ibdiag_common.c b/src/ibdiag_common.c
index 666edbc..e09623d 100644
--- a/src/ibdiag_common.c
+++ b/src/ibdiag_common.c
@@ -530,7 +530,7 @@ int is_port_info_extended_supported(ib_portid_t * dest, int 
port,
 int is_mlnx_ext_port_info_supported(uint32_t devid)
 {
if (ibd_ibnetdisc_flags & IBND_CONFIG_MLX_EPI) {
-   if (devid == 0xc738 || devid == 0xcb84)
+   if ((devid >= 0xc738 && devid <= 0xc73b) || devid == 0xcb20)
return 1;
if (devid >= 0x1003 && devid <= 0x1016)
return 1;
diff --git a/src/vendstat.c b/src/vendstat.c
index 7fc4a11..11ee73e 100644
--- a/src/vendstat.c
+++ b/src/vendstat.c
@@ -144,8 +144,8 @@ typedef struct {
 
 static uint16_t ext_fw_info_device[][2] = {
{0x0245, 0x0245},   /* Switch-X */
-   {0xc738, 0xc738},   /* Switch-X */
-   {0xcb84, 0xcb84},   /* Switch-IB */
+   {0xc738, 0xc73b},   /* Switch-X */
+   {0xcb20, 0xcb20},   /* Switch-IB */
{0x01b3, 0x01b3},   /* IS-4 */
{0x1003, 0x1016},   /* Connect-X */
{0x, 0x}};
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 state for BasePort0 switches

2015-01-28 Thread Weiny, Ira
> >>
> >> +static int is_route_inactive_port0(Node * node, Port * port, Switch
> >> +*
> >> +sw) {
> >
> > I would prefer this function be named something like:
> >
> > port_inactive_not_bsp0
> >
> > The current name is confusing as to what the logic is.
> 
> How about is_port_inactive ?

That is fine.

> 
> >> +  int res = 0;
> >> +  /* not active for HCA and for enhanced port0 Switches */
> >> +  if (port->state != 4 &&
> >
> > Please use #defines here even though the original code did not.
> 
> This is an issue throughout infiniband-diags. I think it should be addressed 
> by
> separate patch(es) for this.

I would rather see us do the right thing going forward.  This saves work trying 
to find all these majic numbers later.

Ira


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 infiniband-diags] ibtracert.c: Remove checking the port 0 state for base switch port 0

2015-01-28 Thread Hal Rosenstock
From: Dan Ben Yosef 

The port 0 state check needed to be only for HCA/Routers and
EnhancedPort0 switches.
For BasePort0 switches, the PortState field, in PortInfo arrtibute, for port0 is
undefined.

Signed-off-by: Dan Ben Yosef 
---
Changes since v1:
Changed is_route_inactive_port0 to is_port_inactive
Moved description from above use of routine to above routine (is_port_inactive)

diff --git a/src/ibtracert.c b/src/ibtracert.c
index d32968b..9414618 100644
--- a/src/ibtracert.c
+++ b/src/ibtracert.c
@@ -92,6 +92,7 @@ struct Switch {
int mccap;
int linearFDBtop;
int fdb_base;
+   int enhsp0;
int8_t fdb[64];
char switchinfo[64];
 };
@@ -114,6 +115,24 @@ struct Node {
 Node *nodesdist[MAXHOPS];
 uint64_t target_portguid;
 
+/*
+ * is_port_inactive
+ * Checks whether or not the port state is other than active.
+ * The "sw" argument is only relevant when the port is on a
+ * switch; for HCAs and routers, this argument is ignored.
+ * Returns 1 when port is not active and 0 when active.
+ * Base switch port 0 is considered always active.
+ */
+static int is_port_inactive(Node * node, Port * port, Switch * sw)
+{
+   int res = 0;
+   if (port->state != 4 &&
+   (node->type != IB_NODE_SWITCH ||
+(node->type == IB_NODE_SWITCH && sw->enhsp0)))
+   res = 1;
+   return res;
+}
+
 static int get_node(Node * node, Port * port, ib_portid_t * portid)
 {
void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc;
@@ -164,6 +183,7 @@ static int switch_lookup(Switch * sw, ib_portid_t * portid, 
int lid)
 
mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap);
mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw->linearFDBtop);
+   mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0);
 
if (lid >= sw->linearcap && lid > sw->linearFDBtop)
return -1;
@@ -248,7 +268,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * to, 
int dump)
Port *port, fromport, toport, nextport;
Switch sw;
int maxhops = MAXHOPS;
-   int portnum, outport;
+   int portnum, outport = 255, next_sw_outport = 255;
 
memset(&fromnode,0,sizeof(Node));
memset(&tonode,0,sizeof(Node));
@@ -274,20 +294,25 @@ static int find_route(ib_portid_t * from, ib_portid_t * 
to, int dump)
portnum = port->portnum;
 
dump_endnode(dump, "From", node, port);
+   if (node->type == IB_NODE_SWITCH) {
+   next_sw_outport = switch_lookup(&sw, from, to->lid);
+   if (next_sw_outport < 0 || next_sw_outport > node->numports) {
+   /* needed to print the port in badtbl */
+   outport = next_sw_outport;
+   goto badtbl;
+   }
+   }
 
while (maxhops--) {
-   if (port->state != 4)
+   if (is_port_inactive(node, port, &sw))
goto badport;
 
if (sameport(port, &toport))
break;  /* found */
 
-   outport = portnum;
if (node->type == IB_NODE_SWITCH) {
DEBUG("switch node");
-   if ((outport = switch_lookup(&sw, from, to->lid)) < 0 ||
-   outport > node->numports)
-   goto badtbl;
+   outport = next_sw_outport;
 
if (extend_dpath(&from->drpath, outport) < 0)
goto badpath;
@@ -307,6 +332,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * to, 
int dump)
   (node->type == IB_NODE_ROUTER)) {
int ca_src = 0;
 
+   outport = portnum;
DEBUG("ca or router node");
if (!sameport(port, &fromport)) {
IBWARN
@@ -335,8 +361,19 @@ static int find_route(ib_portid_t * from, ib_portid_t * 
to, int dump)
nextport.portnum =
from->drpath.p[from->drpath.cnt + 1];
}
+   /* only if the next node is a switch, get switch info */
+   if (nextnode.type == IB_NODE_SWITCH) {
+   next_sw_outport = switch_lookup(&sw, from, to->lid);
+   if (next_sw_outport < 0 ||
+   next_sw_outport > nextnode.numports) {
+   /* needed to print the port in badtbl */
+   outport = next_sw_outport;
+   goto badtbl;
+   }
+   }
+
port = &nextport;
-   if (port->state != 4)
+   if (is_port_inactive(&nextnode, port, &sw))
goto badoutport;
node = &nextnode;
portnum = port->portnum;
--
To unsubscribe 

Re: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 state for BasePort0 switches

2015-01-28 Thread Hal Rosenstock
On 1/28/2015 1:27 PM, Weiny, Ira wrote:
> Sorry for the delay.
> 
>> -Original Message-
>> From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
>> ow...@vger.kernel.org] On Behalf Of Hal Rosenstock
>> Sent: Tuesday, January 20, 2015 5:00 AM
>> To: Weiny, Ira
>> Cc: Dan Ben-Yosef; linux-rdma (linux-rdma@vger.kernel.org)
>> Subject: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 
>> state
>> for BasePort0 switches
>>
>> From: Dan Ben Yosef 
>>
>> The port 0 state check is needed only for HCA/Routers and
>> EnhancedPort0 switches.
>>
>> For BasePort0 switches, the PortState field in the PortInfo attribute for 
>> port0 is
>> undefined (not used).
>>
>> Signed-off-by: Dan Ben Yosef 
>> ---
>>  src/ibtracert.c |   47 ---
>>  1 files changed, 40 insertions(+), 7 deletions(-)
>>
>> diff --git a/src/ibtracert.c b/src/ibtracert.c index d32968b..326200b 100644
>> --- a/src/ibtracert.c
>> +++ b/src/ibtracert.c
>> @@ -92,6 +92,7 @@ struct Switch {
>>  int mccap;
>>  int linearFDBtop;
>>  int fdb_base;
>> +int enhsp0;
>>  int8_t fdb[64];
>>  char switchinfo[64];
>>  };
>> @@ -114,6 +115,17 @@ struct Node {
>>  Node *nodesdist[MAXHOPS];
>>  uint64_t target_portguid;
>>
>> +static int is_route_inactive_port0(Node * node, Port * port, Switch *
>> +sw) {
> 
> I would prefer this function be named something like:
> 
> port_inactive_not_bsp0
> 
> The current name is confusing as to what the logic is.

How about is_port_inactive ?

>> +int res = 0;
>> +/* not active for HCA and for enhanced port0 Switches */
>> +if (port->state != 4 &&
> 
> Please use #defines here even though the original code did not.

This is an issue throughout infiniband-diags. I think it should be
addressed by separate patch(es) for this.

>> +(node->type != IB_NODE_SWITCH ||
>> + (node->type == IB_NODE_SWITCH && sw->enhsp0)))
>> +res = 1;
>> +return res;
>> +}
>> +
>>  static int get_node(Node * node, Port * port, ib_portid_t * portid)  {
>>  void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc;
>> @@ -164,6 +176,7 @@ static int switch_lookup(Switch * sw, ib_portid_t *
>> portid, int lid)
>>
>>  mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap);
>>  mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw-
>>> linearFDBtop);
>> +mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0);
>>
>>  if (lid >= sw->linearcap && lid > sw->linearFDBtop)
>>  return -1;
>> @@ -248,7 +261,7 @@ static int find_route(ib_portid_t * from, ib_portid_t *
>> to, int dump)
>>  Port *port, fromport, toport, nextport;
>>  Switch sw;
>>  int maxhops = MAXHOPS;
>> -int portnum, outport;
>> +int portnum, outport = 255, next_sw_outport = 255;
>>
>>  memset(&fromnode,0,sizeof(Node));
>>  memset(&tonode,0,sizeof(Node));
>> @@ -274,20 +287,28 @@ static int find_route(ib_portid_t * from, ib_portid_t *
>> to, int dump)
>>  portnum = port->portnum;
>>
>>  dump_endnode(dump, "From", node, port);
>> +if (node->type == IB_NODE_SWITCH) {
>> +next_sw_outport = switch_lookup(&sw, from, to->lid);
>> +if (next_sw_outport < 0 || next_sw_outport > node->numports)
>> {
>> +/* needed to print the port in badtbl */
>> +outport = next_sw_outport;
>> +goto badtbl;
>> +}
>> +}
>>
>>  while (maxhops--) {
>> -if (port->state != 4)
>> +/* checking if the port state isn't active.
>> + * The "sw" argument is only relevant when the port is a
>> + * switch port for HCAs this argument is ignored */
> 
> This comment should go above the function signature and be enhanced to 
> clarify that it is checking for port state inactive except for BSP0 which has 
> no state.

Will do that in next version of the patch.

-- Hal

> 
> Ira
> 
> 
>> +if (is_route_inactive_port0(node, port, &sw))
>>  goto badport;
>>
>>  if (sameport(port, &toport))
>>  break;  /* found */
>>
>> -outport = portnum;
>>  if (node->type == IB_NODE_SWITCH) {
>>  DEBUG("switch node");
>> -if ((outport = switch_lookup(&sw, from, to->lid)) < 0 ||
>> -outport > node->numports)
>> -goto badtbl;
>> +outport = next_sw_outport;
>>
>>  if (extend_dpath(&from->drpath, outport) < 0)
>>  goto badpath;
>> @@ -307,6 +328,7 @@ static int find_route(ib_portid_t * from, ib_portid_t *
>> to, int dump)
>> (node->type == IB_NODE_ROUTER)) {
>>  int ca_src = 0;
>>
>> +outport = portnum;
>>  DEBUG("ca or router node");
>>  if (!sameport(por

Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask

2015-01-28 Thread Haggai Eran
On 28/01/2015 15:19, Yann Droneaud wrote:
> Hi,
> 
> Le mardi 27 janvier 2015 à 08:50 +0200, Haggai Eran a écrit :
>> On 26/01/2015 13:17, Yann Droneaud wrote:
>>> ...
>>> Le dimanche 25 janvier 2015 à 17:23 +0200, Haggai Eran a écrit :
 On 22/01/2015 15:28, Yann Droneaud wrote:
> This patch ensures the extended QUERY_DEVICE uverbs request's
> comp_mask has only known values. If userspace returns unknown
> features, -EINVAL will be returned, allowing to probe/discover
> which features are currently supported by the kernel.

 This probing process will be much more cumbersome than it needs to be
 because userspace will have to call QUERY_DEVICE repetitively with
 different comp_mask fields until it finds out the exact set of supported
 bits.

>>>
>>> O(log2(N))
>>
>> I don't think user space developers will be happy having to do trial and 
>> error to determine what features the kernel driver supports. It might be 
>> even more then O(log2(N)). If my understanding of comp_mask bits usage is 
>> correct it would O(N). But it's not the time complexity I'm worried about,
>> it's the fact that it requires user-space developers to go through hoops in
>> order to get information that can be much more easily exported.
>>
> 
> But there's currently *NO* such mean that could give a hint to userspace
> developer whether one bit of request's comp_mask is currently effective
> in extended QUERY_DEVICE uverbs. While we have such mean in CREATE_FLOW
> and DESTROY_FLOW: unsupported bits trigger -EINVAL.
> 
> In QUERY_DEVICE, userspace developer is allowed to set request's 
> comp_mask to -1: it won't hurt it on current kernel, so why not using 
> this value or any other random value ? The program will run: it's "Works
> For Me".
> 
> But the same program (either binary or source code) might fail on
> newer kernel where some bits in comp_mask gain a meaning not supported
> by the program.

Why don't we make the command comp_mask in QUERY_DEVICE zero in both
versions. The behavior of the command with comp_mask = 0 will be to
return the maximum amount of data that fits in the response buffer. The
kernel will return -EINVAL if the input command comp_mask is not zero in
the current version.

If in the future we want to alter the behavior or add more input fields
to QUERY_DEVICE, we would use new bits.

>>> Or you had to add a different interface, dedicated to retrieve the exact
>>> supported feature mask.
>>>
> Moreover, it also ensure the requested features set in comp_mask
> are sequentially set, not skipping intermediate features, eg. the
> "highest" requested feature also request all the "lower" ones.
> This way each new feature will have to be stacked on top of the
> existing ones: this is especially important for the request and
> response data structures where fields are added after the
> current ones when expanded to support a new feature.

 I think it is perfectly acceptable that not all drivers will implement
 all available features, and so you shouldn't enforce this requirement.
>>>
>>> With regard to QUERY_DEVICE: the data structure layout depends on the
>>> comp_mask value. So either you propose a way to express multipart data
>>> structure (see CMSG or NETLINK), or we have to ensure the data structure
>>> is ever-growing, with each new chunck stacked over the existing ones:
>>> that's the purpose of :
>>>
>>> if (cmd.comp_mask & (cmd.comp_mask + 1))
>>> return -EINVAL;
>>>
 Also, it makes the comp_mask nothing more than a wasteful version number
 between 0 and 63.
>>>
>>> That's what I've already explained earlier in "Re: [PATCH v3 06/17]
>>> IB/core: Add support for extended query device caps":
>>>
>>> http://mid.gmane.org/1421844612.13543.40.ca...@opteya.com
>>
>> Yes, you wrote there:
>>> Regarding comp_mask (not for this particular verb):
>>>
>>> It's not clear whether request's comp_mask describe the request or the
>>> response, as such I'm puzzled.
>>>
>>> How would the kernel and the userspace be able to parse the request and
>>> the response if they ignore unknown bits ?
>>>
>>> How would they be able to skip the unrecognized chunk of the request and
>>> response buffer ?
>>>
>>> How one bit in a comp_mask is related to a chunk in the request or
>>> response ?
>>>
>>> It's likely the kernel or userspace would have to skip the remaining
>>> comp_mask's bits after encountering an unknown bit as the size of the
>>> corresponding chunk in request or response would be unknown, making
>>> impossible to locate the corresponding chunk for the next bit set in
>>> comp_mask. Having said that, comp_mask is just a complicated way of
>>> expressing a version, which is turn describe a size (ever growing).
>>
>> It is my understanding that each comp_mask bit marks a set of fields in 
>> the command or in the response struct as valid, so the struct format 
>> remains the same and the kernel and userspace don't need 

RE: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 state for BasePort0 switches

2015-01-28 Thread Weiny, Ira
Sorry for the delay.

> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
> ow...@vger.kernel.org] On Behalf Of Hal Rosenstock
> Sent: Tuesday, January 20, 2015 5:00 AM
> To: Weiny, Ira
> Cc: Dan Ben-Yosef; linux-rdma (linux-rdma@vger.kernel.org)
> Subject: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 
> state
> for BasePort0 switches
> 
> From: Dan Ben Yosef 
> 
> The port 0 state check is needed only for HCA/Routers and
> EnhancedPort0 switches.
> 
> For BasePort0 switches, the PortState field in the PortInfo attribute for 
> port0 is
> undefined (not used).
> 
> Signed-off-by: Dan Ben Yosef 
> ---
>  src/ibtracert.c |   47 ---
>  1 files changed, 40 insertions(+), 7 deletions(-)
> 
> diff --git a/src/ibtracert.c b/src/ibtracert.c index d32968b..326200b 100644
> --- a/src/ibtracert.c
> +++ b/src/ibtracert.c
> @@ -92,6 +92,7 @@ struct Switch {
>   int mccap;
>   int linearFDBtop;
>   int fdb_base;
> + int enhsp0;
>   int8_t fdb[64];
>   char switchinfo[64];
>  };
> @@ -114,6 +115,17 @@ struct Node {
>  Node *nodesdist[MAXHOPS];
>  uint64_t target_portguid;
> 
> +static int is_route_inactive_port0(Node * node, Port * port, Switch *
> +sw) {

I would prefer this function be named something like:

port_inactive_not_bsp0

The current name is confusing as to what the logic is.

> + int res = 0;
> + /* not active for HCA and for enhanced port0 Switches */
> + if (port->state != 4 &&

Please use #defines here even though the original code did not.

> + (node->type != IB_NODE_SWITCH ||
> +  (node->type == IB_NODE_SWITCH && sw->enhsp0)))
> + res = 1;
> + return res;
> +}
> +
>  static int get_node(Node * node, Port * port, ib_portid_t * portid)  {
>   void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc;
> @@ -164,6 +176,7 @@ static int switch_lookup(Switch * sw, ib_portid_t *
> portid, int lid)
> 
>   mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap);
>   mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw-
> >linearFDBtop);
> + mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0);
> 
>   if (lid >= sw->linearcap && lid > sw->linearFDBtop)
>   return -1;
> @@ -248,7 +261,7 @@ static int find_route(ib_portid_t * from, ib_portid_t *
> to, int dump)
>   Port *port, fromport, toport, nextport;
>   Switch sw;
>   int maxhops = MAXHOPS;
> - int portnum, outport;
> + int portnum, outport = 255, next_sw_outport = 255;
> 
>   memset(&fromnode,0,sizeof(Node));
>   memset(&tonode,0,sizeof(Node));
> @@ -274,20 +287,28 @@ static int find_route(ib_portid_t * from, ib_portid_t *
> to, int dump)
>   portnum = port->portnum;
> 
>   dump_endnode(dump, "From", node, port);
> + if (node->type == IB_NODE_SWITCH) {
> + next_sw_outport = switch_lookup(&sw, from, to->lid);
> + if (next_sw_outport < 0 || next_sw_outport > node->numports)
> {
> + /* needed to print the port in badtbl */
> + outport = next_sw_outport;
> + goto badtbl;
> + }
> + }
> 
>   while (maxhops--) {
> - if (port->state != 4)
> + /* checking if the port state isn't active.
> +  * The "sw" argument is only relevant when the port is a
> +  * switch port for HCAs this argument is ignored */

This comment should go above the function signature and be enhanced to clarify 
that it is checking for port state inactive except for BSP0 which has no state.


Ira


> + if (is_route_inactive_port0(node, port, &sw))
>   goto badport;
> 
>   if (sameport(port, &toport))
>   break;  /* found */
> 
> - outport = portnum;
>   if (node->type == IB_NODE_SWITCH) {
>   DEBUG("switch node");
> - if ((outport = switch_lookup(&sw, from, to->lid)) < 0 ||
> - outport > node->numports)
> - goto badtbl;
> + outport = next_sw_outport;
> 
>   if (extend_dpath(&from->drpath, outport) < 0)
>   goto badpath;
> @@ -307,6 +328,7 @@ static int find_route(ib_portid_t * from, ib_portid_t *
> to, int dump)
>  (node->type == IB_NODE_ROUTER)) {
>   int ca_src = 0;
> 
> + outport = portnum;
>   DEBUG("ca or router node");
>   if (!sameport(port, &fromport)) {
>   IBWARN
> @@ -335,8 +357,19 @@ static int find_route(ib_portid_t * from, ib_portid_t *
> to, int dump)
>   nextport.portnum =
>   from->drpath.p[from->drpath.cnt + 1];
>   }
> + /