Re: [PATCH] ib_ipoib: Scatter-Gather support in connected mode
Hi, Le mardi 27 janvier 2015 à 03:21 -0800, Yuval Shaia a écrit : > With this fix Scatter-Gather will be supported also in connected mode > Please explain the issue with current kernel and the advantage of the proposed fix in the commit message. Perhaps benchmarks against various HCAs would be welcome. > Signed-off-by: Yuval Shaia > --- > drivers/infiniband/ulp/ipoib/ipoib.h |8 +-- > drivers/infiniband/ulp/ipoib/ipoib_cm.c | 107 > +++-- > drivers/infiniband/ulp/ipoib/ipoib_ib.c |3 +- > drivers/infiniband/ulp/ipoib/ipoib_main.c |3 +- > 4 files changed, 91 insertions(+), 30 deletions(-) > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h > b/drivers/infiniband/ulp/ipoib/ipoib.h > index d7562be..fb42834 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib.h > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h > @@ -170,11 +170,6 @@ struct ipoib_tx_buf { > u64 mapping[MAX_SKB_FRAGS + 1]; > }; > > -struct ipoib_cm_tx_buf { > - struct sk_buff *skb; > - u64 mapping; > -}; > - > struct ib_cm_id; > > struct ipoib_cm_data { > @@ -233,7 +228,7 @@ struct ipoib_cm_tx { > struct net_device *dev; > struct ipoib_neigh *neigh; > struct ipoib_path *path; > - struct ipoib_cm_tx_buf *tx_ring; > + struct ipoib_tx_buf *tx_ring; > unsigned tx_head; > unsigned tx_tail; > unsigned longflags; > @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int > flush); > > void ipoib_mcast_dev_down(struct net_device *dev); > void ipoib_mcast_dev_flush(struct net_device *dev); > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); > > #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG > struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev); > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c > b/drivers/infiniband/ulp/ipoib/ipoib_cm.c > index 933efce..056e4d2 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c > @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv > *priv, int frags, > ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, > DMA_FROM_DEVICE); > } > > +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv, > + struct ipoib_tx_buf *tx_req) > +{ > + struct sk_buff *skb; > + int i, offs; > + > + skb = tx_req->skb; > + if (skb_shinfo(skb)->nr_frags) { > + offs = 0; > + if (skb_headlen(skb)) { > + ib_dma_unmap_single(priv->ca, tx_req->mapping[0], > + skb_headlen(skb), DMA_TO_DEVICE); > + offs = 1; > + } > + for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) { > + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; > + > + ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs], > + skb_frag_size(frag), DMA_TO_DEVICE); > + } > + } else > + ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len, > + DMA_TO_DEVICE); > +} > + > static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) > { > struct ipoib_dev_priv *priv = netdev_priv(dev); > @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv, > return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); > } > > +static inline int post_send_sg(struct ipoib_dev_priv *priv, > +struct ipoib_cm_tx *tx, > +unsigned int wr_id, > +struct sk_buff *skb, > +u64 mapping[MAX_SKB_FRAGS + 1]) > +{ > + struct ib_send_wr *bad_wr; > + int i, off; > + skb_frag_t *frags = skb_shinfo(skb)->frags; > + int nr_frags = skb_shinfo(skb)->nr_frags; > + > + if (skb_headlen(skb)) { > + priv->tx_sge[0].addr = mapping[0]; > + priv->tx_sge[0].length = skb_headlen(skb); > + off = 1; > + } else > + off = 0; > + > + for (i = 0; i < nr_frags; ++i) { > + priv->tx_sge[i + off].addr = mapping[i + off]; > + priv->tx_sge[i + off].length = frags[i].size; > + } > + priv->tx_wr.num_sge = nr_frags + off; > + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; > + > + return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); > +} > + > void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct > ipoib_cm_tx *tx) > { > struct ipoib_dev_priv *priv = netdev_priv(dev); > - struct ipoib_cm_tx_buf *tx_req; > - u64 addr; > + struct ipoib_tx_buf *tx_req; > + u64 addr = 0; > int rc; > > if (unlikely(skb->len > tx->mtu)) { > @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct > sk_bu
Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask
Hi, Le mardi 27 janvier 2015 à 08:50 +0200, Haggai Eran a écrit : > On 26/01/2015 13:17, Yann Droneaud wrote: > > ... > > Le dimanche 25 janvier 2015 à 17:23 +0200, Haggai Eran a écrit : > >> On 22/01/2015 15:28, Yann Droneaud wrote: > >>> This patch ensures the extended QUERY_DEVICE uverbs request's > >>> comp_mask has only known values. If userspace returns unknown > >>> features, -EINVAL will be returned, allowing to probe/discover > >>> which features are currently supported by the kernel. > >> > >> This probing process will be much more cumbersome than it needs to be > >> because userspace will have to call QUERY_DEVICE repetitively with > >> different comp_mask fields until it finds out the exact set of supported > >> bits. > >> > > > > O(log2(N)) > > I don't think user space developers will be happy having to do trial and > error to determine what features the kernel driver supports. It might be > even more then O(log2(N)). If my understanding of comp_mask bits usage is > correct it would O(N). But it's not the time complexity I'm worried about, > it's the fact that it requires user-space developers to go through hoops in > order to get information that can be much more easily exported. > But there's currently *NO* such mean that could give a hint to userspace developer whether one bit of request's comp_mask is currently effective in extended QUERY_DEVICE uverbs. While we have such mean in CREATE_FLOW and DESTROY_FLOW: unsupported bits trigger -EINVAL. In QUERY_DEVICE, userspace developer is allowed to set request's comp_mask to -1: it won't hurt it on current kernel, so why not using this value or any other random value ? The program will run: it's "Works For Me". But the same program (either binary or source code) might fail on newer kernel where some bits in comp_mask gain a meaning not supported by the program. > > Or you had to add a different interface, dedicated to retrieve the exact > > supported feature mask. > > > >>> Moreover, it also ensure the requested features set in comp_mask > >>> are sequentially set, not skipping intermediate features, eg. the > >>> "highest" requested feature also request all the "lower" ones. > >>> This way each new feature will have to be stacked on top of the > >>> existing ones: this is especially important for the request and > >>> response data structures where fields are added after the > >>> current ones when expanded to support a new feature. > >> > >> I think it is perfectly acceptable that not all drivers will implement > >> all available features, and so you shouldn't enforce this requirement. > > > > With regard to QUERY_DEVICE: the data structure layout depends on the > > comp_mask value. So either you propose a way to express multipart data > > structure (see CMSG or NETLINK), or we have to ensure the data structure > > is ever-growing, with each new chunck stacked over the existing ones: > > that's the purpose of : > > > > if (cmd.comp_mask & (cmd.comp_mask + 1)) > > return -EINVAL; > > > >> Also, it makes the comp_mask nothing more than a wasteful version number > >> between 0 and 63. > > > > That's what I've already explained earlier in "Re: [PATCH v3 06/17] > > IB/core: Add support for extended query device caps": > > > > http://mid.gmane.org/1421844612.13543.40.ca...@opteya.com > > Yes, you wrote there: > > Regarding comp_mask (not for this particular verb): > > > > It's not clear whether request's comp_mask describe the request or the > > response, as such I'm puzzled. > > > > How would the kernel and the userspace be able to parse the request and > > the response if they ignore unknown bits ? > > > > How would they be able to skip the unrecognized chunk of the request and > > response buffer ? > > > > How one bit in a comp_mask is related to a chunk in the request or > > response ? > > > > It's likely the kernel or userspace would have to skip the remaining > > comp_mask's bits after encountering an unknown bit as the size of the > > corresponding chunk in request or response would be unknown, making > > impossible to locate the corresponding chunk for the next bit set in > > comp_mask. Having said that, comp_mask is just a complicated way of > > expressing a version, which is turn describe a size (ever growing). > > It is my understanding that each comp_mask bit marks a set of fields in > the command or in the response struct as valid, so the struct format > remains the same and the kernel and userspace don't need to make > difficult calculation as to where each field is, but you can still pass > a high bit set in comp_mask with one of the lower bits cleared. > How can the struct format remain the same, if some fields are added to implement newer feature ? > I couldn't find this explicit detail in the mailing list, but I did found > a presentation that was presented in OFA International Developer > Workshop 2013 [1], that gave an example of of an verb where each > comp_mask bit marked a dif
[PATCH infiniband-diags] Align infiniband-diags MLNX device IDs with OpenSM
Same device IDs as in opensm/osm_subnet.c:is_mlnx_ext_port_info_supported Switch-IB is device ID 0xcb20 rather than 0xcb84 Signed-off-by: Hal Rosenstock --- diff --git a/libibnetdisc/src/ibnetdisc.c b/libibnetdisc/src/ibnetdisc.c index f3c6000..e346905 100644 --- a/libibnetdisc/src/ibnetdisc.c +++ b/libibnetdisc/src/ibnetdisc.c @@ -203,7 +203,7 @@ static int is_mlnx_ext_port_info_supported(ibnd_port_t * port) { uint16_t devid = (uint16_t) mad_get_field(port->node->info, 0, IB_NODE_DEVID_F); - if (devid == 0xc738 || devid == 0xcb84) + if ((devid >= 0xc738 && devid <= 0xc73b) || devid == 0xcb20) return 1; if (devid >= 0x1003 && devid <= 0x1016) return 1; diff --git a/src/ibdiag_common.c b/src/ibdiag_common.c index 666edbc..e09623d 100644 --- a/src/ibdiag_common.c +++ b/src/ibdiag_common.c @@ -530,7 +530,7 @@ int is_port_info_extended_supported(ib_portid_t * dest, int port, int is_mlnx_ext_port_info_supported(uint32_t devid) { if (ibd_ibnetdisc_flags & IBND_CONFIG_MLX_EPI) { - if (devid == 0xc738 || devid == 0xcb84) + if ((devid >= 0xc738 && devid <= 0xc73b) || devid == 0xcb20) return 1; if (devid >= 0x1003 && devid <= 0x1016) return 1; diff --git a/src/vendstat.c b/src/vendstat.c index 7fc4a11..11ee73e 100644 --- a/src/vendstat.c +++ b/src/vendstat.c @@ -144,8 +144,8 @@ typedef struct { static uint16_t ext_fw_info_device[][2] = { {0x0245, 0x0245}, /* Switch-X */ - {0xc738, 0xc738}, /* Switch-X */ - {0xcb84, 0xcb84}, /* Switch-IB */ + {0xc738, 0xc73b}, /* Switch-X */ + {0xcb20, 0xcb20}, /* Switch-IB */ {0x01b3, 0x01b3}, /* IS-4 */ {0x1003, 0x1016}, /* Connect-X */ {0x, 0x}}; -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 state for BasePort0 switches
> >> > >> +static int is_route_inactive_port0(Node * node, Port * port, Switch > >> +* > >> +sw) { > > > > I would prefer this function be named something like: > > > > port_inactive_not_bsp0 > > > > The current name is confusing as to what the logic is. > > How about is_port_inactive ? That is fine. > > >> + int res = 0; > >> + /* not active for HCA and for enhanced port0 Switches */ > >> + if (port->state != 4 && > > > > Please use #defines here even though the original code did not. > > This is an issue throughout infiniband-diags. I think it should be addressed > by > separate patch(es) for this. I would rather see us do the right thing going forward. This saves work trying to find all these majic numbers later. Ira -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCHv2 infiniband-diags] ibtracert.c: Remove checking the port 0 state for base switch port 0
From: Dan Ben Yosef The port 0 state check needed to be only for HCA/Routers and EnhancedPort0 switches. For BasePort0 switches, the PortState field, in PortInfo arrtibute, for port0 is undefined. Signed-off-by: Dan Ben Yosef --- Changes since v1: Changed is_route_inactive_port0 to is_port_inactive Moved description from above use of routine to above routine (is_port_inactive) diff --git a/src/ibtracert.c b/src/ibtracert.c index d32968b..9414618 100644 --- a/src/ibtracert.c +++ b/src/ibtracert.c @@ -92,6 +92,7 @@ struct Switch { int mccap; int linearFDBtop; int fdb_base; + int enhsp0; int8_t fdb[64]; char switchinfo[64]; }; @@ -114,6 +115,24 @@ struct Node { Node *nodesdist[MAXHOPS]; uint64_t target_portguid; +/* + * is_port_inactive + * Checks whether or not the port state is other than active. + * The "sw" argument is only relevant when the port is on a + * switch; for HCAs and routers, this argument is ignored. + * Returns 1 when port is not active and 0 when active. + * Base switch port 0 is considered always active. + */ +static int is_port_inactive(Node * node, Port * port, Switch * sw) +{ + int res = 0; + if (port->state != 4 && + (node->type != IB_NODE_SWITCH || +(node->type == IB_NODE_SWITCH && sw->enhsp0))) + res = 1; + return res; +} + static int get_node(Node * node, Port * port, ib_portid_t * portid) { void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc; @@ -164,6 +183,7 @@ static int switch_lookup(Switch * sw, ib_portid_t * portid, int lid) mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap); mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw->linearFDBtop); + mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0); if (lid >= sw->linearcap && lid > sw->linearFDBtop) return -1; @@ -248,7 +268,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * to, int dump) Port *port, fromport, toport, nextport; Switch sw; int maxhops = MAXHOPS; - int portnum, outport; + int portnum, outport = 255, next_sw_outport = 255; memset(&fromnode,0,sizeof(Node)); memset(&tonode,0,sizeof(Node)); @@ -274,20 +294,25 @@ static int find_route(ib_portid_t * from, ib_portid_t * to, int dump) portnum = port->portnum; dump_endnode(dump, "From", node, port); + if (node->type == IB_NODE_SWITCH) { + next_sw_outport = switch_lookup(&sw, from, to->lid); + if (next_sw_outport < 0 || next_sw_outport > node->numports) { + /* needed to print the port in badtbl */ + outport = next_sw_outport; + goto badtbl; + } + } while (maxhops--) { - if (port->state != 4) + if (is_port_inactive(node, port, &sw)) goto badport; if (sameport(port, &toport)) break; /* found */ - outport = portnum; if (node->type == IB_NODE_SWITCH) { DEBUG("switch node"); - if ((outport = switch_lookup(&sw, from, to->lid)) < 0 || - outport > node->numports) - goto badtbl; + outport = next_sw_outport; if (extend_dpath(&from->drpath, outport) < 0) goto badpath; @@ -307,6 +332,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * to, int dump) (node->type == IB_NODE_ROUTER)) { int ca_src = 0; + outport = portnum; DEBUG("ca or router node"); if (!sameport(port, &fromport)) { IBWARN @@ -335,8 +361,19 @@ static int find_route(ib_portid_t * from, ib_portid_t * to, int dump) nextport.portnum = from->drpath.p[from->drpath.cnt + 1]; } + /* only if the next node is a switch, get switch info */ + if (nextnode.type == IB_NODE_SWITCH) { + next_sw_outport = switch_lookup(&sw, from, to->lid); + if (next_sw_outport < 0 || + next_sw_outport > nextnode.numports) { + /* needed to print the port in badtbl */ + outport = next_sw_outport; + goto badtbl; + } + } + port = &nextport; - if (port->state != 4) + if (is_port_inactive(&nextnode, port, &sw)) goto badoutport; node = &nextnode; portnum = port->portnum; -- To unsubscribe
Re: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 state for BasePort0 switches
On 1/28/2015 1:27 PM, Weiny, Ira wrote: > Sorry for the delay. > >> -Original Message- >> From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma- >> ow...@vger.kernel.org] On Behalf Of Hal Rosenstock >> Sent: Tuesday, January 20, 2015 5:00 AM >> To: Weiny, Ira >> Cc: Dan Ben-Yosef; linux-rdma (linux-rdma@vger.kernel.org) >> Subject: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 >> state >> for BasePort0 switches >> >> From: Dan Ben Yosef >> >> The port 0 state check is needed only for HCA/Routers and >> EnhancedPort0 switches. >> >> For BasePort0 switches, the PortState field in the PortInfo attribute for >> port0 is >> undefined (not used). >> >> Signed-off-by: Dan Ben Yosef >> --- >> src/ibtracert.c | 47 --- >> 1 files changed, 40 insertions(+), 7 deletions(-) >> >> diff --git a/src/ibtracert.c b/src/ibtracert.c index d32968b..326200b 100644 >> --- a/src/ibtracert.c >> +++ b/src/ibtracert.c >> @@ -92,6 +92,7 @@ struct Switch { >> int mccap; >> int linearFDBtop; >> int fdb_base; >> +int enhsp0; >> int8_t fdb[64]; >> char switchinfo[64]; >> }; >> @@ -114,6 +115,17 @@ struct Node { >> Node *nodesdist[MAXHOPS]; >> uint64_t target_portguid; >> >> +static int is_route_inactive_port0(Node * node, Port * port, Switch * >> +sw) { > > I would prefer this function be named something like: > > port_inactive_not_bsp0 > > The current name is confusing as to what the logic is. How about is_port_inactive ? >> +int res = 0; >> +/* not active for HCA and for enhanced port0 Switches */ >> +if (port->state != 4 && > > Please use #defines here even though the original code did not. This is an issue throughout infiniband-diags. I think it should be addressed by separate patch(es) for this. >> +(node->type != IB_NODE_SWITCH || >> + (node->type == IB_NODE_SWITCH && sw->enhsp0))) >> +res = 1; >> +return res; >> +} >> + >> static int get_node(Node * node, Port * port, ib_portid_t * portid) { >> void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc; >> @@ -164,6 +176,7 @@ static int switch_lookup(Switch * sw, ib_portid_t * >> portid, int lid) >> >> mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap); >> mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw- >>> linearFDBtop); >> +mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0); >> >> if (lid >= sw->linearcap && lid > sw->linearFDBtop) >> return -1; >> @@ -248,7 +261,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * >> to, int dump) >> Port *port, fromport, toport, nextport; >> Switch sw; >> int maxhops = MAXHOPS; >> -int portnum, outport; >> +int portnum, outport = 255, next_sw_outport = 255; >> >> memset(&fromnode,0,sizeof(Node)); >> memset(&tonode,0,sizeof(Node)); >> @@ -274,20 +287,28 @@ static int find_route(ib_portid_t * from, ib_portid_t * >> to, int dump) >> portnum = port->portnum; >> >> dump_endnode(dump, "From", node, port); >> +if (node->type == IB_NODE_SWITCH) { >> +next_sw_outport = switch_lookup(&sw, from, to->lid); >> +if (next_sw_outport < 0 || next_sw_outport > node->numports) >> { >> +/* needed to print the port in badtbl */ >> +outport = next_sw_outport; >> +goto badtbl; >> +} >> +} >> >> while (maxhops--) { >> -if (port->state != 4) >> +/* checking if the port state isn't active. >> + * The "sw" argument is only relevant when the port is a >> + * switch port for HCAs this argument is ignored */ > > This comment should go above the function signature and be enhanced to > clarify that it is checking for port state inactive except for BSP0 which has > no state. Will do that in next version of the patch. -- Hal > > Ira > > >> +if (is_route_inactive_port0(node, port, &sw)) >> goto badport; >> >> if (sameport(port, &toport)) >> break; /* found */ >> >> -outport = portnum; >> if (node->type == IB_NODE_SWITCH) { >> DEBUG("switch node"); >> -if ((outport = switch_lookup(&sw, from, to->lid)) < 0 || >> -outport > node->numports) >> -goto badtbl; >> +outport = next_sw_outport; >> >> if (extend_dpath(&from->drpath, outport) < 0) >> goto badpath; >> @@ -307,6 +328,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * >> to, int dump) >> (node->type == IB_NODE_ROUTER)) { >> int ca_src = 0; >> >> +outport = portnum; >> DEBUG("ca or router node"); >> if (!sameport(por
Re: [PATCH 3/4] IB/uverbs: ex_query_device: check request's comp_mask
On 28/01/2015 15:19, Yann Droneaud wrote: > Hi, > > Le mardi 27 janvier 2015 à 08:50 +0200, Haggai Eran a écrit : >> On 26/01/2015 13:17, Yann Droneaud wrote: >>> ... >>> Le dimanche 25 janvier 2015 à 17:23 +0200, Haggai Eran a écrit : On 22/01/2015 15:28, Yann Droneaud wrote: > This patch ensures the extended QUERY_DEVICE uverbs request's > comp_mask has only known values. If userspace returns unknown > features, -EINVAL will be returned, allowing to probe/discover > which features are currently supported by the kernel. This probing process will be much more cumbersome than it needs to be because userspace will have to call QUERY_DEVICE repetitively with different comp_mask fields until it finds out the exact set of supported bits. >>> >>> O(log2(N)) >> >> I don't think user space developers will be happy having to do trial and >> error to determine what features the kernel driver supports. It might be >> even more then O(log2(N)). If my understanding of comp_mask bits usage is >> correct it would O(N). But it's not the time complexity I'm worried about, >> it's the fact that it requires user-space developers to go through hoops in >> order to get information that can be much more easily exported. >> > > But there's currently *NO* such mean that could give a hint to userspace > developer whether one bit of request's comp_mask is currently effective > in extended QUERY_DEVICE uverbs. While we have such mean in CREATE_FLOW > and DESTROY_FLOW: unsupported bits trigger -EINVAL. > > In QUERY_DEVICE, userspace developer is allowed to set request's > comp_mask to -1: it won't hurt it on current kernel, so why not using > this value or any other random value ? The program will run: it's "Works > For Me". > > But the same program (either binary or source code) might fail on > newer kernel where some bits in comp_mask gain a meaning not supported > by the program. Why don't we make the command comp_mask in QUERY_DEVICE zero in both versions. The behavior of the command with comp_mask = 0 will be to return the maximum amount of data that fits in the response buffer. The kernel will return -EINVAL if the input command comp_mask is not zero in the current version. If in the future we want to alter the behavior or add more input fields to QUERY_DEVICE, we would use new bits. >>> Or you had to add a different interface, dedicated to retrieve the exact >>> supported feature mask. >>> > Moreover, it also ensure the requested features set in comp_mask > are sequentially set, not skipping intermediate features, eg. the > "highest" requested feature also request all the "lower" ones. > This way each new feature will have to be stacked on top of the > existing ones: this is especially important for the request and > response data structures where fields are added after the > current ones when expanded to support a new feature. I think it is perfectly acceptable that not all drivers will implement all available features, and so you shouldn't enforce this requirement. >>> >>> With regard to QUERY_DEVICE: the data structure layout depends on the >>> comp_mask value. So either you propose a way to express multipart data >>> structure (see CMSG or NETLINK), or we have to ensure the data structure >>> is ever-growing, with each new chunck stacked over the existing ones: >>> that's the purpose of : >>> >>> if (cmd.comp_mask & (cmd.comp_mask + 1)) >>> return -EINVAL; >>> Also, it makes the comp_mask nothing more than a wasteful version number between 0 and 63. >>> >>> That's what I've already explained earlier in "Re: [PATCH v3 06/17] >>> IB/core: Add support for extended query device caps": >>> >>> http://mid.gmane.org/1421844612.13543.40.ca...@opteya.com >> >> Yes, you wrote there: >>> Regarding comp_mask (not for this particular verb): >>> >>> It's not clear whether request's comp_mask describe the request or the >>> response, as such I'm puzzled. >>> >>> How would the kernel and the userspace be able to parse the request and >>> the response if they ignore unknown bits ? >>> >>> How would they be able to skip the unrecognized chunk of the request and >>> response buffer ? >>> >>> How one bit in a comp_mask is related to a chunk in the request or >>> response ? >>> >>> It's likely the kernel or userspace would have to skip the remaining >>> comp_mask's bits after encountering an unknown bit as the size of the >>> corresponding chunk in request or response would be unknown, making >>> impossible to locate the corresponding chunk for the next bit set in >>> comp_mask. Having said that, comp_mask is just a complicated way of >>> expressing a version, which is turn describe a size (ever growing). >> >> It is my understanding that each comp_mask bit marks a set of fields in >> the command or in the response struct as valid, so the struct format >> remains the same and the kernel and userspace don't need
RE: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 state for BasePort0 switches
Sorry for the delay. > -Original Message- > From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma- > ow...@vger.kernel.org] On Behalf Of Hal Rosenstock > Sent: Tuesday, January 20, 2015 5:00 AM > To: Weiny, Ira > Cc: Dan Ben-Yosef; linux-rdma (linux-rdma@vger.kernel.org) > Subject: [PATCH infiniband-diags] ibtracert.c: Remove checking the port 0 > state > for BasePort0 switches > > From: Dan Ben Yosef > > The port 0 state check is needed only for HCA/Routers and > EnhancedPort0 switches. > > For BasePort0 switches, the PortState field in the PortInfo attribute for > port0 is > undefined (not used). > > Signed-off-by: Dan Ben Yosef > --- > src/ibtracert.c | 47 --- > 1 files changed, 40 insertions(+), 7 deletions(-) > > diff --git a/src/ibtracert.c b/src/ibtracert.c index d32968b..326200b 100644 > --- a/src/ibtracert.c > +++ b/src/ibtracert.c > @@ -92,6 +92,7 @@ struct Switch { > int mccap; > int linearFDBtop; > int fdb_base; > + int enhsp0; > int8_t fdb[64]; > char switchinfo[64]; > }; > @@ -114,6 +115,17 @@ struct Node { > Node *nodesdist[MAXHOPS]; > uint64_t target_portguid; > > +static int is_route_inactive_port0(Node * node, Port * port, Switch * > +sw) { I would prefer this function be named something like: port_inactive_not_bsp0 The current name is confusing as to what the logic is. > + int res = 0; > + /* not active for HCA and for enhanced port0 Switches */ > + if (port->state != 4 && Please use #defines here even though the original code did not. > + (node->type != IB_NODE_SWITCH || > + (node->type == IB_NODE_SWITCH && sw->enhsp0))) > + res = 1; > + return res; > +} > + > static int get_node(Node * node, Port * port, ib_portid_t * portid) { > void *pi = port->portinfo, *ni = node->nodeinfo, *nd = node->nodedesc; > @@ -164,6 +176,7 @@ static int switch_lookup(Switch * sw, ib_portid_t * > portid, int lid) > > mad_decode_field(si, IB_SW_LINEAR_FDB_CAP_F, &sw->linearcap); > mad_decode_field(si, IB_SW_LINEAR_FDB_TOP_F, &sw- > >linearFDBtop); > + mad_decode_field(si, IB_SW_ENHANCED_PORT0_F, &sw->enhsp0); > > if (lid >= sw->linearcap && lid > sw->linearFDBtop) > return -1; > @@ -248,7 +261,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * > to, int dump) > Port *port, fromport, toport, nextport; > Switch sw; > int maxhops = MAXHOPS; > - int portnum, outport; > + int portnum, outport = 255, next_sw_outport = 255; > > memset(&fromnode,0,sizeof(Node)); > memset(&tonode,0,sizeof(Node)); > @@ -274,20 +287,28 @@ static int find_route(ib_portid_t * from, ib_portid_t * > to, int dump) > portnum = port->portnum; > > dump_endnode(dump, "From", node, port); > + if (node->type == IB_NODE_SWITCH) { > + next_sw_outport = switch_lookup(&sw, from, to->lid); > + if (next_sw_outport < 0 || next_sw_outport > node->numports) > { > + /* needed to print the port in badtbl */ > + outport = next_sw_outport; > + goto badtbl; > + } > + } > > while (maxhops--) { > - if (port->state != 4) > + /* checking if the port state isn't active. > + * The "sw" argument is only relevant when the port is a > + * switch port for HCAs this argument is ignored */ This comment should go above the function signature and be enhanced to clarify that it is checking for port state inactive except for BSP0 which has no state. Ira > + if (is_route_inactive_port0(node, port, &sw)) > goto badport; > > if (sameport(port, &toport)) > break; /* found */ > > - outport = portnum; > if (node->type == IB_NODE_SWITCH) { > DEBUG("switch node"); > - if ((outport = switch_lookup(&sw, from, to->lid)) < 0 || > - outport > node->numports) > - goto badtbl; > + outport = next_sw_outport; > > if (extend_dpath(&from->drpath, outport) < 0) > goto badpath; > @@ -307,6 +328,7 @@ static int find_route(ib_portid_t * from, ib_portid_t * > to, int dump) > (node->type == IB_NODE_ROUTER)) { > int ca_src = 0; > > + outport = portnum; > DEBUG("ca or router node"); > if (!sameport(port, &fromport)) { > IBWARN > @@ -335,8 +357,19 @@ static int find_route(ib_portid_t * from, ib_portid_t * > to, int dump) > nextport.portnum = > from->drpath.p[from->drpath.cnt + 1]; > } > + /