On Tue, 13 Apr 2010 13:46:58 +0300
Sasha Khapyorsky <sas...@voltaire.com> wrote:

> On 12:49 Thu 18 Feb     , Ira Weiny wrote:
> > 
> > From: Ira Weiny <wei...@llnl.gov>
> > Date: Fri, 22 Jan 2010 17:33:30 -0800
> > Subject: [PATCH] libibnetdisc: Convert to a multi-smp algorithm
> > 
> > v3: change DEFAULT_MAX_SMP_ON_WIRE to 2
> > 
> >     Allow for multiple SMP's to be on the wire at a single time.  This
> >     algorithm splits the processing of SMP's to a small smp engine which
> >     may be useful to split out in the future.
> > 
> > Signed-off-by: Ira Weiny <wei...@llnl.gov>
> 
> Applied. Thanks.
> 
> However see some comments and questions below.
> 
> > +static int recv_port_info(smp_engine_t *engine, ibnd_smp_t * smp,
> > +                     uint8_t *mad, void *cb_data)
> > +{
> > +   ibnd_fabric_t *fabric = ((ibnd_scan_t *)engine->user_data)->fabric;
> > +   ibnd_node_t *node = (ibnd_node_t *)cb_data;
> > +   ibnd_port_t *port;
> > +   uint8_t *port_info = mad + IB_SMP_DATA_OFFS;
> > +   uint8_t port_num, local_port;
> > +
> > +   port_num = mad_get_field(mad, 0, IB_MAD_ATTRMOD_F);
> > +   local_port = mad_get_field(port_info, 0, IB_PORT_LOCAL_PORT_F);
> > +
> > +   /* this may have been created before */
> > +   port = node->ports[port_num];
> > +   if (!port) {
> > +           port = node->ports[port_num] = calloc(1, sizeof(*port));
> > +           if (!port) {
> > +                   IBND_ERROR("Failed to allocate port\n");
> > +                   return -1;
> > +           }
> > +   }
> > +
> > +   memcpy(port->info, port_info, sizeof(port->info));
> > +   port->node = node;
> > +   port->portnum = port_num;
> > +   port->ext_portnum = 0;
> > +   port->base_lid = (uint16_t) mad_get_field(port->info, 0, IB_PORT_LID_F);
> > +   port->lmc = (uint8_t) mad_get_field(port->info, 0, IB_PORT_LMC_F);
> > +
> > +   if (port_num == 0) {
> > +           node->smalid = port->base_lid;
> > +           node->smalmc = port->lmc;
> > +   } else if (node->type == IB_NODE_SWITCH) {
> > +           port->base_lid = node->smalid;
> > +           port->lmc = node->smalmc;
> > +   }
> > +
> > +   add_to_portguid_hash(port, fabric->portstbl);
> > +
> > +   debug_port(&smp->path, port);
> >  
> > -   mad_dump_node_type(type, 64, &node->type, sizeof(int));
> > -   printf("%s -> %s %s {%016" PRIx64 "} portnum %d base lid %d-%d\"%s\"\n",
> > -          portid2str(path), prompt, type, node->guid,
> > -          node->type == IB_NODE_SWITCH ? 0 : port->portnum,
> > -          port->base_lid,
> > -          port->base_lid + (1 << port->lmc) - 1, node->nodedesc);
> > +   if (port_num &&
> > +       (mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F)
> > +       == IB_PORT_PHYS_STATE_LINKUP)
> > +           &&
> > +       (node->type == IB_NODE_SWITCH || node == fabric->from_node)) {
> 
> What will happen when ibnetdiscover runs from host connected by two
> ports to different subnets? Wouldn't it run over both fabrics (following
> PortInfo querying loop in recv_node_info() below)?

Looking at the code I think you are right...  In practice, this does not 
happen...

I am still trying to figure out exactly why.

> 
> > +
> > +           ib_portid_t path = smp->path;
> > +           if (extend_dpath(engine, &path, port_num) != -1)
> > +                   query_node_info(engine, &path, node);
> > +   }
> > +
> > +   return 0;
> > +}
> 
> [snip]
> 
> > +static int recv_node_info(smp_engine_t *engine, ibnd_smp_t * smp,
> > +                     uint8_t *mad, void *cb_data)
> > +{
> > +   ibnd_fabric_t *fabric = ((ibnd_scan_t *)engine->user_data)->fabric;
> > +   int i = 0;
> > +   uint8_t *node_info = mad + IB_SMP_DATA_OFFS;
> > +   ibnd_node_t * rem_node = (ibnd_node_t *)cb_data;
> >     ibnd_node_t *node;
> > +   int node_is_new = 0;
> > +   uint64_t node_guid = mad_get_field64(node_info, 0, IB_NODE_GUID_F);
> > +   uint64_t port_guid = mad_get_field64(node_info, 0, IB_NODE_PORT_GUID_F);
> > +   int port_num = mad_get_field(node_info, 0, IB_NODE_LOCAL_PORT_F);
> > +   ibnd_port_t *port = NULL;
> >  
> > -   for (node = fabric->nodestbl[hash]; node; node = node->htnext)
> > -           if (node->guid == new->guid)
> > -                   return node;
> > +   node = ibnd_find_node_guid(fabric, node_guid);
> > +   if (!node) {
> > +           node = create_node(engine, &smp->path, node_info);
> > +           if (!node)
> > +                   return -1;
> > +           node_is_new = 1;
> > +   }
> > +   IBND_DEBUG("Found %s node GUID %lx (%s)\n",
> > +              (node_is_new) ? "new": "old", node->guid,
> > +              portid2str(&smp->path));
> >  
> > -   return NULL;
> > +   port = node->ports[port_num];
> > +   if (!port) {
> > +           /* If we have not see this port before create a shell for it */
> > +           port = node->ports[port_num] = calloc(1, sizeof(*port));
> > +           port->node = node;
> > +           port->portnum = port_num;
> > +   }
> > +   port->guid = port_guid;
> > +
> > +   if (rem_node == NULL) /* this is the start node */
> > +           fabric->from_node = node;
> > +   else {
> > +           /* link ports... */
> > +           int rem_port_num = get_last_port(&smp->path);
> > +
> > +           if (!rem_node->ports[rem_port_num]) {
> > +                   IBND_ERROR("Internal Error; "
> > +                              "Node(%p) %lx Port %d no port 
> > created!?!?!?\n\n",
> > +                              rem_node, rem_node->guid, rem_port_num);
> > +                   return (-1);
> > +           }
> > +
> > +           link_ports(node, port, rem_node, rem_node->ports[rem_port_num]);
> > +   }
> > +
> > +   if (!node_is_new)
> > +           return 0;
> > +
> > +   query_node_desc(engine, &smp->path, node);
> > +
> > +   if (node->type == IB_NODE_SWITCH)
> > +           query_switch_info(engine, &smp->path, node);
> > +
> > +   /* process all the ports on this node */
> > +   for (i = (node->type == IB_NODE_SWITCH) ? 0 : 1;
> > +           i <= node->numports; i++) {
> > +                   query_port_info(engine, &smp->path, node, i);
> > +   }
> 
> This one.
> 
> > +
> > +   return 0;
> > +}
> 
> [snip]
> 
> > diff --git a/infiniband-diags/libibnetdisc/src/internal.h 
> > b/infiniband-diags/libibnetdisc/src/internal.h
> > index 348bd0f..61b644d 100644
> > --- a/infiniband-diags/libibnetdisc/src/internal.h
> > +++ b/infiniband-diags/libibnetdisc/src/internal.h
> > @@ -39,6 +39,7 @@
> >  #define _INTERNAL_H_
> >  
> >  #include <infiniband/ibnetdisc.h>
> > +#include <complib/cl_qmap.h>
> 
> I'm not really happy with complib dependency introduction - we had an
> issues with it in the past.

I don't recall.  infiniband-diags is already dependant on opensm-libs.  We 
have, at least, the node name map dependency from complib already, so I did not 
think it was that big of a deal.

> 
> Also I think that we can implement simpler transaction tracker (likely
> even more efficient) by storing sent MADs in cyclic array and encoding
> its index as part of TRID.

I just thought code reuse was the way to go here.

> 
> >  
> >  #define    IBND_DEBUG(fmt, ...) \
> >     if (ibdebug) { \
> > @@ -52,16 +53,44 @@
> >  
> >  #define MAXHOPS         63
> >  
> > -typedef struct ibnd_node_scan {
> > -   ibnd_node_t *node;
> > -   struct ibnd_node_scan *dnext;   /* nodesdist next */
> > -} ibnd_node_scan_t;
> > +#define DEFAULT_MAX_SMP_ON_WIRE 2
> >  
> >  typedef struct ibnd_scan {
> > -   ibnd_node_scan_t *nodesdist[MAXHOPS + 1];
> >     ib_portid_t selfportid;
> > +   ibnd_fabric_t *fabric;
> >  } ibnd_scan_t;
> >  
> > +
> > +typedef struct ibnd_smp ibnd_smp_t;
> > +typedef struct smp_engine smp_engine_t;
> > +typedef int (*smp_comp_cb_t)(smp_engine_t *engine, ibnd_smp_t * smp,
> > +                        uint8_t *mad_resp, void *cb_data);
> > +struct ibnd_smp {
> > +   cl_map_item_t on_wire;
> > +   struct ibnd_smp * qnext;
> > +   smp_comp_cb_t cb;
> > +   void * cb_data;
> > +   ib_portid_t path;
> > +   ib_rpc_t rpc;
> > +};
> > +struct smp_engine {
> > +   struct ibmad_port *ibmad_port;
> > +   ibnd_smp_t *smp_queue_head;
> > +   ibnd_smp_t *smp_queue_tail;
> > +   void * user_data;
> > +   cl_qmap_t smps_on_wire;
> > +   int num_smps_outstanding;
> > +   int max_smps_on_wire;
> > +};
> > +
> > +void smp_engine_init(smp_engine_t * engine, struct ibmad_port *ibmad_port,
> > +                void * user_data, int max_smps_on_wire);
> > +int issue_smp(smp_engine_t *engine, ib_portid_t * portid,
> > +         unsigned attrid, unsigned mod,
> > +         smp_comp_cb_t cb, void * cb_data);
> > +int process_mads(smp_engine_t *engine);
> > +void smp_engine_destroy(smp_engine_t *engine);
> > +
> >  void add_to_nodeguid_hash(ibnd_node_t * node, ibnd_node_t * hash[]);
> >  
> >  void add_to_portguid_hash(ibnd_port_t * port, ibnd_port_t * hash[]);
> > diff --git a/infiniband-diags/libibnetdisc/src/query_smp.c 
> > b/infiniband-diags/libibnetdisc/src/query_smp.c
> > new file mode 100644
> > index 0000000..5571314
> > --- /dev/null
> > +++ b/infiniband-diags/libibnetdisc/src/query_smp.c
> > @@ -0,0 +1,249 @@
> > +/*
> > + * Copyright (c) 2010 Lawrence Livermore National Laboratory
> > + *
> > + * This software is available to you under a choice of one of two
> > + * licenses.  You may choose to be licensed under the terms of the GNU
> > + * General Public License (GPL) Version 2, available from the file
> > + * COPYING in the main directory of this source tree, or the
> > + * OpenIB.org BSD license below:
> > + *
> > + *     Redistribution and use in source and binary forms, with or
> > + *     without modification, are permitted provided that the following
> > + *     conditions are met:
> > + *
> > + *      - Redistributions of source code must retain the above
> > + *        copyright notice, this list of conditions and the following
> > + *        disclaimer.
> > + *
> > + *      - Redistributions in binary form must reproduce the above
> > + *        copyright notice, this list of conditions and the following
> > + *        disclaimer in the documentation and/or other materials
> > + *        provided with the distribution.
> > + *
> > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> > + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + *
> > + */
> > +
> > +#include <errno.h>
> > +#include <infiniband/ibnetdisc.h>
> > +#include <infiniband/umad.h>
> > +#include "internal.h"
> > +
> > +void
> > +queue_smp(smp_engine_t *engine, ibnd_smp_t *smp)
> > +{
> > +   smp->qnext = NULL;
> > +   if (!engine->smp_queue_head) {
> > +           engine->smp_queue_head = smp;
> > +           engine->smp_queue_tail = smp;
> > +   } else {
> > +           engine->smp_queue_tail->qnext = smp;
> > +           engine->smp_queue_tail = smp;
> > +   }
> > +}
> > +
> > +ibnd_smp_t *
> > +get_smp(smp_engine_t *engine)
> > +{
> > +   ibnd_smp_t *head = engine->smp_queue_head;
> > +   ibnd_smp_t *tail = engine->smp_queue_tail;
> > +   ibnd_smp_t *rc = head;
> > +   if (head) {
> > +           if (tail == head)
> > +                   engine->smp_queue_tail = NULL;
> > +           engine->smp_queue_head = head->qnext;
> > +   }
> > +   return rc;
> > +}
> > +
> > +int send_smp(ibnd_smp_t * smp, struct ibmad_port *srcport)
> > +{
> > +   int rc = 0;
> > +   uint8_t umad[1024];
> > +   ib_rpc_t *rpc = &smp->rpc;
> > +
> > +   memset(umad, 0, umad_size() + IB_MAD_SIZE);
> > +
> > +   if ((rc = mad_build_pkt(umad, &smp->rpc, &smp->path, NULL, NULL))
> > +       < 0) {
> > +           IBND_ERROR("mad_build_pkt failed; %d", rc);
> > +           return rc;
> > +   }
> > +
> > +   if ((rc = umad_send(mad_rpc_portid(srcport),
> > +                       mad_rpc_class_agent(srcport, rpc->mgtclass),
> > +                       umad, IB_MAD_SIZE,
> > +                       mad_get_timeout(srcport, rpc->timeout),
> > +                       mad_get_retries(srcport))) < 0) {
> > +           IBND_ERROR("send failed; %d", rc);
> > +           return rc;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +static int process_smp_queue(smp_engine_t *engine)
> > +{
> > +   int rc = 0;
> > +   ibnd_smp_t *smp;
> > +   while (cl_qmap_count(&engine->smps_on_wire) < engine->max_smps_on_wire) 
> > {
> > +           smp = get_smp(engine);
> > +           if (!smp)
> > +                   return 0;
> > +
> > +           cl_qmap_insert(&engine->smps_on_wire, (uint32_t)smp->rpc.trid,
> > +                          (cl_map_item_t *)smp);
> > +           if ((rc = send_smp(smp, engine->ibmad_port)) != 0)
> > +                   return rc;
> > +   }
> > +   return 0;
> > +}
> > +
> > +int issue_smp(smp_engine_t *engine, ib_portid_t * portid,
> > +         unsigned attrid, unsigned mod,
> > +         smp_comp_cb_t cb, void * cb_data)
> > +{
> > +   ibnd_smp_t *smp = calloc(1, sizeof *smp);
> > +   if (!smp) {
> > +           IBND_ERROR("OOM");
> > +           return -ENOMEM;
> > +   }
> > +
> > +   smp->cb = cb;
> > +   smp->cb_data = cb_data;
> > +   smp->path = *portid;
> > +   smp->rpc.method = IB_MAD_METHOD_GET;
> > +   smp->rpc.attr.id = attrid;
> > +   smp->rpc.attr.mod = mod;
> > +   smp->rpc.timeout = mad_get_timeout(engine->ibmad_port, 0);
> > +   smp->rpc.datasz = IB_SMP_DATA_SIZE;
> > +   smp->rpc.dataoffs = IB_SMP_DATA_OFFS;
> > +   smp->rpc.trid = mad_trid();
> 
> BTW, by reviewing mad_trid() code in libibmad:
> 
> uint64_t mad_trid(void)
> {
>       static uint64_t base;
>       static uint64_t trid;
>       uint64_t next;
> 
>       if (!base) {
>               srandom((int)time(0) * getpid());
>               base = random();
>               trid = random();
>       }
>       next = ++trid | (base << 32);
>       return next;
> }
> 
> For me it doesn't look as thread safe function.

It's not!  The whole of libibmad is not thread safe!  (Ok perhaps just the 
parts I was using before.)  But we are not using threads here, right?

> 
> > +
> > +   if ((portid->lid <= 0) ||
> > +       (portid->drpath.drslid == 0xffff) ||
> > +       (portid->drpath.drdlid == 0xffff))
> > +           smp->rpc.mgtclass = IB_SMI_DIRECT_CLASS; /* direct SMI */
> > +   else
> > +           smp->rpc.mgtclass = IB_SMI_CLASS; /* Lid routed SMI */
> > +
> > +   portid->sl = 0;
> > +   portid->qp = 0;
> > +
> > +   engine->num_smps_outstanding++;
> > +   queue_smp(engine, smp);
> > +   return (process_smp_queue(engine));
> > +}
> > +
> > +int process_one_recv(smp_engine_t *engine)
> > +{
> > +   int rc = 0;
> > +   int status = 0;
> > +   ibnd_smp_t *smp;
> > +   uint8_t *mad;
> > +   uint32_t trid;
> > +   uint8_t umad[umad_size() + IB_MAD_SIZE];
> > +   int length = umad_size() + IB_MAD_SIZE;
> > +
> > +   memset(umad, 0, sizeof(umad));
> > +
> > +   /* wait for the next message */
> > +   if ((rc = umad_recv(mad_rpc_portid(engine->ibmad_port), umad, &length,
> > +                       0)) < 0) {
> > +           if (rc == -EWOULDBLOCK)
> > +                   return 0;
> > +           IBND_ERROR("umad_recv failed: %d\n", rc);
> > +           return -1;
> > +   }
> > +
> > +   rc = process_smp_queue(engine);
> > +
> > +   mad = umad_get_mad(umad);
> > +   trid = (uint32_t)mad_get_field64(mad, 0, IB_MAD_TRID_F);
> > +
> > +   smp = (ibnd_smp_t *)cl_qmap_remove(&engine->smps_on_wire, trid);
> > +   if ((cl_map_item_t *)smp == cl_qmap_end(&engine->smps_on_wire)) {
> > +           IBND_ERROR("Failed to find matching smp for trid (%x)\n",
> > +                      trid);
> > +           return -1;
> > +   }
> > +
> > +   if (rc)
> > +           goto error;
> > +
> > +   if ((status = umad_status(umad))) {
> > +           IBND_ERROR("umad (%s Attr 0x%x:%u) bad status %d; %s\n",
> > +                   portid2str(&smp->path),
> > +                   smp->rpc.attr.id, smp->rpc.attr.mod,
> > +                   status, strerror(status));
> > +   } else if ((status = mad_get_field(mad, 0, IB_DRSMP_STATUS_F))) {
> > +                   IBND_ERROR("mad (%s Attr 0x%x:%u) bad status 0x%x\n",
> > +                              portid2str(&smp->path),
> > +                              smp->rpc.attr.id, smp->rpc.attr.mod,
> > +                              status);
> > +   } else
> > +           rc = smp->cb(engine, smp, mad, smp->cb_data);
> > +
> > +error:
> > +   free(smp);
> > +   engine->num_smps_outstanding--;
> > +   return (rc);
> > +}
> > +
> > +void smp_engine_init(smp_engine_t * engine, struct ibmad_port *ibmad_port,
> > +                void * user_data, int max_smps_on_wire)
> > +{
> > +   memset(engine, '\0', sizeof(*engine));
> > +   engine->ibmad_port = ibmad_port;
> > +   engine->user_data = user_data;
> > +   cl_qmap_init(&engine->smps_on_wire);
> > +   engine->num_smps_outstanding = 0;
> > +   engine->max_smps_on_wire = max_smps_on_wire;
> > +}
> > +
> > +void smp_engine_destroy(smp_engine_t *engine)
> > +{
> > +   cl_map_item_t *item;
> > +   ibnd_smp_t *smp;
> > +
> > +   /* remove queued smps */
> > +   smp = get_smp(engine);
> > +   if (smp)
> > +           IBND_ERROR("outstanding SMP's\n");
> > +   for (/* */; smp; smp = get_smp(engine)) {
> > +           free(smp);
> > +   }
> > +
> > +   /* remove smps from the wire queue */
> > +   item = cl_qmap_head(&engine->smps_on_wire);
> > +   if (item != cl_qmap_end(&engine->smps_on_wire))
> > +           IBND_ERROR("outstanding SMP's on wire\n");
> > +   for (/* */; item != cl_qmap_end(&engine->smps_on_wire);
> > +        item = cl_qmap_head(&engine->smps_on_wire)) {
> > +           cl_qmap_remove_item(&engine->smps_on_wire, item);
> > +           free(item);
> > +   }
> > +
> > +   engine->num_smps_outstanding = 0;
> > +}
> > +
> > +int process_mads(smp_engine_t *engine)
> > +{
> > +   int rc = 0;
> > +   while (engine->num_smps_outstanding > 0) {
> > +           if ((rc = process_smp_queue(engine)) != 0)
> > +                   return rc;
> 
> Is it really needed to run process_smp_queue() here (assuming that it is
> already executed in issue_smp() and process_one_recv())? Or this is just
> for handling process_one_recv() error case? If so wouldn't it better to
> run it there?

Yes I was looking at that yesterday.  Since we are not threaded I just wanted 
to make sure that there were some messages on the wire before we went into the 
processing loop.  I will send a patch after I have tested everything out.

Ira

> 
> Sasha
> 
> > +           while (!cl_is_qmap_empty(&engine->smps_on_wire))
> > +                   if ((rc = process_one_recv(engine)) != 0)
> > +                           return rc;
> > +   }
> > +   return 0;
> > +}
> > +
> > -- 
> > 1.5.4.5
> > 


-- 
Ira Weiny
Math Programmer/Computer Scientist
Lawrence Livermore National Lab
925-423-8008
wei...@llnl.gov
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to