On 12:49 Thu 18 Feb     , Ira Weiny wrote:
> 
> From: Ira Weiny <wei...@llnl.gov>
> Date: Fri, 22 Jan 2010 17:33:30 -0800
> Subject: [PATCH] libibnetdisc: Convert to a multi-smp algorithm
> 
> v3: change DEFAULT_MAX_SMP_ON_WIRE to 2
> 
>       Allow for multiple SMP's to be on the wire at a single time.  This
>       algorithm splits the processing of SMP's to a small smp engine which
>       may be useful to split out in the future.
> 
> Signed-off-by: Ira Weiny <wei...@llnl.gov>

Applied. Thanks.

However see some comments and questions below.

> +static int recv_port_info(smp_engine_t *engine, ibnd_smp_t * smp,
> +                       uint8_t *mad, void *cb_data)
> +{
> +     ibnd_fabric_t *fabric = ((ibnd_scan_t *)engine->user_data)->fabric;
> +     ibnd_node_t *node = (ibnd_node_t *)cb_data;
> +     ibnd_port_t *port;
> +     uint8_t *port_info = mad + IB_SMP_DATA_OFFS;
> +     uint8_t port_num, local_port;
> +
> +     port_num = mad_get_field(mad, 0, IB_MAD_ATTRMOD_F);
> +     local_port = mad_get_field(port_info, 0, IB_PORT_LOCAL_PORT_F);
> +
> +     /* this may have been created before */
> +     port = node->ports[port_num];
> +     if (!port) {
> +             port = node->ports[port_num] = calloc(1, sizeof(*port));
> +             if (!port) {
> +                     IBND_ERROR("Failed to allocate port\n");
> +                     return -1;
> +             }
> +     }
> +
> +     memcpy(port->info, port_info, sizeof(port->info));
> +     port->node = node;
> +     port->portnum = port_num;
> +     port->ext_portnum = 0;
> +     port->base_lid = (uint16_t) mad_get_field(port->info, 0, IB_PORT_LID_F);
> +     port->lmc = (uint8_t) mad_get_field(port->info, 0, IB_PORT_LMC_F);
> +
> +     if (port_num == 0) {
> +             node->smalid = port->base_lid;
> +             node->smalmc = port->lmc;
> +     } else if (node->type == IB_NODE_SWITCH) {
> +             port->base_lid = node->smalid;
> +             port->lmc = node->smalmc;
> +     }
> +
> +     add_to_portguid_hash(port, fabric->portstbl);
> +
> +     debug_port(&smp->path, port);
>  
> -     mad_dump_node_type(type, 64, &node->type, sizeof(int));
> -     printf("%s -> %s %s {%016" PRIx64 "} portnum %d base lid %d-%d\"%s\"\n",
> -            portid2str(path), prompt, type, node->guid,
> -            node->type == IB_NODE_SWITCH ? 0 : port->portnum,
> -            port->base_lid,
> -            port->base_lid + (1 << port->lmc) - 1, node->nodedesc);
> +     if (port_num &&
> +         (mad_get_field(port->info, 0, IB_PORT_PHYS_STATE_F)
> +         == IB_PORT_PHYS_STATE_LINKUP)
> +             &&
> +         (node->type == IB_NODE_SWITCH || node == fabric->from_node)) {

What will happen when ibnetdiscover runs from host connected by two
ports to different subnets? Wouldn't it run over both fabrics (following
PortInfo querying loop in recv_node_info() below)?

> +
> +             ib_portid_t path = smp->path;
> +             if (extend_dpath(engine, &path, port_num) != -1)
> +                     query_node_info(engine, &path, node);
> +     }
> +
> +     return 0;
> +}

[snip]

> +static int recv_node_info(smp_engine_t *engine, ibnd_smp_t * smp,
> +                       uint8_t *mad, void *cb_data)
> +{
> +     ibnd_fabric_t *fabric = ((ibnd_scan_t *)engine->user_data)->fabric;
> +     int i = 0;
> +     uint8_t *node_info = mad + IB_SMP_DATA_OFFS;
> +     ibnd_node_t * rem_node = (ibnd_node_t *)cb_data;
>       ibnd_node_t *node;
> +     int node_is_new = 0;
> +     uint64_t node_guid = mad_get_field64(node_info, 0, IB_NODE_GUID_F);
> +     uint64_t port_guid = mad_get_field64(node_info, 0, IB_NODE_PORT_GUID_F);
> +     int port_num = mad_get_field(node_info, 0, IB_NODE_LOCAL_PORT_F);
> +     ibnd_port_t *port = NULL;
>  
> -     for (node = fabric->nodestbl[hash]; node; node = node->htnext)
> -             if (node->guid == new->guid)
> -                     return node;
> +     node = ibnd_find_node_guid(fabric, node_guid);
> +     if (!node) {
> +             node = create_node(engine, &smp->path, node_info);
> +             if (!node)
> +                     return -1;
> +             node_is_new = 1;
> +     }
> +     IBND_DEBUG("Found %s node GUID %lx (%s)\n",
> +                (node_is_new) ? "new": "old", node->guid,
> +                portid2str(&smp->path));
>  
> -     return NULL;
> +     port = node->ports[port_num];
> +     if (!port) {
> +             /* If we have not see this port before create a shell for it */
> +             port = node->ports[port_num] = calloc(1, sizeof(*port));
> +             port->node = node;
> +             port->portnum = port_num;
> +     }
> +     port->guid = port_guid;
> +
> +     if (rem_node == NULL) /* this is the start node */
> +             fabric->from_node = node;
> +     else {
> +             /* link ports... */
> +             int rem_port_num = get_last_port(&smp->path);
> +
> +             if (!rem_node->ports[rem_port_num]) {
> +                     IBND_ERROR("Internal Error; "
> +                                "Node(%p) %lx Port %d no port 
> created!?!?!?\n\n",
> +                                rem_node, rem_node->guid, rem_port_num);
> +                     return (-1);
> +             }
> +
> +             link_ports(node, port, rem_node, rem_node->ports[rem_port_num]);
> +     }
> +
> +     if (!node_is_new)
> +             return 0;
> +
> +     query_node_desc(engine, &smp->path, node);
> +
> +     if (node->type == IB_NODE_SWITCH)
> +             query_switch_info(engine, &smp->path, node);
> +
> +     /* process all the ports on this node */
> +     for (i = (node->type == IB_NODE_SWITCH) ? 0 : 1;
> +             i <= node->numports; i++) {
> +                     query_port_info(engine, &smp->path, node, i);
> +     }

This one.

> +
> +     return 0;
> +}

[snip]

> diff --git a/infiniband-diags/libibnetdisc/src/internal.h 
> b/infiniband-diags/libibnetdisc/src/internal.h
> index 348bd0f..61b644d 100644
> --- a/infiniband-diags/libibnetdisc/src/internal.h
> +++ b/infiniband-diags/libibnetdisc/src/internal.h
> @@ -39,6 +39,7 @@
>  #define _INTERNAL_H_
>  
>  #include <infiniband/ibnetdisc.h>
> +#include <complib/cl_qmap.h>

I'm not really happy with complib dependency introduction - we had an
issues with it in the past.

Also I think that we can implement simpler transaction tracker (likely
even more efficient) by storing sent MADs in cyclic array and encoding
its index as part of TRID.

>  
>  #define      IBND_DEBUG(fmt, ...) \
>       if (ibdebug) { \
> @@ -52,16 +53,44 @@
>  
>  #define MAXHOPS         63
>  
> -typedef struct ibnd_node_scan {
> -     ibnd_node_t *node;
> -     struct ibnd_node_scan *dnext;   /* nodesdist next */
> -} ibnd_node_scan_t;
> +#define DEFAULT_MAX_SMP_ON_WIRE 2
>  
>  typedef struct ibnd_scan {
> -     ibnd_node_scan_t *nodesdist[MAXHOPS + 1];
>       ib_portid_t selfportid;
> +     ibnd_fabric_t *fabric;
>  } ibnd_scan_t;
>  
> +
> +typedef struct ibnd_smp ibnd_smp_t;
> +typedef struct smp_engine smp_engine_t;
> +typedef int (*smp_comp_cb_t)(smp_engine_t *engine, ibnd_smp_t * smp,
> +                          uint8_t *mad_resp, void *cb_data);
> +struct ibnd_smp {
> +     cl_map_item_t on_wire;
> +     struct ibnd_smp * qnext;
> +     smp_comp_cb_t cb;
> +     void * cb_data;
> +     ib_portid_t path;
> +     ib_rpc_t rpc;
> +};
> +struct smp_engine {
> +     struct ibmad_port *ibmad_port;
> +     ibnd_smp_t *smp_queue_head;
> +     ibnd_smp_t *smp_queue_tail;
> +     void * user_data;
> +     cl_qmap_t smps_on_wire;
> +     int num_smps_outstanding;
> +     int max_smps_on_wire;
> +};
> +
> +void smp_engine_init(smp_engine_t * engine, struct ibmad_port *ibmad_port,
> +                  void * user_data, int max_smps_on_wire);
> +int issue_smp(smp_engine_t *engine, ib_portid_t * portid,
> +           unsigned attrid, unsigned mod,
> +           smp_comp_cb_t cb, void * cb_data);
> +int process_mads(smp_engine_t *engine);
> +void smp_engine_destroy(smp_engine_t *engine);
> +
>  void add_to_nodeguid_hash(ibnd_node_t * node, ibnd_node_t * hash[]);
>  
>  void add_to_portguid_hash(ibnd_port_t * port, ibnd_port_t * hash[]);
> diff --git a/infiniband-diags/libibnetdisc/src/query_smp.c 
> b/infiniband-diags/libibnetdisc/src/query_smp.c
> new file mode 100644
> index 0000000..5571314
> --- /dev/null
> +++ b/infiniband-diags/libibnetdisc/src/query_smp.c
> @@ -0,0 +1,249 @@
> +/*
> + * Copyright (c) 2010 Lawrence Livermore National Laboratory
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + *     Redistribution and use in source and binary forms, with or
> + *     without modification, are permitted provided that the following
> + *     conditions are met:
> + *
> + *      - Redistributions of source code must retain the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer.
> + *
> + *      - Redistributions in binary form must reproduce the above
> + *        copyright notice, this list of conditions and the following
> + *        disclaimer in the documentation and/or other materials
> + *        provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + *
> + */
> +
> +#include <errno.h>
> +#include <infiniband/ibnetdisc.h>
> +#include <infiniband/umad.h>
> +#include "internal.h"
> +
> +void
> +queue_smp(smp_engine_t *engine, ibnd_smp_t *smp)
> +{
> +     smp->qnext = NULL;
> +     if (!engine->smp_queue_head) {
> +             engine->smp_queue_head = smp;
> +             engine->smp_queue_tail = smp;
> +     } else {
> +             engine->smp_queue_tail->qnext = smp;
> +             engine->smp_queue_tail = smp;
> +     }
> +}
> +
> +ibnd_smp_t *
> +get_smp(smp_engine_t *engine)
> +{
> +     ibnd_smp_t *head = engine->smp_queue_head;
> +     ibnd_smp_t *tail = engine->smp_queue_tail;
> +     ibnd_smp_t *rc = head;
> +     if (head) {
> +             if (tail == head)
> +                     engine->smp_queue_tail = NULL;
> +             engine->smp_queue_head = head->qnext;
> +     }
> +     return rc;
> +}
> +
> +int send_smp(ibnd_smp_t * smp, struct ibmad_port *srcport)
> +{
> +     int rc = 0;
> +     uint8_t umad[1024];
> +     ib_rpc_t *rpc = &smp->rpc;
> +
> +     memset(umad, 0, umad_size() + IB_MAD_SIZE);
> +
> +     if ((rc = mad_build_pkt(umad, &smp->rpc, &smp->path, NULL, NULL))
> +         < 0) {
> +             IBND_ERROR("mad_build_pkt failed; %d", rc);
> +             return rc;
> +     }
> +
> +     if ((rc = umad_send(mad_rpc_portid(srcport),
> +                         mad_rpc_class_agent(srcport, rpc->mgtclass),
> +                         umad, IB_MAD_SIZE,
> +                         mad_get_timeout(srcport, rpc->timeout),
> +                         mad_get_retries(srcport))) < 0) {
> +             IBND_ERROR("send failed; %d", rc);
> +             return rc;
> +     }
> +
> +     return 0;
> +}
> +
> +static int process_smp_queue(smp_engine_t *engine)
> +{
> +     int rc = 0;
> +     ibnd_smp_t *smp;
> +     while (cl_qmap_count(&engine->smps_on_wire) < engine->max_smps_on_wire) 
> {
> +             smp = get_smp(engine);
> +             if (!smp)
> +                     return 0;
> +
> +             cl_qmap_insert(&engine->smps_on_wire, (uint32_t)smp->rpc.trid,
> +                            (cl_map_item_t *)smp);
> +             if ((rc = send_smp(smp, engine->ibmad_port)) != 0)
> +                     return rc;
> +     }
> +     return 0;
> +}
> +
> +int issue_smp(smp_engine_t *engine, ib_portid_t * portid,
> +           unsigned attrid, unsigned mod,
> +           smp_comp_cb_t cb, void * cb_data)
> +{
> +     ibnd_smp_t *smp = calloc(1, sizeof *smp);
> +     if (!smp) {
> +             IBND_ERROR("OOM");
> +             return -ENOMEM;
> +     }
> +
> +     smp->cb = cb;
> +     smp->cb_data = cb_data;
> +     smp->path = *portid;
> +     smp->rpc.method = IB_MAD_METHOD_GET;
> +     smp->rpc.attr.id = attrid;
> +     smp->rpc.attr.mod = mod;
> +     smp->rpc.timeout = mad_get_timeout(engine->ibmad_port, 0);
> +     smp->rpc.datasz = IB_SMP_DATA_SIZE;
> +     smp->rpc.dataoffs = IB_SMP_DATA_OFFS;
> +     smp->rpc.trid = mad_trid();

BTW, by reviewing mad_trid() code in libibmad:

uint64_t mad_trid(void)
{
        static uint64_t base;
        static uint64_t trid;
        uint64_t next;

        if (!base) {
                srandom((int)time(0) * getpid());
                base = random();
                trid = random();
        }
        next = ++trid | (base << 32);
        return next;
}

For me it doesn't look as thread safe function.

> +
> +     if ((portid->lid <= 0) ||
> +         (portid->drpath.drslid == 0xffff) ||
> +         (portid->drpath.drdlid == 0xffff))
> +             smp->rpc.mgtclass = IB_SMI_DIRECT_CLASS; /* direct SMI */
> +     else
> +             smp->rpc.mgtclass = IB_SMI_CLASS; /* Lid routed SMI */
> +
> +     portid->sl = 0;
> +     portid->qp = 0;
> +
> +     engine->num_smps_outstanding++;
> +     queue_smp(engine, smp);
> +     return (process_smp_queue(engine));
> +}
> +
> +int process_one_recv(smp_engine_t *engine)
> +{
> +     int rc = 0;
> +     int status = 0;
> +     ibnd_smp_t *smp;
> +     uint8_t *mad;
> +     uint32_t trid;
> +     uint8_t umad[umad_size() + IB_MAD_SIZE];
> +     int length = umad_size() + IB_MAD_SIZE;
> +
> +     memset(umad, 0, sizeof(umad));
> +
> +     /* wait for the next message */
> +     if ((rc = umad_recv(mad_rpc_portid(engine->ibmad_port), umad, &length,
> +                         0)) < 0) {
> +             if (rc == -EWOULDBLOCK)
> +                     return 0;
> +             IBND_ERROR("umad_recv failed: %d\n", rc);
> +             return -1;
> +     }
> +
> +     rc = process_smp_queue(engine);
> +
> +     mad = umad_get_mad(umad);
> +     trid = (uint32_t)mad_get_field64(mad, 0, IB_MAD_TRID_F);
> +
> +     smp = (ibnd_smp_t *)cl_qmap_remove(&engine->smps_on_wire, trid);
> +     if ((cl_map_item_t *)smp == cl_qmap_end(&engine->smps_on_wire)) {
> +             IBND_ERROR("Failed to find matching smp for trid (%x)\n",
> +                        trid);
> +             return -1;
> +     }
> +
> +     if (rc)
> +             goto error;
> +
> +     if ((status = umad_status(umad))) {
> +             IBND_ERROR("umad (%s Attr 0x%x:%u) bad status %d; %s\n",
> +                     portid2str(&smp->path),
> +                     smp->rpc.attr.id, smp->rpc.attr.mod,
> +                     status, strerror(status));
> +     } else if ((status = mad_get_field(mad, 0, IB_DRSMP_STATUS_F))) {
> +                     IBND_ERROR("mad (%s Attr 0x%x:%u) bad status 0x%x\n",
> +                                portid2str(&smp->path),
> +                                smp->rpc.attr.id, smp->rpc.attr.mod,
> +                                status);
> +     } else
> +             rc = smp->cb(engine, smp, mad, smp->cb_data);
> +
> +error:
> +     free(smp);
> +     engine->num_smps_outstanding--;
> +     return (rc);
> +}
> +
> +void smp_engine_init(smp_engine_t * engine, struct ibmad_port *ibmad_port,
> +                  void * user_data, int max_smps_on_wire)
> +{
> +     memset(engine, '\0', sizeof(*engine));
> +     engine->ibmad_port = ibmad_port;
> +     engine->user_data = user_data;
> +     cl_qmap_init(&engine->smps_on_wire);
> +     engine->num_smps_outstanding = 0;
> +     engine->max_smps_on_wire = max_smps_on_wire;
> +}
> +
> +void smp_engine_destroy(smp_engine_t *engine)
> +{
> +     cl_map_item_t *item;
> +     ibnd_smp_t *smp;
> +
> +     /* remove queued smps */
> +     smp = get_smp(engine);
> +     if (smp)
> +             IBND_ERROR("outstanding SMP's\n");
> +     for (/* */; smp; smp = get_smp(engine)) {
> +             free(smp);
> +     }
> +
> +     /* remove smps from the wire queue */
> +     item = cl_qmap_head(&engine->smps_on_wire);
> +     if (item != cl_qmap_end(&engine->smps_on_wire))
> +             IBND_ERROR("outstanding SMP's on wire\n");
> +     for (/* */; item != cl_qmap_end(&engine->smps_on_wire);
> +          item = cl_qmap_head(&engine->smps_on_wire)) {
> +             cl_qmap_remove_item(&engine->smps_on_wire, item);
> +             free(item);
> +     }
> +
> +     engine->num_smps_outstanding = 0;
> +}
> +
> +int process_mads(smp_engine_t *engine)
> +{
> +     int rc = 0;
> +     while (engine->num_smps_outstanding > 0) {
> +             if ((rc = process_smp_queue(engine)) != 0)
> +                     return rc;

Is it really needed to run process_smp_queue() here (assuming that it is
already executed in issue_smp() and process_one_recv())? Or this is just
for handling process_one_recv() error case? If so wouldn't it better to
run it there?

Sasha

> +             while (!cl_is_qmap_empty(&engine->smps_on_wire))
> +                     if ((rc = process_one_recv(engine)) != 0)
> +                             return rc;
> +     }
> +     return 0;
> +}
> +
> -- 
> 1.5.4.5
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to