Hi Nicolas,

On 17:26 Tue 14 Jul     , Nicolas Morey-Chaisemartin wrote:
> Support for the xmit_wait counter was missing in the perfmgr, although it was
> already read from the MAD and the event plugin interface already handles it.
> This patch adds support for it (tested and working with an event plugin)
> 
> Tested-by: Jean-Vincent Ficet <[email protected]>
> Signed-off-by: Nicolas Morey-Chaisemartin <[email protected]>
> ---
> I think emails got mixed up the first time so here it is again.
> 
>  opensm/include/opensm/osm_perfmgr_db.h |   23 ++++++-
>  opensm/opensm/osm_perfmgr.c            |   30 +++++++-
>  opensm/opensm/osm_perfmgr_db.c         |  124 +++++++++++++++++++++++++++++--
>  3 files changed, 166 insertions(+), 11 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_perfmgr_db.h b/opensm/include/opensm/osm_perfmgr_db.h
> index 42a47bd..35b5ac3 100644
> --- a/opensm/include/opensm/osm_perfmgr_db.h
> +++ b/opensm/include/opensm/osm_perfmgr_db.h
> @@ -109,6 +109,14 @@ typedef struct {
>  } perfmgr_db_data_cnt_reading_t;
>  
>  /** =========================================================================
> + * Port select count reading
> + */
> +typedef struct {
> +     uint64_t xmit_wait;
> +     time_t time;
> +} perfmgr_db_sel_reading_t;
> +

Why do we need a separate structure for this counter?

Sasha


> +/** =========================================================================
>   * Dump output options
>   */
>  typedef enum {
> @@ -125,6 +133,8 @@ typedef struct db_port {
>       perfmgr_db_err_reading_t err_previous;
>       perfmgr_db_data_cnt_reading_t dc_total;
>       perfmgr_db_data_cnt_reading_t dc_previous;
> +     perfmgr_db_sel_reading_t ps_total;
> +     perfmgr_db_sel_reading_t ps_previous;
>       time_t last_reset;
>  } db_port_t;
>  
> @@ -179,7 +189,16 @@ perfmgr_db_err_t perfmgr_db_get_prev_dc(perfmgr_db_t * db, uint64_t guid,
>                                       reading);
>  perfmgr_db_err_t perfmgr_db_clear_prev_dc(perfmgr_db_t * db, uint64_t guid,
>                                         uint8_t port);
> -
> +perfmgr_db_err_t perfmgr_db_add_ps_reading(perfmgr_db_t * db, uint64_t guid,
> +                                        uint8_t port,
> +                                        perfmgr_db_sel_reading_t *
> +                                        reading);
> +perfmgr_db_err_t perfmgr_db_get_prev_ps(perfmgr_db_t * db, uint64_t guid,
> +                                     uint8_t port,
> +                                     perfmgr_db_sel_reading_t *
> +                                     reading);
> +perfmgr_db_err_t perfmgr_db_clear_prev_ps(perfmgr_db_t * db, uint64_t guid,
> +                                       uint8_t port);
>  void perfmgr_db_clear_counters(perfmgr_db_t * db);
>  perfmgr_db_err_t perfmgr_db_dump(perfmgr_db_t * db, char *file,
>                                perfmgr_db_dump_t dump_type);
> @@ -196,6 +215,8 @@ void perfmgr_db_fill_data_cnt_read_pc(ib_port_counters_t * wire_read,
>                                     perfmgr_db_data_cnt_reading_t * reading);
>  void perfmgr_db_fill_data_cnt_read_epc(ib_port_counters_ext_t * wire_read,
>                                      perfmgr_db_data_cnt_reading_t * reading);
> +void perfmgr_db_fill_sel_read(ib_port_counters_t * wire_read,
> +                                   perfmgr_db_sel_reading_t * reading);
>  
>  END_C_DECLS
>  
> diff --git a/opensm/opensm/osm_perfmgr.c b/opensm/opensm/osm_perfmgr.c
> index ecfdbda..8a9eb12 100644
> --- a/opensm/opensm/osm_perfmgr.c
> +++ b/opensm/opensm/osm_perfmgr.c
> @@ -853,10 +853,12 @@ void osm_perfmgr_destroy(osm_perfmgr_t * pm)
>  static void perfmgr_check_oob_clear(osm_perfmgr_t * pm,
>                                   monitored_node_t * mon_node, uint8_t port,
>                                   perfmgr_db_err_reading_t * cr,
> -                                 perfmgr_db_data_cnt_reading_t * dc)
> +                                 perfmgr_db_data_cnt_reading_t * dc,
> +                                 perfmgr_db_sel_reading_t * ps)
>  {
>       perfmgr_db_err_reading_t prev_err;
>       perfmgr_db_data_cnt_reading_t prev_dc;
> +     perfmgr_db_sel_reading_t prev_ps;
>  
>       if (perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_err)
>           != PERFMGR_EVENT_DB_SUCCESS) {
> @@ -905,6 +907,23 @@ static void perfmgr_check_oob_clear(osm_perfmgr_t * pm,
>                       mon_node->name, mon_node->guid, port);
>               perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
>       }
> +
> +     if (perfmgr_db_get_prev_ps(pm->db, mon_node->guid, port, &prev_ps)
> +         != PERFMGR_EVENT_DB_SUCCESS) {
> +             OSM_LOG(pm->log, OSM_LOG_VERBOSE,
> +                     "Failed to find previous select count "
> +                     "reading for %s (0x%" PRIx64 ") port %u\n",
> +                     mon_node->name, mon_node->guid, port);
> +             return;
> +     }
> +
> +     if (ps->xmit_wait < prev_ps.xmit_wait) {
> +             OSM_LOG(pm->log, OSM_LOG_ERROR,
> +                     "PerfMgr: ERR 4C17: Detected an out of band select counter "
> +                     "clear on node %s (0x%" PRIx64 ") port %u\n",
> +                     mon_node->name, mon_node->guid, port);
> +             perfmgr_db_clear_prev_ps(pm->db, mon_node->guid, port);
> +     }
>  }
>  
>  /**********************************************************************
> @@ -1062,6 +1081,8 @@ static void pc_recv_process(void *context, void *data)
>       uint8_t port = mad_context->perfmgr_context.port;
>       perfmgr_db_err_reading_t err_reading;
>       perfmgr_db_data_cnt_reading_t data_reading;
> +     perfmgr_db_sel_reading_t select_reading;
> +
>       cl_map_item_t *p_node;
>       monitored_node_t *p_mon_node;
>  
> @@ -1148,10 +1169,12 @@ static void pc_recv_process(void *context, void *data)
>        */
>       perfmgr_db_fill_data_cnt_read_pc(wire_read, &data_reading);
>  
> +     perfmgr_db_fill_sel_read(wire_read, &select_reading);
> +
>       /* detect an out of band clear on the port */
>       if (mad_context->perfmgr_context.mad_method != IB_MAD_METHOD_SET)
>               perfmgr_check_oob_clear(pm, p_mon_node, port, &err_reading,
> -                                     &data_reading);
> +                                     &data_reading, &select_reading);
>  
>       /* log any critical events from this reading */
>       perfmgr_log_events(pm, p_mon_node, port, &err_reading);
> @@ -1161,9 +1184,12 @@ static void pc_recv_process(void *context, void *data)
>                                          &err_reading);
>               perfmgr_db_add_dc_reading(pm->db, node_guid, port,
>                                         &data_reading);
> +             perfmgr_db_add_ps_reading(pm->db, node_guid, port,
> +                                       &select_reading);
>       } else {
>               perfmgr_db_clear_prev_err(pm->db, node_guid, port);
>               perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
> +             perfmgr_db_clear_prev_ps(pm->db, node_guid, port);
>       }
>  
>       perfmgr_check_overflow(pm, p_mon_node, port, wire_read);
> diff --git a/opensm/opensm/osm_perfmgr_db.c b/opensm/opensm/osm_perfmgr_db.c
> index e5dfc19..132c2fb 100644
> --- a/opensm/opensm/osm_perfmgr_db.c
> +++ b/opensm/opensm/osm_perfmgr_db.c
> @@ -486,6 +486,102 @@ Exit:
>       return (rc);
>  }
>  
> +static inline void
> +debug_dump_ps_reading(perfmgr_db_t * db, uint64_t guid, uint8_t port_num,
> +                   db_port_t * port, perfmgr_db_sel_reading_t * cur)
> +{
> +     osm_log_t *log = db->perfmgr->log;
> +     if (!osm_log_is_active(log, OSM_LOG_DEBUG))
> +             return;
> +
> +     osm_log(log, OSM_LOG_DEBUG,
> +             "xd %" PRIu64 " <-- %" PRIu64 " (%" PRIu64 ")\n",
> +             cur->xmit_wait, port->ps_previous.xmit_wait,
> +             port->ps_total.xmit_wait);
> +}
> +
> +/**********************************************************************
> + * perfmgr_db_sel_reading_t functions
> + **********************************************************************/
> +perfmgr_db_err_t
> +perfmgr_db_add_ps_reading(perfmgr_db_t * db, uint64_t guid, uint8_t port,
> +                       perfmgr_db_sel_reading_t * reading)
> +{
> +     db_port_t *p_port = NULL;
> +     db_node_t *node = NULL;
> +     perfmgr_db_sel_reading_t *previous = NULL;
> +     perfmgr_db_err_t rc = PERFMGR_EVENT_DB_SUCCESS;
> +     osm_epi_ps_event_t epi_ps_data;
> +
> +     cl_plock_excl_acquire(&db->lock);
> +     node = get(db, guid);
> +     if ((rc = bad_node_port(node, port)) != PERFMGR_EVENT_DB_SUCCESS)
> +             goto Exit;
> +
> +     p_port = &node->ports[port];
> +     previous = &node->ports[port].ps_previous;
> +
> +     debug_dump_ps_reading(db, guid, port, p_port, reading);
> +
> +     epi_ps_data.time_diff_s = reading->time - previous->time;
> +     osm_epi_create_port_id(&epi_ps_data.port_id, guid, port,
> +                            node->node_name);
> +
> +     /* calculate changes from previous reading */
> +     epi_ps_data.xmit_wait = reading->xmit_wait - previous->xmit_wait;
> +     p_port->ps_total.xmit_wait += epi_ps_data.xmit_wait;
> +
> +     p_port->ps_previous = *reading;
> +     osm_opensm_report_event(db->perfmgr->osm,
> +                             OSM_EVENT_ID_PORT_SELECT, &epi_ps_data);
> +
> +Exit:
> +     cl_plock_release(&db->lock);
> +     return (rc);
> +}
> +
> +perfmgr_db_err_t perfmgr_db_get_prev_ps(perfmgr_db_t * db, uint64_t guid,
> +                                     uint8_t port,
> +                                     perfmgr_db_sel_reading_t * reading)
> +{
> +     db_node_t *node = NULL;
> +     perfmgr_db_err_t rc = PERFMGR_EVENT_DB_SUCCESS;
> +
> +     cl_plock_acquire(&db->lock);
> +
> +     node = get(db, guid);
> +     if ((rc = bad_node_port(node, port)) != PERFMGR_EVENT_DB_SUCCESS)
> +             goto Exit;
> +
> +     *reading = node->ports[port].ps_previous;
> +
> +Exit:
> +     cl_plock_release(&db->lock);
> +     return (rc);
> +}
> +
> +perfmgr_db_err_t
> +perfmgr_db_clear_prev_ps(perfmgr_db_t * db, uint64_t guid, uint8_t port)
> +{
> +     db_node_t *node = NULL;
> +     perfmgr_db_sel_reading_t *previous = NULL;
> +     perfmgr_db_err_t rc = PERFMGR_EVENT_DB_SUCCESS;
> +
> +     cl_plock_excl_acquire(&db->lock);
> +     node = get(db, guid);
> +     if ((rc = bad_node_port(node, port)) != PERFMGR_EVENT_DB_SUCCESS)
> +             goto Exit;
> +
> +     previous = &node->ports[port].ps_previous;
> +
> +     memset(previous, 0, sizeof(*previous));
> +     node->ports[port].ps_previous.time = time(NULL);
> +
> +Exit:
> +     cl_plock_release(&db->lock);
> +     return (rc);
> +}
> +
>  static void clear_counters(cl_map_item_t * const p_map_item, void *context)
>  {
>       db_node_t *node = (db_node_t *) p_map_item;
> @@ -517,6 +613,8 @@ static void clear_counters(cl_map_item_t * const p_map_item, void *context)
>               node->ports[i].dc_total.multicast_rcv_pkts = 0;
>               node->ports[i].dc_total.time = ts;
>  
> +             node->ports[i].ps_total.xmit_wait = 0;
> +
>               node->ports[i].last_reset = ts;
>       }
>  }
> @@ -546,7 +644,7 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>               "%s\t%s\t"
>               "%s\t%s\t%s\t%s\t%s\t%s\t%s\t"
>               "%s\t%s\t%s\t%s\t%s\t%s\t%s\t"
> -             "%s\t%s\t%s\t%s\n",
> +             "%s\t%s\t%s\t%s\t%s\n",
>               "symbol_err_cnt",
>               "link_err_recover",
>               "link_downed",
> @@ -565,8 +663,7 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>               "rcv_pkts",
>               "unicast_xmit_pkts",
>               "unicast_rcv_pkts",
> -             "multicast_xmit_pkts",
> -             "multicast_rcv_pkts");
> +             "multicast_xmit_pkts", "multicast_rcv_pkts", "xmit_wait");
>       for (i = (node->esp0) ? 0 : 1; i < node->num_ports; i++) {
>               char *since = ctime(&node->ports[i].last_reset);
>               since[strlen(since) - 1] = '\0';        /* remove \n */
> @@ -577,8 +674,8 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>                       "%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t" "%" PRIu64
>                       "\t%" PRIu64 "\t%" PRIu64 "\t" "%" PRIu64 "\t%" PRIu64
>                       "\t%" PRIu64 "\t%" PRIu64 "\t" "%" PRIu64 "\t%" PRIu64
> -                     "\t%" PRIu64 "\t%" PRIu64 "\n", node->node_name,
> -                     node->node_guid, i, since,
> +                     "\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\n",
> +                     node->node_name, node->node_guid, i, since,
>                       node->ports[i].err_total.symbol_err_cnt,
>                       node->ports[i].err_total.link_err_recover,
>                       node->ports[i].err_total.link_downed,
> @@ -598,7 +695,8 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>                       node->ports[i].dc_total.unicast_xmit_pkts,
>                       node->ports[i].dc_total.unicast_rcv_pkts,
>                       node->ports[i].dc_total.multicast_xmit_pkts,
> -                     node->ports[i].dc_total.multicast_rcv_pkts);
> +                     node->ports[i].dc_total.multicast_rcv_pkts,
> +                     node->ports[i].ps_total.xmit_wait);
>       }
>  }
>  
> @@ -634,7 +732,8 @@ static void dump_node_hr(db_node_t * node, FILE * fp)
>                       "     unicast_xmit_pkts    : %" PRIu64 "\n"
>                       "     unicast_rcv_pkts     : %" PRIu64 "\n"
>                       "     multicast_xmit_pkts  : %" PRIu64 "\n"
> -                     "     multicast_rcv_pkts   : %" PRIu64 "\n",
> +                     "     multicast_rcv_pkts   : %" PRIu64 "\n"
> +                     "     xmit_wait            : %" PRIu64 "\n",
>                       node->node_name,
>                       node->node_guid,
>                       i,
> @@ -658,7 +757,8 @@ static void dump_node_hr(db_node_t * node, FILE * fp)
>                       node->ports[i].dc_total.unicast_xmit_pkts,
>                       node->ports[i].dc_total.unicast_rcv_pkts,
>                       node->ports[i].dc_total.multicast_xmit_pkts,
> -                     node->ports[i].dc_total.multicast_rcv_pkts);
> +                     node->ports[i].dc_total.multicast_rcv_pkts,
> +                     node->ports[i].ps_total.xmit_wait);
>       }
>  }
>  
> @@ -809,4 +909,12 @@ perfmgr_db_fill_data_cnt_read_epc(ib_port_counters_ext_t * wire_read,
>       reading->multicast_rcv_pkts = cl_ntoh64(wire_read->multicast_rcv_pkts);
>       reading->time = time(NULL);
>  }
> +
> +void
> +perfmgr_db_fill_sel_read(ib_port_counters_t * wire_read,
> +                      perfmgr_db_sel_reading_t * reading)
> +{
> +     reading->xmit_wait = cl_ntoh32(wire_read->xmit_wait);
> +     reading->time = time(NULL);
> +}
>  #endif                               /* ENABLE_OSM_PERF_MGR */
> 
_______________________________________________
general mailing list
[email protected]
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to