On Wed, 2010-02-10 at 09:15 -0700, Yevgeny Kliteynik wrote: > Hi Jim, > > [snip...] > > On 20/Nov/09 21:15, Jim Schutt wrote: > > diff --git a/opensm/opensm/osm_qos.c b/opensm/opensm/osm_qos.c > > index 08f9a60..f42c334 100644 > > --- a/opensm/opensm/osm_qos.c > > +++ b/opensm/opensm/osm_qos.c > > @@ -194,6 +194,7 @@ static ib_api_status_t sl2vl_update(osm_sm_t * sm, > > osm_port_t * p_port, > > { > > ib_api_status_t status; > > uint8_t i, num_ports; > > + struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used; > > osm_physp_t *p_physp; > > > > if (osm_node_get_type(osm_physp_get_node_ptr(p)) == > > IB_NODE_TYPE_SWITCH) { > > @@ -213,8 +214,24 @@ static ib_api_status_t sl2vl_update(osm_sm_t * sm, > > osm_port_t * p_port, > > } > > > > for (i = 0; i< num_ports; i++) { > > + ib_slvl_table_t routing_sl2vl; > > + const ib_slvl_table_t *port_sl2vl; > > + const ib_slvl_table_t *port_sl2vl_old; > > + > > + if (re->update_sl2vl) { > > > > If routing failed, and no_fallback specified, OSM crashes here. > The simple fix is, of course, just fixing the condition to > "(re && re->update_sl2vl)", but
This could cause message deadlock for applications still running on parts of fabric that are sill operational, if the last successful routing was via a routing engine that wants to set SL2VL map values, because we would overwrite them with inappropriate values. But the following equivalent change would be OK: diff --git a/opensm/opensm/osm_qos.c b/opensm/opensm/osm_qos.c index 0f0b24f..07f4836 100644 --- a/opensm/opensm/osm_qos.c +++ b/opensm/opensm/osm_qos.c @@ -197,6 +197,12 @@ static ib_api_status_t sl2vl_update(osm_sm_t * sm, osm_port_t * p_port, struct osm_routing_engine *re = sm->p_subn->p_osm->routing_engine_used; osm_physp_t *p_physp; + /* + * Do nothing unless the most recent routing attempt was successful. + */ + if (!re) + return IB_SUCCESS; + if (osm_node_get_type(osm_physp_get_node_ptr(p)) == IB_NODE_TYPE_SWITCH) { if (ib_port_info_get_vl_cap(&p->port_info) == 1) { /* Check port 0's capability mask */ > I think that it would be better > not to apply QoS configuration if unicast manager failed - just > restart the sweep. I think you are right. Something like this? diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c index 10d5e09..d8d4c9e 100644 --- a/opensm/opensm/osm_state_mgr.c +++ b/opensm/opensm/osm_state_mgr.c @@ -1113,7 +1113,11 @@ static void do_sweep(osm_sm_t * sm) /* Re-program the switches fully */ sm->p_subn->ignore_existing_lfts = TRUE; - osm_ucast_mgr_process(&sm->ucast_mgr); + if (osm_ucast_mgr_process(&sm->ucast_mgr)) { + OSM_LOG_MSG_BOX(sm->p_log, OSM_LOG_VERBOSE, + "REROUTE FAILED"); + return; + } osm_qos_setup(sm->p_subn->p_osm); /* Reset flag */ @@ -1272,12 +1276,14 @@ repeat_discovery: "LID ASSIGNMENT COMPLETE - STARTING SWITCH TABLE CONFIG"); /* - * Proceed with unicast forwarding table configuration. + * Proceed with unicast forwarding table configuration; repeat + * if unicast routing fails. */ if (!sm->ucast_mgr.cache_valid || osm_ucast_cache_process(&sm->ucast_mgr)) - osm_ucast_mgr_process(&sm->ucast_mgr); + if (osm_ucast_mgr_process(&sm->ucast_mgr)) + goto repeat_discovery; osm_qos_setup(sm->p_subn->p_osm); diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c index fbc9244..8ea2e52 100644 --- a/opensm/opensm/osm_ucast_mgr.c +++ b/opensm/opensm/osm_ucast_mgr.c @@ -955,6 +955,7 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr) osm_opensm_t *p_osm; struct osm_routing_engine *p_routing_eng; cl_qmap_t *p_sw_guid_tbl; + int failed = 0; OSM_LOG_ENTER(p_mgr->p_log); @@ -973,7 +974,8 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr) p_osm->routing_engine_used = NULL; while (p_routing_eng) { - if (!ucast_mgr_route(p_routing_eng, p_osm)) + failed = ucast_mgr_route(p_routing_eng, p_osm); + if (!failed) break; p_routing_eng = p_routing_eng->next; } @@ -984,9 +986,11 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr) struct osm_routing_engine *r = p_osm->default_routing_engine; r->build_lid_matrices(r->context); - r->ucast_build_fwd_tables(r->context); - p_osm->routing_engine_used = r; - osm_ucast_mgr_set_fwd_tables(p_mgr); + failed = r->ucast_build_fwd_tables(r->context); + if (!failed) { + p_osm->routing_engine_used = r; + osm_ucast_mgr_set_fwd_tables(p_mgr); + } } if (p_osm->routing_engine_used) { @@ -1006,7 +1010,7 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr) Exit: CL_PLOCK_RELEASE(p_mgr->p_lock); OSM_LOG_EXIT(p_mgr->p_log); - return 0; + return failed; } static int ucast_build_lid_matrices(void *context) Thanks -- Jim > > -- Yevgeny > > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html