RE: [PATCH v5] IB/ipoib: fix dangling pointer references to ipoib_neigh and ipoib_path

2010-09-03 Thread Ralph Campbell
I haven't seen this stack trace before. Since it involves the RX
QP connection and my patch changes the TX QP connection,
I doubt my patch has any effect on this case. When I get some
time, I will look to see if I can find similar races in the RX connection
set up and tear down that might exist.

From: Pradeep Satyanarayana [prade...@linux.vnet.ibm.com]
Sent: Thursday, September 02, 2010 8:41 PM
To: Ralph Campbell
Cc: Roland Dreier; linux-rdma@vger.kernel.org
Subject: Re: [PATCH v5] IB/ipoib: fix dangling pointer references to 
ipoib_neigh and ipoib_path

Ralph,

I see the following crash sporadically (only under stress) with SLES11 SP1
(which is a 2.6.32 kernel).
I saw this crash with V4 of your patch and have not yet had a chance to try V5. 
Have you seen this
in your testing? If this is not the crash stack, can you please share what your
patch fixes?

4ib0: RX drain timing out
4idr_remove called for id=11491974 which is not allocated.
4Call Trace:
4[c00749fe33b0] [c00129e4] .show_stack+0x6c/0x198 (unreliable)
4[c00749fe3460] [c02ea594] .sub_remove+0x1ec/0x1f8
4[c00749fe3520] [c02ea5e0] .idr_remove+0x40/0xf8
4[c00749fe35b0] [d00012d84d70] .cm_destroy_id+0xa0/0x520 [ib_cm]
4[c00749fe3680] [d0001b7fb644] .ipoib_cm_free_rx_reap_list+0xd4/0x190 
[ib_ipoib]
4[c00749fe3740] [d0001b7fe404] .ipoib_cm_dev_stop+0x23c/0x360 
[ib_ipoib]
4[c00749fe3800] [d0001b7f4dbc] .ipoib_ib_dev_stop+0xe4/0x4b0 
[ib_ipoib]
4[c00749fe3960] [d0001b7f0f30] .ipoib_stop+0x88/0x178 [ib_ipoib]
4[c00749fe39f0] [c04eacf4] .dev_close+0xdc/0x148
4[c00749fe3a80] [c04ea2b8] .dev_change_flags+0x1f0/0x288
4[c00749fe3b20] [d0001b7f11b8] .ipoib_remove_one+0xb8/0x140 [ib_ipoib]
4[c00749fe3bc0] [d0001210425c] .ib_unregister_client+0xb4/0x1b8 
[ib_core]
4[c00749fe3c90] [d0001b7ffde8] .ipoib_cleanup_module+0x20/0x60 
[ib_ipoib]
4[c00749fe3d20] [c00ec408] .SyS_delete_module+0x238/0x320
4[c00749fe3e30] [c00085b4] syscall_exit+0x0/0x40
1Unable to handle kernel paging request for data at address 0x4527228d1ffb
1Faulting instruction address: 0xc05a8e88
12:mon e
cpu 0x12: Vector: 300 (Data Access) at [c00749fe3250]
pc: c05a8e88: .wait_for_common+0xb8/0x268
lr: c05a8e20: .wait_for_common+0x50/0x268
sp: c00749fe34d0
   msr: 80009032
   dar: 4527228d1ffb
 dsisr: 4200
  current = 0xc0074b4ce0e0
  paca= 0xc0f64a00
pid   = 13605, comm = modprobe
12:mon

Thanks
Pradeep


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 04/18] opensm: Track the minimum value in the fabric of data VLs supported.

2010-09-03 Thread Jim Schutt
A routing engine that wants to make contributions to SL2VL maps in support
of routing free from credit loops may need to know the minimum number
of supported data VLs in the fabric.

This code tracks that value.

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/include/opensm/osm_subnet.h |1 +
 opensm/opensm/osm_port_info_rcv.c  |   13 -
 opensm/opensm/osm_state_mgr.c  |6 ++
 opensm/opensm/osm_subnet.c |1 +
 4 files changed, 20 insertions(+), 1 deletions(-)

diff --git a/opensm/include/opensm/osm_subnet.h 
b/opensm/include/opensm/osm_subnet.h
index 95a635c..4fa0161 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -536,6 +536,7 @@ typedef struct osm_subn {
uint16_t max_mcast_lid_ho;
uint8_t min_ca_mtu;
uint8_t min_ca_rate;
+   uint8_t min_data_vls;
boolean_t ignore_existing_lfts;
boolean_t subnet_initialization_error;
boolean_t force_heavy_sweep;
diff --git a/opensm/opensm/osm_port_info_rcv.c 
b/opensm/opensm/osm_port_info_rcv.c
index 9260047..c05301e 100644
--- a/opensm/opensm/osm_port_info_rcv.c
+++ b/opensm/opensm/osm_port_info_rcv.c
@@ -83,6 +83,7 @@ static void pi_rcv_process_endport(IN osm_sm_t * sm, IN 
osm_physp_t * p_physp,
ib_api_status_t status;
ib_net64_t port_guid;
uint8_t rate, mtu;
+   unsigned data_vls;
cl_qmap_t *p_sm_tbl;
osm_remote_sm_t *p_sm;
 
@@ -92,7 +93,7 @@ static void pi_rcv_process_endport(IN osm_sm_t * sm, IN 
osm_physp_t * p_physp,
 
/* HACK extended port 0 should be handled too! */
if (osm_physp_get_port_num(p_physp) != 0) {
-   /* track the minimal endport MTU and rate */
+   /* track the minimal endport MTU, rate, and operational VLs */
mtu = ib_port_info_get_mtu_cap(p_pi);
if (mtu  sm-p_subn-min_ca_mtu) {
OSM_LOG(sm-p_log, OSM_LOG_VERBOSE,
@@ -108,6 +109,16 @@ static void pi_rcv_process_endport(IN osm_sm_t * sm, IN 
osm_physp_t * p_physp,
PRIx64 \n, rate, cl_ntoh64(port_guid));
sm-p_subn-min_ca_rate = rate;
}
+
+   data_vls = 1U  (ib_port_info_get_op_vls(p_pi) - 1);
+   if (data_vls = IB_MAX_NUM_VLS)
+   data_vls = IB_MAX_NUM_VLS - 1;
+   if ((uint8_t)data_vls  sm-p_subn-min_data_vls) {
+   OSM_LOG(sm-p_log, OSM_LOG_VERBOSE,
+   Setting endport minimal data VLs to:%u defined 
by port:0x%
+   PRIx64 \n, data_vls, cl_ntoh64(port_guid));
+   sm-p_subn-min_data_vls = data_vls;
+   }
}
 
if (port_guid != sm-p_subn-sm_port_guid) {
diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
index a3d09d8..bb60636 100644
--- a/opensm/opensm/osm_state_mgr.c
+++ b/opensm/opensm/osm_state_mgr.c
@@ -1171,6 +1171,12 @@ repeat_discovery:
sm-p_subn-force_reroute = FALSE;
sm-p_subn-subnet_initialization_error = FALSE;
 
+   /* Reset tracking values in case limiting component got removed
+* from fabric. */
+   sm-p_subn-min_ca_mtu = IB_MAX_MTU;
+   sm-p_subn-min_ca_rate = IB_MAX_RATE;
+   sm-p_subn-min_data_vls = IB_MAX_NUM_VLS - 1;
+
/* rescan configuration updates */
if (!config_parsed  osm_subn_rescan_conf_files(sm-p_subn)  0)
OSM_LOG(sm-p_log, OSM_LOG_ERROR, ERR 331A: 
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index d5c5ab2..8224b5f 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -529,6 +529,7 @@ ib_api_status_t osm_subn_init(IN osm_subn_t * p_subn, IN 
osm_opensm_t * p_osm,
p_subn-max_mcast_lid_ho = IB_LID_MCAST_END_HO;
p_subn-min_ca_mtu = IB_MAX_MTU;
p_subn-min_ca_rate = IB_MAX_RATE;
+   p_subn-min_data_vls = IB_MAX_NUM_VLS - 1;
p_subn-ignore_existing_lfts = TRUE;
 
/* we assume master by default - so we only need to set it true if 
STANDBY */
-- 
1.6.2.2


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 02/18] opensm: Allow the routing engine to influence SL2VL calculations.

2010-09-03 Thread Jim Schutt
Note that the original code assumes that QoS setup is mostly static and
based only on user configuration.  As a result, there is no provision for
routing engines that want to compute contributions to the SL2VL maps.

Fix this up by adding a callback to struct osm_routing_engine that computes
a per-port SL2VL map, and call it from the appropriate place in the QoS
setup path.  Assume that if a routing engine provides an update_sl2vl()
callback, there will be input-port dependence in the SL2VL maps, and
so do not attempt to use optimized SL2VL map programming even if the
switch supports it.

Also need to move the call to osm_qos_setup() in do_sweep() to after the
call to the routing engine, so that any SL2VL map contributions from the
routing engine are based on the latest information.  Need to call
osm_qos_setup() for requested reroute for the same reason.

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/include/opensm/osm_opensm.h |   12 
 opensm/opensm/osm_qos.c|   27 +++
 opensm/opensm/osm_state_mgr.c  |5 +++--
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/opensm/include/opensm/osm_opensm.h 
b/opensm/include/opensm/osm_opensm.h
index e97142e..25a6f90 100644
--- a/opensm/include/opensm/osm_opensm.h
+++ b/opensm/include/opensm/osm_opensm.h
@@ -126,6 +126,9 @@ struct osm_routing_engine {
int (*build_lid_matrices) (void *context);
int (*ucast_build_fwd_tables) (void *context);
void (*ucast_dump_tables) (void *context);
+   void (*update_sl2vl)(void *context, IN osm_physp_t *port,
+IN uint8_t in_port_num, IN uint8_t out_port_num,
+IN OUT ib_slvl_table_t *t);
void (*delete) (void *context);
struct osm_routing_engine *next;
 };
@@ -147,6 +150,15 @@ struct osm_routing_engine {
 *  ucast_dump_tables
 *  The callback for dumping unicast routing tables.
 *
+*  update_sl2vl(void *context, IN osm_physp_t *port,
+*   IN uint8_t in_port_num, IN uint8_t out_port_num,
+*   OUT ib_slvl_table_t *t)
+*  The callback to allow routing engine input for SL2VL maps.
+*  *port is the physical port for which the SL2VL map is to be
+*  updated. For switches, in_port_num/out_port_num identify
+*  which part of the SL2VL map to update.  For router/HCA ports,
+*  in_port_num/out_port_num should be ignored.
+*
 *  delete
 *  The delete method, may be used for routing engine
 *  internals cleanup.
diff --git a/opensm/opensm/osm_qos.c b/opensm/opensm/osm_qos.c
index c90073e..e0f4411 100644
--- a/opensm/opensm/osm_qos.c
+++ b/opensm/opensm/osm_qos.c
@@ -207,6 +207,7 @@ static int qos_extports_setup(osm_sm_t * sm, osm_node_t 
*node,
osm_physp_t *p0, *p;
unsigned force_update;
unsigned num_ports = osm_node_get_num_physp(node);
+   struct osm_routing_engine *re = sm-p_subn-p_osm-routing_engine_used;
int ret = 0;
unsigned in, out;
uint8_t op_vl1;
@@ -224,7 +225,7 @@ static int qos_extports_setup(osm_sm_t * sm, osm_node_t 
*node,
return ret;
 
if (ib_switch_info_get_opt_sl2vlmapping(node-sw-switch_info) 
-   sm-p_subn-opt.use_optimized_slvl) {
+   sm-p_subn-opt.use_optimized_slvl  !re-update_sl2vl) {
p = osm_node_get_physp_ptr(node, 1);
op_vl1 = ib_port_info_get_op_vls(p-port_info);
force_update = p-need_update || sm-p_subn-need_update;
@@ -249,10 +250,20 @@ static int qos_extports_setup(osm_sm_t * sm, osm_node_t 
*node,
p = osm_node_get_physp_ptr(node, out);
force_update = p-need_update || sm-p_subn-need_update;
/* go over all in ports */
-   for (in = 0; in  num_ports; in++)
+   for (in = 0; in  num_ports; in++) {
+   const ib_slvl_table_t *port_sl2vl = qcfg-sl2vl;
+   ib_slvl_table_t routing_sl2vl;
+
+   if (re-update_sl2vl) {
+   routing_sl2vl = *port_sl2vl;
+   re-update_sl2vl(re-context,
+p, in, out, routing_sl2vl);
+   port_sl2vl = routing_sl2vl;
+   }
if (sl2vl_update_table(sm, p, in, in  8 | out,
-  force_update, qcfg-sl2vl))
+  force_update, port_sl2vl))
ret = -1;
+   }
}
 
return ret;
@@ -262,6 +273,9 @@ static int qos_endport_setup(osm_sm_t * sm, osm_physp_t * p,
 const struct qos_config *qcfg, int vlarb_only)
 {
unsigned force_update = p-need_update || sm-p_subn-need_update;
+   struct 

[PATCH v4 12/18] opensm: Enable torus-2QoS routing engine.

2010-09-03 Thread Jim Schutt

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/include/opensm/osm_opensm.h |1 +
 opensm/opensm/main.c   |2 +-
 opensm/opensm/osm_opensm.c |6 ++
 3 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/opensm/include/opensm/osm_opensm.h 
b/opensm/include/opensm/osm_opensm.h
index fddcf53..8d63111 100644
--- a/opensm/include/opensm/osm_opensm.h
+++ b/opensm/include/opensm/osm_opensm.h
@@ -105,6 +105,7 @@ typedef enum _osm_routing_engine_type {
OSM_ROUTING_ENGINE_TYPE_FTREE,
OSM_ROUTING_ENGINE_TYPE_LASH,
OSM_ROUTING_ENGINE_TYPE_DOR,
+   OSM_ROUTING_ENGINE_TYPE_TORUS_2QOS,
OSM_ROUTING_ENGINE_TYPE_UNKNOWN
 } osm_routing_engine_type_t;
 /***/
diff --git a/opensm/opensm/main.c b/opensm/opensm/main.c
index 6e6c733..3a6d565 100644
--- a/opensm/opensm/main.c
+++ b/opensm/opensm/main.c
@@ -174,7 +174,7 @@ static void show_usage(void)
 Min Hop algorithm.  Multiple routing engines can be 
specified\n
 separated by commas so that specific ordering of 
routing\n
 algorithms will be tried if earlier routing engines 
fail.\n
-Supported engines: updn, file, ftree, lash, dor\n\n);
+Supported engines: updn, file, ftree, lash, dor, 
torus-2QoS\n\n);
printf(--do_mesh_analysis\n
 This option enables additional analysis for the 
lash\n
 routing engine to precondition switch port 
assignments\n
diff --git a/opensm/opensm/osm_opensm.c b/opensm/opensm/osm_opensm.c
index 1c865b0..a69b7bb 100644
--- a/opensm/opensm/osm_opensm.c
+++ b/opensm/opensm/osm_opensm.c
@@ -70,6 +70,7 @@ extern int osm_ucast_file_setup(struct osm_routing_engine *, 
osm_opensm_t *);
 extern int osm_ucast_ftree_setup(struct osm_routing_engine *, osm_opensm_t *);
 extern int osm_ucast_lash_setup(struct osm_routing_engine *, osm_opensm_t *);
 extern int osm_ucast_dor_setup(struct osm_routing_engine *, osm_opensm_t *);
+extern int osm_ucast_torus2QoS_setup(struct osm_routing_engine *, osm_opensm_t 
*);
 
 const static struct routing_engine_module routing_modules[] = {
{minhop, osm_ucast_minhop_setup},
@@ -78,6 +79,7 @@ const static struct routing_engine_module routing_modules[] = 
{
{ftree, osm_ucast_ftree_setup},
{lash, osm_ucast_lash_setup},
{dor, osm_ucast_dor_setup},
+   {torus-2QoS, osm_ucast_torus2QoS_setup},
{NULL, NULL}
 };
 
@@ -98,6 +100,8 @@ const char *osm_routing_engine_type_str(IN 
osm_routing_engine_type_t type)
return lash;
case OSM_ROUTING_ENGINE_TYPE_DOR:
return dor;
+   case OSM_ROUTING_ENGINE_TYPE_TORUS_2QOS:
+   return torus-2QoS;
default:
break;
}
@@ -124,6 +128,8 @@ osm_routing_engine_type_t osm_routing_engine_type(IN const 
char *str)
return OSM_ROUTING_ENGINE_TYPE_LASH;
else if (!strcasecmp(str, dor))
return OSM_ROUTING_ENGINE_TYPE_DOR;
+   else if (!strcasecmp(str, torus-2QoS))
+   return OSM_ROUTING_ENGINE_TYPE_TORUS_2QOS;
else
return OSM_ROUTING_ENGINE_TYPE_UNKNOWN;
 }
-- 
1.6.2.2


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 18/18] opensm: Cause status of unicast routing attempt to propagate to callers of osm_ucast_mgr_process().

2010-09-03 Thread Jim Schutt
If unicast routing fails, there is no point in continuing with fabric bring-up.
Just restart a new heavy sweep instead.

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/opensm/osm_state_mgr.c |   12 +---
 opensm/opensm/osm_ucast_mgr.c |   14 +-
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
index bb60636..1befbfe 100644
--- a/opensm/opensm/osm_state_mgr.c
+++ b/opensm/opensm/osm_state_mgr.c
@@ -1142,7 +1142,11 @@ static void do_sweep(osm_sm_t * sm)
/* Re-program the switches fully */
sm-p_subn-ignore_existing_lfts = TRUE;
 
-   osm_ucast_mgr_process(sm-ucast_mgr);
+   if (osm_ucast_mgr_process(sm-ucast_mgr)) {
+   OSM_LOG_MSG_BOX(sm-p_log, OSM_LOG_VERBOSE,
+   REROUTE FAILED);
+   return;
+   }
osm_qos_setup(sm-p_subn-p_osm);
 
/* Reset flag */
@@ -1313,12 +1317,14 @@ repeat_discovery:
LID ASSIGNMENT COMPLETE - STARTING SWITCH TABLE 
CONFIG);
 
/*
-* Proceed with unicast forwarding table configuration.
+* Proceed with unicast forwarding table configuration; if it fails
+* return early to wait for a trap or the next sweep interval.
 */
 
if (!sm-ucast_mgr.cache_valid ||
osm_ucast_cache_process(sm-ucast_mgr))
-   osm_ucast_mgr_process(sm-ucast_mgr);
+   if (osm_ucast_mgr_process(sm-ucast_mgr))
+   return;
 
osm_qos_setup(sm-p_subn-p_osm);
 
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index f5a715f..85495eb 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -1069,6 +1069,7 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
osm_opensm_t *p_osm;
struct osm_routing_engine *p_routing_eng;
cl_qmap_t *p_sw_guid_tbl;
+   int failed = 0;
 
OSM_LOG_ENTER(p_mgr-p_log);
 
@@ -1087,7 +1088,8 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
 
p_osm-routing_engine_used = NULL;
while (p_routing_eng) {
-   if (!ucast_mgr_route(p_routing_eng, p_osm))
+   failed = ucast_mgr_route(p_routing_eng, p_osm);
+   if (!failed)
break;
p_routing_eng = p_routing_eng-next;
}
@@ -1098,9 +1100,11 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
struct osm_routing_engine *r = p_osm-default_routing_engine;
 
r-build_lid_matrices(r-context);
-   r-ucast_build_fwd_tables(r-context);
-   p_osm-routing_engine_used = r;
-   osm_ucast_mgr_set_fwd_tables(p_mgr);
+   failed = r-ucast_build_fwd_tables(r-context);
+   if (!failed) {
+   p_osm-routing_engine_used = r;
+   osm_ucast_mgr_set_fwd_tables(p_mgr);
+   }
}
 
if (p_osm-routing_engine_used) {
@@ -1120,7 +1124,7 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
 Exit:
CL_PLOCK_RELEASE(p_mgr-p_lock);
OSM_LOG_EXIT(p_mgr-p_log);
-   return 0;
+   return failed;
 }
 
 static int ucast_build_lid_matrices(void *context)
-- 
1.6.2.2


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 17/18] opensm: Avoid havoc in dump_ucast_routes() caused by torus-2QoS persistent use of osm_port_t:priv.

2010-09-03 Thread Jim Schutt
Torus-2QoS makes persistent use of osm_port_t:priv to speed calculation
of path SL values.

However, osm_switch_recommend_path() uses a non-NULL osm_port_t:priv
as a flag that osm_port_t:priv holds a tracking array used when
LMC > 0.  It turns out that 1) dump_ucast_routes() does not need
osm_switch_recommend_path() to consider alternate routes, and 2)
before the addition of torus-2QoS, osm_port_t:priv use never
persisted past the unicast routing function, so it was always
NULL on entry to dump_ucast_routes().

Fix this up by making the routing_for_lmc flag explicitly set by
the caller of osm_switch_recommend_path(), rather than inferring
it from osm_port_t:priv.  This retains existing behavior for
existing routing engines, and allows torus-2QoS to make persistent
use of osm_port_t:priv.

The alternative would be to add another member to osm_port_t,
say osm_port_t:priv2.

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/include/opensm/osm_switch.h |   12 
 opensm/opensm/osm_dump.c   |2 +-
 opensm/opensm/osm_switch.c |7 ---
 opensm/opensm/osm_ucast_mgr.c  |1 +
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/opensm/include/opensm/osm_switch.h 
b/opensm/include/opensm/osm_switch.h
index 51a8427..f407dd9 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -918,6 +918,7 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * 
p_sw,
  IN osm_port_t * p_port, IN uint16_t lid_ho,
  IN unsigned start_from,
  IN boolean_t ignore_existing,
+ IN boolean_t routing_for_lmc,
  IN boolean_t dor);
 /*
 * PARAMETERS
@@ -940,6 +941,17 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * 
p_sw,
 *  If false, the switch will choose an existing route if one
 *  exists, otherwise will choose the optimal route.
 *
+*  routing_for_lmc
+*  [in] We support an enhanced LMC aware routing mode:
+*  In the case of LMC > 0, we can track the remote side
+*  system and node for all of the lids of the target
+*  and try and avoid routing again through the same
+*  system / node.
+*
+*  Assume if routing_for_lmc is TRUE that this procedure
+*  was provided with the tracking array and counter via
+*  p_port-priv, and we can conduct this algorithm.
+*
 *  dor
 *  [in] If TRUE, Dimension Order Routing will be done.
 *
diff --git a/opensm/opensm/osm_dump.c b/opensm/opensm/osm_dump.c
index bfff1a0..535a03f 100644
--- a/opensm/opensm/osm_dump.c
+++ b/opensm/opensm/osm_dump.c
@@ -221,7 +221,7 @@ static void dump_ucast_routes(cl_map_item_t * item, FILE * 
file, void *cxt)
/* No LMC Optimization */
best_port = osm_switch_recommend_path(p_sw, p_port,
  lid_ho, 1, TRUE,
- dor);
+ FALSE, dor);
fprintf(file, No %u hop path possible via port %u!,
best_hops, best_port);
}
diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
index b621852..9785a9d 100644
--- a/opensm/opensm/osm_switch.c
+++ b/opensm/opensm/osm_switch.c
@@ -216,6 +216,7 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * 
p_sw,
  IN osm_port_t * p_port, IN uint16_t lid_ho,
  IN unsigned start_from,
  IN boolean_t ignore_existing,
+ IN boolean_t routing_for_lmc,
  IN boolean_t dor)
 {
/*
@@ -225,10 +226,10 @@ uint8_t osm_switch_recommend_path(IN const osm_switch_t * 
p_sw,
   and try and avoid routing again through the same
   system / node.
 
-  If this procedure is provided with the tracking array
-  and counter we can conduct this algorithm.
+  Assume if routing_for_lmc is true that this procedure was
+  provided the tracking array and counter via p_port-priv,
+  and we can conduct this algorithm.
 */
-   boolean_t routing_for_lmc = (p_port-priv != NULL);
uint16_t base_lid;
uint8_t hops;
uint8_t least_hops;
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index e6e40f0..f5a715f 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -252,6 +252,7 @@ static void ucast_mgr_process_port(IN osm_ucast_mgr_t * 
p_mgr,
 */
port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from,
 

[PATCH v4 15/18] opensm: Make it possible to configure no fallback routing engine.

2010-09-03 Thread Jim Schutt
For a fabric that requires routing with an engine with special properties,
say avoiding credit loops via making use of SLs in routing, it might
be preferable to not fall back to minhop if the configured routing engine
fails.

E.g. the torus-2QoS routing engine uses both SL2VL maps and path SL values
to provide routing free of credit loops, but cannot route fabrics for
some patterns of failed switches.  Should a switch fail that creates such
a pattern, it may be preferable to keep the previous routing information
loaded in the switches until a switch can be replaced that restores
torus-2QoS's ability to route the fabric.

The alternative, having some other engine route the fabric, will immediately
introduce credit loops.

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/include/opensm/osm_subnet.h |1 +
 opensm/opensm/osm_opensm.c |5 +
 opensm/opensm/osm_qos.c|6 ++
 opensm/opensm/osm_ucast_mgr.c  |   23 +++
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/opensm/include/opensm/osm_subnet.h 
b/opensm/include/opensm/osm_subnet.h
index fa3e46e..42ae416 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -219,6 +219,7 @@ typedef struct osm_subn_opt {
osm_qos_options_t qos_rtr_options;
boolean_t enable_quirks;
boolean_t no_clients_rereg;
+   boolean_t no_fallback_routing_engine;
 #ifdef ENABLE_OSM_PERF_MGR
boolean_t perfmgr;
boolean_t perfmgr_redir;
diff --git a/opensm/opensm/osm_opensm.c b/opensm/opensm/osm_opensm.c
index a69b7bb..82aa987 100644
--- a/opensm/opensm/osm_opensm.c
+++ b/opensm/opensm/osm_opensm.c
@@ -159,6 +159,11 @@ static struct osm_routing_engine 
*setup_routing_engine(osm_opensm_t *osm,
struct osm_routing_engine *re;
const struct routing_engine_module *m;
 
+   if (!strcmp(name, no_fallback)) {
+   osm-subn.opt.no_fallback_routing_engine = TRUE;
+   return NULL;
+   }
+
for (m = routing_modules; m-name  *m-name; m++) {
if (!strcmp(m-name, name)) {
re = malloc(sizeof(struct osm_routing_engine));
diff --git a/opensm/opensm/osm_qos.c b/opensm/opensm/osm_qos.c
index 204c69c..ab55918 100644
--- a/opensm/opensm/osm_qos.c
+++ b/opensm/opensm/osm_qos.c
@@ -212,6 +212,12 @@ static int qos_extports_setup(osm_sm_t * sm, osm_node_t 
*node,
unsigned in, out;
uint8_t op_vl1;
 
+   /*
+* Do nothing unless the most recent routing attempt was successful.
+*/
+   if (!re)
+   return ret;
+
for (out = 1; out  num_ports; out++) {
p = osm_node_get_physp_ptr(node, out);
force_update = p-need_update || sm-p_subn-need_update;
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index 10629cb..d1c485f 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -1091,7 +1091,8 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
p_routing_eng = p_routing_eng-next;
}
 
-   if (!p_osm-routing_engine_used) {
+   if (!p_osm-routing_engine_used 
+   p_osm-subn.opt.no_fallback_routing_engine != TRUE) {
/* If configured routing algorithm failed, use default MinHop */
struct osm_routing_engine *r = p_osm-default_routing_engine;
 
@@ -1101,14 +1102,20 @@ int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
osm_ucast_mgr_set_fwd_tables(p_mgr);
}
 
-   OSM_LOG(p_mgr-p_log, OSM_LOG_INFO,
-   %s tables configured on all switches\n,
-   osm_routing_engine_type_str(p_osm-
-   routing_engine_used-type));
-
-   if (p_mgr-p_subn-opt.use_ucast_cache)
-   p_mgr-cache_valid = TRUE;
+   if (p_osm-routing_engine_used) {
+   OSM_LOG(p_mgr-p_log, OSM_LOG_INFO,
+   %s tables configured on all switches\n,
+   osm_routing_engine_type_str(p_osm-
+   routing_engine_used-type));
 
+   if (p_mgr-p_subn-opt.use_ucast_cache)
+   p_mgr-cache_valid = TRUE;
+   } else {
+   p_mgr-p_subn-subnet_initialization_error = TRUE;
+   OSM_LOG(p_mgr-p_log, OSM_LOG_ERROR,
+   No routing engine able to successfully configure 
+switch tables on current fabric\n);
+   }
 Exit:
CL_PLOCK_RELEASE(p_mgr-p_lock);
OSM_LOG_EXIT(p_mgr-p_log);
-- 
1.6.2.2


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 14/18] opensm: Do not require -Q option for torus-2QoS routing engine.

2010-09-03 Thread Jim Schutt
The torus-2QoS engine provides a deadlock-free routing for a 2D/3D torus,
but requires that switch SL2VL maps be programmed.  Before this change,
opensm -Q was required for that to happen.

When a routing engine sets the struct osm_routing_engine:update_sl2vl
pointer, it is signalling its intent to participate in SL2VL map programming.
So, don't return early from osm_qos_setup() in that case; instead do everything
except attempt to read QoS configuration information.

For that to work properly, need to also always set up the default QoS config
information, instead of just when QoS is requested via -Q.

With that in place, the -Q option now means the same thing to torus-2QoS that
it means to other routing engines: QoS configuration is requested.

Signed-off-by: Jim Schutt jasc...@sandia.gov
---
 opensm/opensm/osm_qos.c|7 +--
 opensm/opensm/osm_subnet.c |   18 +-
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/opensm/opensm/osm_qos.c b/opensm/opensm/osm_qos.c
index e0f4411..204c69c 100644
--- a/opensm/opensm/osm_qos.c
+++ b/opensm/opensm/osm_qos.c
@@ -308,7 +308,9 @@ int osm_qos_setup(osm_opensm_t * p_osm)
int ret = 0;
int vlarb_only;
 
-   if (!p_osm-subn.opt.qos)
+   if (!(p_osm-subn.opt.qos ||
+ (p_osm-routing_engine_used 
+  p_osm-routing_engine_used-update_sl2vl)))
return 0;
 
OSM_LOG_ENTER(p_osm-log);
@@ -325,7 +327,8 @@ int osm_qos_setup(osm_opensm_t * p_osm)
cl_plock_excl_acquire(p_osm-lock);
 
/* read QoS policy config file */
-   osm_qos_parse_policy_file(p_osm-subn);
+   if (p_osm-subn.opt.qos)
+   osm_qos_parse_policy_file(p_osm-subn);
 
p_tbl = p_osm-subn.port_guid_tbl;
p_next = cl_qmap_head(p_tbl);
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index bc34a0f..f714af7 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -1051,6 +1051,8 @@ static void subn_verify_qos_set(osm_qos_options_t *set, 
const char *prefix,
 
 int osm_subn_verify_config(IN osm_subn_opt_t * p_opts)
 {
+   osm_qos_options_t dflt;
+
if (p_opts-lmc  7) {
log_report( Invalid Cached Option Value:lmc = %u:
   Using Default:%u\n, p_opts-lmc, OSM_DEFAULT_LMC);
@@ -1101,17 +1103,15 @@ int osm_subn_verify_config(IN osm_subn_opt_t * p_opts)
p_opts->console = OSM_DEFAULT_CONSOLE;
}
 
-   if (p_opts->qos) {
-   osm_qos_options_t dflt;
-
-   /* the default options in qos_options must be correct.
-* every other one need not be, b/c those will default
-* back to whatever is in qos_options.
-*/
 
-   subn_set_default_qos_options(&dflt);
+   /* the default options in qos_options must be correct.
+* every other one need not be, b/c those will default
+* back to whatever is in qos_options.
+*/
+   subn_set_default_qos_options(&dflt);
+   subn_verify_qos_set(&p_opts->qos_options, "qos", &dflt);
 
-   subn_verify_qos_set(&p_opts->qos_options, "qos", &dflt);
+   if (p_opts->qos) {
subn_verify_qos_set(&p_opts->qos_ca_options, "qos_ca",
&p_opts->qos_options);
subn_verify_qos_set(&p_opts->qos_sw0_options, "qos_sw0",
-- 
1.6.2.2


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: OFED 1.5.2: libibumad.so version bump

2010-09-03 Thread Hal Rosenstock
On Thu, Sep 2, 2010 at 5:05 PM, Ira Weiny wei...@llnl.gov wrote:
 On Thu, 2 Sep 2010 13:35:44 -0700
 Hal Rosenstock hal.rosenst...@gmail.com wrote:

 Hi Ira,

 On Tue, Aug 31, 2010 at 3:49 PM, Ira Weiny wei...@llnl.gov wrote:
  On Tue, 31 Aug 2010 02:27:29 -0700
  Yevgeny Kliteynik klit...@gmail.com wrote:
 
  Hi all,
 
  In order to support RoCEE, a while ago I've added
  a new field to umad, thus introduced an ABI change.
 
  There already was a discussion on the linux-rdma list,
  but due to the proximity of the upcoming OFED 1.5.2
  release these concerns were raised again.
 
  So my question is, other than *general* concerns about
  changing ABI, is anybody aware of the *actual* problem
  that will be caused by this? Any customer/3rd party
  solution that would be affected by this?
 
  Because our MVAPICH depends on umad, libibumad.so.1 to be exact.[*]  These ABI
  changes (to v2 and v3) would have forced our users to recompile their codes.
  We are maintaining the old ABI here until our next major release of CHAOS[#]
  to prevent this.
 
  I think the thing to remember is that many people are using Open Fabrics
  software, but are _not_ using OFED.  What is tested with OFED is not the
  only thing which might be using these libraries.  Our version of MVAPICH is a
  good example.
 
  I am certainly not the expert in this area and I know that many people have
  tried to make this point in the past, but I will say it here again.  Each of
  these Open Fabrics packages _must_ be maintained to stand on their own.
  Roland did this a long time ago with ibverbs.
 
  I think now is a good time to start discussing breaking up the management git
  tree so that these libraries can live on their own.

 How does breaking up the management git tree help with this issue ?

 Creating and tracking of branches to maintain ABI would be easier with
 separate git trees.  Furthermore, separate trees will help force the use of
 consistent ABI's and interfaces.

 For example, if I currently want OpenSM version 3.3.6 I get a management tree
 with version libibumad 1.3.5.  But this last ABI change to umad was only
 required for the latest infiniband-diags (ibstat utility).  Why do I get all
 this cruft when pulling the latest OpenSM?

That's a totally different issue than which packages a particular OFED
release picks up.


 To me, that's the admin part and is separate from the ABI issue
 raised.

 Yes it is separate.  That is why I created another thread to discuss those
 issues.


 The ABI compatibility is not achieved by administrative means
 (separate repos, etc.) but rather by review and discipline to
 achieve this as an immutable goal.

 I agree that ABI compatibility will require more discipline.  That is what
 made me think of the separate git trees.  I feel it will be _easier_ to
 maintain this discipline when the trees are separate.

Call me a skeptic but I think the same thing would've occurred with
separate git trees.

I have no real preference one way or the other. In fact, this was
discussed in the early days which are long forgotten. The libraries
are small and umad is a lot more stable than mad. It would just mean a
lot of busy work for everyone with internal trees.

-- Hal


 Ira


 -- Hal

  I will write a separate email regarding this.
 
  Ira
 
  [*] We are looking into removing the dependency.
  [#] Shameless plug: 
  http://code.google.com/p/chaos-release/wiki/CHAOS_Description
 
 
  -- Yevgeny
  --
  To unsubscribe from this list: send the line unsubscribe linux-rdma in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
 
  --
  Ira Weiny
  Math Programmer/Computer Scientist
  Lawrence Livermore National Lab
  925-423-8008
  wei...@llnl.gov
  --
  To unsubscribe from this list: send the line unsubscribe linux-rdma in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
 



 --
 Ira Weiny
 Math Programmer/Computer Scientist
 Lawrence Livermore National Lab
 925-423-8008
 wei...@llnl.gov

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Errors from ibchecknet

2010-09-03 Thread Chuck Hartley
I checked another working fabric here and also see the same warnings,
so it looks like the warnings are not really a problem.

Well, I assume that it is just IPoIB that isn't working. Since ibping
works, I believe that says the IB part is ok. Of course, I can't run
any of the perftools since they all need IPoIB to resolve the host IP.

Do you have any suggestions of what to check to diagnose the IPoIB
problem?  Specifically, can you think of any interaction with the
normal networking stuff in the kernel that might be misconfigured?
The reason I mention that is because I rebuilt/installed OFED (no
errors/warnings) and it is in its default configuration, which is
running well on other similar fabrics here.  Therefore I assume the
problem must be with the non-OFED stuff. Previously, whenever this
kind of problem cropped up it has always been because opensm was not
running. I did check that iptables was off, so it isn't a firewall
issue.

- Chuck


On Thu, Sep 2, 2010 at 4:16 PM, Ira Weiny wei...@llnl.gov wrote:
 On Thu, 2 Sep 2010 11:11:13 -0700
 Chuck Hartley hartlc...@gmail.com wrote:

 Sure, here is the output:
 Note this is with the switch we swapped in, so the port numbers don't
 match the ibchecknet output in the original message.

 # ibstat
 CA 'mlx4_0'
       CA type: MT26428
       Number of ports: 2
       Firmware version: 2.6.0
       Hardware version: a0
       Node GUID: 0x0002c90300032de0
       System image GUID: 0x0002c90300032de3
       Port 1:
               State: Active
               Physical state: LinkUp
               Rate: 40
               Base lid: 6
               LMC: 0
               SM lid: 6

 Well the SM lid is set here.  Is it set on the other nodes?

 I don't run ibchecknet usually but I am getting the same errors here on a
 working fabric...

 ibwarn: [13629] dump_perfcounters: PortXmitWait not indicated so ignore this 
 counter
 #warn: Lid is not configured lid 37 port 2
 #warn: SM Lid is not configured
 Port check lid 37 port 2:  FAILED

 Looking at this output I don't think this is an error.

 13:17:14  smpquery nodeinfo 37
 # Node info: Lid 37
 BaseVers:1
 ClassVers:...1
 NodeType:Switch
 NumPorts:24
 ...

 On switch external Ports the Lid and SMLid are not used.

 Hal, would you concur?

 Chuck,
 Is it just that IPoIB is not working for you?

 Ira


               Capability mask: 0x0251086a
               Port GUID: 0x0002c90300032de1
       Port 2:
               State: Down
               Physical state: Polling
               Rate: 10
               Base lid: 0
               LMC: 0
               SM lid: 0
               Capability mask: 0x02510868
               Port GUID: 0x0002c90300032de2
 CA 'mthca0'
       CA type: MT25204
       Number of ports: 1
       Firmware version: 1.2.0
       Hardware version: a0
       Node GUID: 0x003048c64c0c
       System image GUID: 0x003048c64c0c0003
       Port 1:
               State: Down
               Physical state: Polling
               Rate: 10
               Base lid: 0
               LMC: 0
               SM lid: 0
               Capability mask: 0x02510a68
               Port GUID: 0x003048c64c0c0001

 # iblinkinfo
 Switch 0x0002c9020041a7a0 Infiniscale-IV Mellanox Technologies:
            1    1[  ] ==( 4X 10.0 Gbps Active/  LinkUp)==       5
 1[  ]  HCA-1 ( )
            1    2[  ] ==( 4X 10.0 Gbps Active/  LinkUp)==       6
 1[  ] linux70 HCA-1 ( )
            1    3[  ] ==( 4X 10.0 Gbps Active/  LinkUp)==       7
 1[  ] linux71 HCA-1 ( )
            1    4[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1    5[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1    6[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1    7[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1    8[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1    9[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   10[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   11[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   12[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   13[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   14[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   15[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   16[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   17[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   18[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   19[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   20[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   21[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   22[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   23[  ] ==( 4X 2.5 Gbps   Down/ Polling)==
 [  ]  ( )
            1   24[