[PATCH] OpenSM torus routing order list

2011-03-21 Thread Alex Netes
Enables defining a list of switch ports so that the SM will
go over this list when creating the routing.
It helps balance link load for some communication patterns
where multiple links connect the switches.

Signed-off-by: David McMillen <da...@systemfabricworks.com>
Signed-off-by: Alex Netes <ale...@mellanox.com>
---
 man/torus-2QoS.conf.5.in |   19 -
 opensm/osm_torus.c   |   65 +++--
 2 files changed, 79 insertions(+), 5 deletions(-)

diff --git a/man/torus-2QoS.conf.5.in b/man/torus-2QoS.conf.5.in
index 147a7b1..dd1aafb 100644
--- a/man/torus-2QoS.conf.5.in
+++ b/man/torus-2QoS.conf.5.in
@@ -62,7 +62,7 @@ see \fBUNICAST ROUTING\fR in torus-2QoS(8).
 \fIsw0_GUID sw1_GUID
 \fR
 .RS
-These keywords are used to seed the torus/mesh topolgy.
+These keywords are used to seed the torus/mesh topology.
 For example, xp_link 0x2000 0x2001 specifies that a link from
 the switch with node GUID 0x2000 to the switch with node GUID 0x2001
 would point in the positive x direction,
@@ -78,7 +78,7 @@ for torus dimensions of radix four (see \fBTOPOLOGY DISCOVERY\fR in
 torus-2QoS(8)).  For such cases both the positive and negative coordinate
 directions must be specified.
 .P
-Based on the topology specifed via the \fBtorus\fR/\fBmesh\fR keyword,
+Based on the topology specified via the \fBtorus\fR/\fBmesh\fR keyword,
 torus-2QoS will detect and log when it has insufficient seed configuration.
 .RE
 .
@@ -140,6 +140,17 @@ parameter needs to be increased.
 If this keyword appears multiple times, the last instance prevails.
 .RE
 .
+.P
+\fBport_order
+\fIp1 p2 p3 ...
+\fR
+.RS
+This keyword specifies the order in which the ports are chosen for routing.
+This keyword is optional and, if given, overrides the default order.
+It is possible to specify any subset of ports; those ports are chosen before
+the others.
+.RE
+.
 .SH EXAMPLE
 .
 \f(RC
@@ -171,6 +182,10 @@ z_dateline -1  # back to its original position.
 # on a host attached to a switch from the second seed.
 # Both instances should use this torus-2QoS.conf to ensure
 # path SL values do not change in the event of SM failover.
+
+# port_order defines the order in which the ports are
+# chosen for routing.
+port_order 7 10 8 11 9 12 25 28 26 29 27 30
 .fi
 \fR
 .
diff --git a/opensm/osm_torus.c b/opensm/osm_torus.c
index add3cf9..7723a45 100644
--- a/opensm/osm_torus.c
+++ b/opensm/osm_torus.c
@@ -287,6 +287,8 @@ struct torus {
unsigned seed_cnt, seed_idx;
unsigned x_sz, y_sz, z_sz;
 
+   unsigned port_order[IB_NODE_NUM_PORTS_MAX+1];
+
unsigned sw_pool_sz;
unsigned link_pool_sz;
unsigned seed_sz;
@@ -844,6 +846,47 @@ out:
 }
 
 static
+bool parse_port(unsigned *pnum, const char *parse_sep)
+{
+   char *val, *nextchar;
+
+   val = strtok(NULL, parse_sep);
+   if (!val)
+   return false;
+   *pnum = strtoul(val, &nextchar, 0);
+   if (*pnum > IB_NODE_NUM_PORTS_MAX)
+   *pnum = 0;
+   return true;
+}
+
+static
+bool parse_port_order(struct torus *t, const char *parse_sep)
+{
+   unsigned i, j, k, n;
+
+   for (i = 0; i < (sizeof(t->port_order) / sizeof(unsigned)); i++) {
+   if (!parse_port(&(t->port_order[i]), parse_sep))
+   break;
+   for (j = 0; j < i; j++) {
+   if (t->port_order[j] == t->port_order[i]) {
+   i--;/* Ignore duplicate port number */
+   break;
+   }
+   }
+   }
+
+   n = i;
+   for (j = 0; j < (sizeof(t->port_order) / sizeof(unsigned)); j++) {
+   for (k = 0; k < i; k++)
+   if (t->port_order[k] == j)
+   break;
+   if (k >= i) t->port_order[n++] = j;
+   }
+   }
+
+   return true;
+}
+
+static
 bool parse_pg_max_ports(struct torus *t, const char *parse_sep)
 {
char *val, *nextchar;
@@ -1018,6 +1061,8 @@ next_line:
} else if (strcmp("mesh", keyword) == 0) {
t->flags |= X_MESH | Y_MESH | Z_MESH;
kw_success = parse_torus(t, parse_sep);
+   } else if (strcmp("port_order", keyword) == 0) {
+   kw_success = parse_port_order(t, parse_sep);
} else if (strcmp("next_seed", keyword) == 0) {
kw_success = grow_seed_array(t, 1);
t->seed_cnt++;
@@ -8424,6 +8469,7 @@ bool torus_lft(struct torus *t, struct t_switch *sw)
struct port_grp *pgrp;
struct t_switch *dsw;
osm_switch_t *osm_sw;
+   unsigned order[IB_NODE_NUM_PORTS_MAX+1];
 
if (!(sw->osm_switch && sw->osm_switch->priv == sw)) {
OSM_LOG(&t->osm->log, OSM_LOG_ERROR,
@@ -8439,13 +8485,22 @@ bool torus_lft(struct torus *t, struct t_switch *sw)
dsw = t->sw_pool[s];
pgrp = &dsw->ptgrp[2 * TORUS_MAX_DIM];
 
-   for (p = 0; p < pgrp->port_cnt; p++) {
+

Re: [PATCH] OpenSM Ignore invalid command line option -t 0

2011-03-21 Thread Alex Netes
On 09:16 Sun 20 Mar , Tamir Ronen wrote:
 
 From acc805cdf65e12e2cee9bfbf360a176c55a5949f Mon Sep 17 00:00:00 2001
 From: Tamir Ronen tam...@mellanox.com
 Date: Thu, 17 Mar 2011 15:21:43 +0200
 Subject: [PATCH] Ignore command line option -t 0
 
 If the timeout specified in the command line option -t equals zero,
 print an error message and ignore it.
 ---

Applied. Thanks.

Also fixed the description in man/opensm.8.in.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Adding Congestion Control definitions.

2011-03-21 Thread Alex Netes
Adding Congestion Control definitions as described in IB spec A10
and Link Working Group Errata Q3/2010.

Signed-off-by: Alex Netes ale...@mellanox.com
---
 Makefile.am  |4 +-
 include/infiniband/mad.h |   67 +-
 src/ccm.c|  102 ++
 src/dump.c   |   25 +++
 src/fields.c |   47 +-
 src/libibmad.map |2 +
 src/mad.c|3 +-
 7 files changed, 244 insertions(+), 6 deletions(-)
 create mode 100644 src/ccm.c

diff --git a/Makefile.am b/Makefile.am
index 0a9e55d..2b3d363 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -15,8 +15,8 @@ endif
 
 libibmad_la_SOURCES = src/dump.c src/fields.c src/mad.c src/portid.c \
  src/resolve.c src/rpc.c src/sa.c src/smp.c src/gs.c \
- src/serv.c src/register.c src/vendor.c src/bm.c \
- src/mad_internal.h
+ src/serv.c src/register.c src/vendor.c src/bm.c src/ccm.c\
+  src/mad_internal.h
 
 libibmad_la_LDFLAGS = -version-info $(ibmad_api_version) \
 -export-dynamic $(libibmad_version_script)
diff --git a/include/infiniband/mad.h b/include/infiniband/mad.h
index 7571a61..1790bd6 100644
--- a/include/infiniband/mad.h
+++ b/include/infiniband/mad.h
@@ -66,6 +66,18 @@ BEGIN_C_DECLS
 #define IB_BM_DATA_SZ  (IB_MAD_SIZE - IB_BM_DATA_OFFS)
 #define IB_BM_BKEY_OFFS24
 #define IB_BM_BKEY_AND_DATA_SZ (IB_MAD_SIZE - IB_BM_BKEY_OFFS)
+#define IB_CC_DATA_OFFS 24
+#define IB_CC_DATA_SZ   (IB_MAD_SIZE - IB_CC_DATA_OFFS)
+#define CC_KEY_OFFSET  64  /* bits */
+#define LOG_RESERVED   256 /* bits */
+#define CC_KEY_BLOCK_SIZE  8
+#define CCM_SWITCH_LOG 1
+#define CCM_CA_LOG 2
+#define CCM_SWITCH_ENTRY_LIST_OFFSET   40 + CC_KEY_BLOCK_SIZE
+#define CCM_CA_ENTRY_LIST_OFFSET   8 + CC_KEY_BLOCK_SIZE
+#define CCM_SWITCH_ENTRY_LIST_LENGTH   15
+#define CCM_CA_ENTRY_LIST_LENGTH   16
+#define CCM_SWITCH_ENTRY_SIZE  12
 
 enum MAD_CLASSES {
IB_SMI_CLASS = 0x1,
@@ -140,6 +152,21 @@ enum SMI_ATTR_ID {
IB_ATTR_LAST
 };
 
+enum CC_ATTR_ID {
+   IB_CC_ATTR_CLASS_PORT_INFO = 0X01,
+   IB_CC_ATTR_NOTICE = 0X02,
+   IB_CC_ATTR_CONGESTION_INFO = 0X11,
+   IB_CC_ATTR_CONGESTION_KEY_INFO = 0X12,
+   IB_CC_ATTR_CONGESTION_LOG = 0X13,
+   IB_CC_ATTR_SWITCH_CONGESTION_SETTING = 0X14,
+   IB_CC_ATTR_SWITCH_PORT_CONGESTION_SWTTING = 0X15,
+   IB_CC_ATTR_CA_CONGESTION_SETTING = 0X16,
+   IB_CC_ATTR_CONGESTION_CONTROL_TABLE = 0X17,
+   IB_CC_ATTR_TIME_STAMP = 0X18,
+
+   IB_CC_ATTR_LAST
+};
+
 enum SA_ATTR_ID {
IB_SA_ATTR_NOTICE = 0x02,
IB_SA_ATTR_INFORMINFO = 0x03,
@@ -706,6 +733,36 @@ enum MAD_FIELDS {
IB_PSC_SAMPLES_ONLY_OPT_MASK_F,
IB_PSC_LAST_F,
 
+   IB_CC_LOG_FIRST_F,
+   IB_CC_LOG_LOGTYPE_F = IB_CC_LOG_FIRST_F,
+   IB_CC_LOG_CONGESTION_FLAGS_F,
+   IB_CC_LOG_LOG_EVENTS_COUNTER_F,
+   IB_CC_LOG_CURRENT_TIME_STAMP_F,
+   IB_CC_LOG_PORTMAP_F,
+   IB_CC_LOG_CONGESTION_ENTRY_LIST,
+   IB_CC_LOG_LAST_F,
+
+   IB_CC_LOG_EVENT_FIRST_F,
+   IB_CC_LOG_EVENT_SLID_F = IB_CC_LOG_EVENT_FIRST_F,
+   IB_CC_LOG_EVENT_DLID_F,
+   IB_CC_LOG_EVENT_SL_F,
+   IB_CC_LOG_EVENT_RESERVED_F,
+   IB_CC_LOG_EVENT_TIMESTAMP_F,
+   IB_CC_LOG_EVENT_LAST,
+
+   IB_CC_SWITCH_SET_FIRST_F,
+   IB_CC_SWITCH_SET_CONTROL_MAP_F = IB_CC_SWITCH_SET_FIRST_F,
+   IB_CC_SWITCH_SET_VICTIM_MASK_F,
+   IB_CC_SWITCH_SET_CREDIT_MASK_F,
+   IB_CC_SWITCH_SET_THRESHOLD_F,
+   IB_CC_SWITCH_SET_RESERVED1_F,
+   IB_CC_SWITCH_SET_PACKET_SIZE_F,
+   IB_CC_SWITCH_SET_CS_THRESHOLD_F,
+   IB_CC_SWITCH_SET_RESERVED2_F,
+   IB_CC_SWITCH_SET_RETURN_DELAY_F,
+   IB_CC_SWITCH_SET_MARKING_RATE_F,
+   IB_CC_SWITCH_SET_LAST_F,
+
IB_FIELD_LAST_  /* must be last */
 };
 
@@ -1013,6 +1070,14 @@ MAD_EXPORT uint8_t *performance_reset_via(void *rcvbuf, ib_portid_t * dest,
 MAD_EXPORT uint8_t *bm_call_via(void *data, ib_portid_t * portid,
ib_bm_call_t * call,
struct ibmad_port *srcport);
+/* ccm.c */
+uint8_t *cc_set_via(void *rcvbuf, ib_portid_t * dest,
+   uint8_t * cc_key, unsigned timeout,
+   unsigned id, const struct ibmad_port *srcport);
+
+uint8_t *cc_query_via(void *rcvbuf, ib_portid_t * dest, uint8_t * cc_key,
+ unsigned timeout, unsigned id,
+ const struct ibmad_port *srcport);
 
 /* dump.c */
 MAD_EXPORT ib_mad_dump_fn
@@ -1029,7 +1094,7 @@ MAD_EXPORT ib_mad_dump_fn
 mad_dump_switchinfo, mad_dump_perfcounters, mad_dump_perfcounters_ext,
 mad_dump_perfcounters_xmt_sl, mad_dump_perfcounters_rcv_sl,
 

[PULL] SRP large IO request support

2011-03-21 Thread David Dillow
Roland,

Please pull from

  git://git.kernel.org/pub/scm/linux/kernel/git/dad/srp-initiator.git external-indirect

to receive the following changes implementing support for large IO
requests through SRP.

Thanks!



David Dillow (6):
  IB/srp: always avoid non-zero offsets into an FMR
  IB/srp: move IB CM setup completion into its own function
  IB/srp: allow sg_tablesize to be set for each target
  IB/srp: rework mapping engine to use multiple FMR entries
  IB/srp: add support for indirect tables that don't fit in SRP_CMD
  IB/srp: try to use larger FMR sizes to cover our mappings

 drivers/infiniband/ulp/srp/ib_srp.c |  725 +++
 drivers/infiniband/ulp/srp/ib_srp.h |   38 ++-
 2 files changed, 514 insertions(+), 249 deletions(-)

-- 
Dave Dillow
National Center for Computational Science
Oak Ridge National Laboratory
(865) 241-6602 office


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PULL] SRP large IO request support

2011-03-21 Thread Roland Dreier
On Mon, Mar 21, 2011 at 11:02 AM, David Dillow <dillo...@ornl.gov> wrote:
> Please pull from
>
>   git://git.kernel.org/pub/scm/linux/kernel/git/dad/srp-initiator.git external-indirect

Thanks, pulled.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] OpenSM torus routing order list

2011-03-21 Thread Jim Schutt

Hi,

On Mon, 2011-03-21 at 03:58 -0600, Alex Netes wrote:
 Enables to define list of switch ports so the SM will
 go over this list when creating a routing.
 It helps balancing links load on some communication patterns
 where multipile links connect between the switches.
 
 Signed-off-by: David McMillen da...@systemfabricworks.com
 Signed-off-by: Alex Netes ale...@mellanox.com
 ---
  man/torus-2QoS.conf.5.in |   19 -
  opensm/osm_torus.c   |   65 +++--
  2 files changed, 79 insertions(+), 5 deletions(-)
 
 diff --git a/man/torus-2QoS.conf.5.in b/man/torus-2QoS.conf.5.in
 index 147a7b1..dd1aafb 100644
 --- a/man/torus-2QoS.conf.5.in
 +++ b/man/torus-2QoS.conf.5.in
 @@ -62,7 +62,7 @@ see \fBUNICAST ROUTING\fR in torus-2QoS(8).
  \fIsw0_GUID sw1_GUID
  \fR
  .RS
 -These keywords are used to seed the torus/mesh topolgy.
 +These keywords are used to seed the torus/mesh topology.
  For example, xp_link 0x2000 0x2001 specifies that a link from
  the switch with node GUID 0x2000 to the switch with node GUID 0x2001
  would point in the positive x direction,
 @@ -78,7 +78,7 @@ for torus dimensions of radix four (see \fBTOPOLOGY 
 DISCOVERY\fR in
  torus-2QoS(8)).  For such cases both the positive and negative coordinate
  directions must be specified.
  .P
 -Based on the topology specifed via the \fBtorus\fR/\fBmesh\fR keyword,
 +Based on the topology specified via the \fBtorus\fR/\fBmesh\fR keyword,
  torus-2QoS will detect and log when it has insufficient seed configuration.
  .RE
  .
 @@ -140,6 +140,17 @@ parameter needs to be increased.
  If this keyword appears multiple times, the last instance prevails.
  .RE
  .
 +.P
 +\fBport_order
 +\fIp1 p2 p3 ...
 +\fR
 +.RS
 +This keyword specifies the order on which the ports would be chosen for 
 routing.
 +This keyword is optional and if given overrides the default order.
 +It's possible to define any subset of ports that would be chosen before the
 +others.
 +.RE
 +.

This documentation needs to tell me a little more about 
how to choose port_order values.  

Something like this:

This keyword specifies the order in which CA ports on a 
destination switch are visited when computing routes.
When the fabric contains switches connected with multiple
parallel links, routes are distributed in a round-robin
fashion across such links, and so changing the order 
that CA ports are visited changes the distribution
of routes across such links.  This may be advantageous 
for some specific traffic patterns.

The default is to visit CA ports in increasing port
order on destination switches.

Duplicate values in the list will be ignored.


  .SH EXAMPLE
  .
  \f(RC
 @@ -171,6 +182,10 @@ z_dateline -1  # back to its original position.
  # on a host attached to a switch from the second seed.
  # Both instances should use this torus-2QoS.conf to ensure
  # path SL values do not change in the event of SM failover.
 +
 +# port_order defines the order on which the ports would be
 +# chosen for routing.
 +port_order 7 10 8 11 9 12 25 28 26 29 27 30
  .fi
  \fR
  .
 diff --git a/opensm/osm_torus.c b/opensm/osm_torus.c
 index add3cf9..7723a45 100644
 --- a/opensm/osm_torus.c
 +++ b/opensm/osm_torus.c
 @@ -287,6 +287,8 @@ struct torus {
   unsigned seed_cnt, seed_idx;
   unsigned x_sz, y_sz, z_sz;
  
 + unsigned port_order[IB_NODE_NUM_PORTS_MAX+1];
 +
   unsigned sw_pool_sz;
   unsigned link_pool_sz;
   unsigned seed_sz;
 @@ -844,6 +846,47 @@ out:
  }
  
  static
 +bool parse_port(unsigned *pnum, const char *parse_sep)
 +{
 + char *val, *nextchar;
 +
 + val = strtok(NULL, parse_sep);
 + if (!val)
 + return false;
 + *pnum = strtoul(val, &nextchar, 0);
 + if (*pnum > IB_NODE_NUM_PORTS_MAX)
 + *pnum = 0;

Hmmm, user configuration was just silently discarded.
Please warn to give the user a chance to correct it.

 + return true;
 +}
 +
 +static
 +bool parse_port_order(struct torus *t, const char *parse_sep)
 +{
 + unsigned i, j, k, n;
 +
 + for (i = 0; i < (sizeof(t->port_order) / sizeof(unsigned)); i++) {

Please add this (from linux kernel):
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

and use it instead for all such loops.

 + if (!parse_port(&(t->port_order[i]), parse_sep))
 + break;
 + for (j = 0; j < i; j++) {
 + if (t->port_order[j] == t->port_order[i]) {
 + i--;/* Ignore duplicate port number */

Again, please warn that user configuration was discarded.

 + break;
 + }
 + }
 + }
 +
 + n = i;
 + for (j = 0; j < (sizeof(t->port_order) / sizeof(unsigned)); j++) {
 + for (k = 0; k < i; k++)
 + if (t->port_order[k] == j)
 + break;
 + if (k >= i) t->port_order[n++] = j;

Style nit: make that last line into two lines.

 + }
 +