[PATCH] opensm/osm_sa_member_record.c: mlid independent MGID generator

2009-11-12 Thread Sasha Khapyorsky

If we are going to do many MGID to single MLID compression we cannot use
MLID value for unique MGID generation.

Instead use static counter and also add querying for potentially existing
multicast group with same MGID value (with 1000 attempts limit).

Signed-off-by: Sasha Khapyorsky 
---
 opensm/opensm/osm_sa_mcmember_record.c |   13 +
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/opensm/opensm/osm_sa_mcmember_record.c 
b/opensm/opensm/osm_sa_mcmember_record.c
index f6a9ead..c6856fc 100644
--- a/opensm/opensm/osm_sa_mcmember_record.c
+++ b/opensm/opensm/osm_sa_mcmember_record.c
@@ -715,8 +715,10 @@ static boolean_t mgrp_request_is_realizable(IN osm_sa_t * 
sa,
 static unsigned build_new_mgid(osm_sa_t * sa, ib_net64_t comp_mask,
   ib_member_rec_t * mcmr)
 {
+   static uint32_t uniq_count;
ib_gid_t *mgid = &mcmr->mgid;
uint8_t scope;
+   unsigned i;
 
/* use the given scope state only if requested! */
if (comp_mask & IB_MCR_COMPMASK_SCOPE)
@@ -733,11 +735,14 @@ static unsigned build_new_mgid(osm_sa_t * sa, ib_net64_t 
comp_mask,
/* HACK: use the SA port gid to make it globally unique */
memcpy(&mgid->raw[4], &sa->p_subn->opt.subnet_prefix, sizeof(uint64_t));
 
-   /* HACK: how do we get a unique number - use the mlid twice */
-   memcpy(&mgid->raw[10], &mcmr->mlid, sizeof(uint16_t));
-   memcpy(&mgid->raw[12], &mcmr->mlid, sizeof(uint16_t));
+   for (i = 0; i < 1000; i++) {
+   memcpy(&mgid->raw[10], &uniq_count, 4);
+   uniq_count++;
+   if (!osm_get_mgrp_by_mgid(sa, mgid))
+   return 1;
+   }
 
-   return 1;
+   return 0;
 }
 
 /**
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] opensm/osm_sa_mcmember_record.c: move mgid allocation code

2009-11-12 Thread Sasha Khapyorsky

Move new MGID allocation code into separate function build_new_mgid().

Signed-off-by: Sasha Khapyorsky 
---
 opensm/opensm/osm_sa_mcmember_record.c |   68 +++-
 1 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/opensm/opensm/osm_sa_mcmember_record.c 
b/opensm/opensm/osm_sa_mcmember_record.c
index 357e6ab..f6a9ead 100644
--- a/opensm/opensm/osm_sa_mcmember_record.c
+++ b/opensm/opensm/osm_sa_mcmember_record.c
@@ -712,19 +712,45 @@ static boolean_t mgrp_request_is_realizable(IN osm_sa_t * 
sa,
return TRUE;
 }
 
+static unsigned build_new_mgid(osm_sa_t * sa, ib_net64_t comp_mask,
+  ib_member_rec_t * mcmr)
+{
+   ib_gid_t *mgid = &mcmr->mgid;
+   uint8_t scope;
+
+   /* use the given scope state only if requested! */
+   if (comp_mask & IB_MCR_COMPMASK_SCOPE)
+   ib_member_get_scope_state(mcmr->scope_state, &scope, NULL);
+   else
+   /* to guarantee no collision with other subnets use local scope! */
+   scope = IB_MC_SCOPE_LINK_LOCAL;
+
+   mgid->raw[0] = 0xff;
+   mgid->raw[1] = 0x10 | scope;
+   mgid->raw[2] = 0xa0;
+   mgid->raw[3] = 0x1b;
+
+   /* HACK: use the SA port gid to make it globally unique */
+   memcpy(&mgid->raw[4], &sa->p_subn->opt.subnet_prefix, sizeof(uint64_t));
+
+   /* HACK: how do we get a unique number - use the mlid twice */
+   memcpy(&mgid->raw[10], &mcmr->mlid, sizeof(uint16_t));
+   memcpy(&mgid->raw[12], &mcmr->mlid, sizeof(uint16_t));
+
+   return 1;
+}
+
 /**
  Call this function to create a new mgrp.
 **/
 static ib_api_status_t mcmr_rcv_create_new_mgrp(IN osm_sa_t * sa,
IN ib_net64_t comp_mask,
IN const ib_member_rec_t *
-   const p_recvd_mcmember_rec,
+   p_recvd_mcmember_rec,
IN const osm_physp_t * p_physp,
OUT osm_mgrp_t ** pp_mgrp)
 {
ib_net16_t mlid;
-   uint8_t scope;
-   ib_gid_t *p_mgid;
ib_api_status_t status = IB_SUCCESS;
ib_member_rec_t mcm_rec = *p_recvd_mcmember_rec;/* copy for 
modifications */
 
@@ -743,37 +769,24 @@ static ib_api_status_t mcmr_rcv_create_new_mgrp(IN 
osm_sa_t * sa,
goto Exit;
}
 
-   OSM_LOG(sa->p_log, OSM_LOG_DEBUG,
-   "Obtained new mlid 0x%X\n", cl_ntoh16(mlid));
+   OSM_LOG(sa->p_log, OSM_LOG_DEBUG, "Obtained new mlid 0x%X\n",
+   cl_ntoh16(mlid));
 
+   mcm_rec.mlid = mlid;
/* we need to create the new MGID if it was not defined */
if (!ib_gid_is_notzero(&p_recvd_mcmember_rec->mgid)) {
/* create a new MGID */
char gid_str[INET6_ADDRSTRLEN];
 
-   /* use the given scope state only if requested! */
-   if (comp_mask & IB_MCR_COMPMASK_SCOPE)
-   ib_member_get_scope_state(p_recvd_mcmember_rec->
- scope_state, &scope, NULL);
-   else
-   /* to guarantee no collision with other subnets use 
local scope! */
-   scope = IB_MC_SCOPE_LINK_LOCAL;
-
-   p_mgid = &(mcm_rec.mgid);
-   p_mgid->raw[0] = 0xFF;
-   p_mgid->raw[1] = 0x10 | scope;
-   p_mgid->raw[2] = 0xA0;
-   p_mgid->raw[3] = 0x1B;
-
-   /* HACK: use the SA port gid to make it globally unique */
-   memcpy((&p_mgid->raw[4]),
-  &sa->p_subn->opt.subnet_prefix, sizeof(uint64_t));
-
-   /* HACK: how do we get a unique number - use the mlid twice */
-   memcpy(&p_mgid->raw[10], &mlid, sizeof(uint16_t));
-   memcpy(&p_mgid->raw[12], &mlid, sizeof(uint16_t));
+   if (!build_new_mgid(sa, comp_mask, &mcm_rec)) {
+   OSM_LOG(sa->p_log, OSM_LOG_ERROR, "ERR 1B23: "
+   "cannot allocate unique MGID value\n");
+   free_mlid(sa, mlid);
+   status = IB_SA_MAD_STATUS_NO_RESOURCES;
+   goto Exit;
+   }
OSM_LOG(sa->p_log, OSM_LOG_DEBUG, "Allocated new MGID:%s\n",
-   inet_ntop(AF_INET6, p_mgid->raw, gid_str,
+   inet_ntop(AF_INET6, mcm_rec.mgid.raw, gid_str,
  sizeof gid_str));
} else if (!validate_requested_mgid(sa, &mcm_rec)) {
/* a specific MGID was requested so validate the resulting MGID 
*/
@@ -795,7 +808,6 @@ static ib_api_status_t mcmr_rcv_create_new_mgrp(IN os

Re: [PATCH 2/2] infiniband-diags/ibqueryerrors: Properly exit and clean up resources when node info query fails

2009-11-12 Thread Sasha Khapyorsky
On 20:22 Tue 03 Nov , Ira Weiny wrote:
> 
> From: Ira Weiny 
> Date: Tue, 3 Nov 2009 20:17:14 -0800
> Subject: [PATCH] infiniband-diags/ibqueryerrors: Properly exit and clean up 
> resources when node info query fails
> 
> 
> Signed-off-by: Ira Weiny 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] infiniband-diags/ibqueryerrors: Change realloc of suppressed fields to a static array

2009-11-12 Thread Sasha Khapyorsky
On 20:22 Tue 03 Nov , Ira Weiny wrote:
> Sasha,
> 
> 2 clean up patches which apply in order.
> 
> Ira
> 
> From: Ira Weiny 
> Date: Tue, 3 Nov 2009 13:19:33 -0800
> Subject: [PATCH] infiniband-diags/ibqueryerrors: Change realloc of suppressed 
> fields to a static array
> 
>   Realloc size was wrong causing a core when enough errors were
>   suppressed.  Reproduced by running:
> 
>   ibqueryerrors -c -s 
> RcvSwRelayErrors,LinkDowned,VL15Dropped,XmtWait,SymbolErrors,LinkRecovers,RcvErrors
> 
>   Change this to a static array which also removes the need to free
>   memory.
> 
> Signed-off-by: Ira Weiny 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] infiniband-diags/ibqueryerrors: Fix remote node name printing

2009-11-12 Thread Sasha Khapyorsky
On 19:54 Tue 03 Nov , Ira Weiny wrote:
> 
> From: Ira Weiny 
> Date: Tue, 3 Nov 2009 19:51:32 -0800
> Subject: [PATCH] infiniband-diags/ibqueryerrors: Fix remote node name printing
> 
>   "-r" option was reporting the current node name, not the remote node
>   name of the port as it should have been.
> 
> Signed-off-by: Ira Weiny 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] opensm/main.c: opensm cannot be killed while asking for port guid

2009-11-12 Thread Sammy Wilborn
On Fri, Nov 13, 2009 at 04:55:27AM +0200, Sasha Khapyorsky wrote:
> On 12:41 Fri 06 Nov , Michael Reed wrote:
> > opensm enters an uninterruptible loop when the user enters
> > "-g 0" on the command line.  The only way to kill opensm is to
> > background the process and send "kill -9".
> 
> The reason is that the port chooser is running in code section where all
> signals are blocked. Which is bad in general, but resulted by not
> perfect opensm/vendor/complid inter dependencies: sub threads are
> activated (complib_init(), osm_opensm_init(), osm_opensm_bind()) with
> blocked signals. The port chooser should run in the middle of this chain
> after opensm vendor initialization (osm_opensm_init()), but before opensm
> vendor binding (osm_opensm_bind()) - it requires a GUID value.
> 
> I don't immediately know how to improve this without significant
> complib/vendor/opensm interfaces changing.
> 
> > This patch provides the user an out in get_port_guid() by
> > introducing "0" as a valid selection.  This patch also changes
> > the "Lame choice!" error message to something a bit more, uh,
> > "professional". main() is modified to test the return code.
> > This was probably a bug as a return of 0 is returned under a
> > number of different circumstances by get_port_guid().
> > 
> > Applies to 1.15 RC2.
> 
> Please next time generate patch against the current master branch.
> 
> > Signed-off-by: Michael Reed 
> 
> Applied with minor change (see below). Thanks.
> 
> > 
> > 
> > --- /tmp/OFED-1.5-rc2/main.c2009-11-06 08:56:59.089100487 -0800
> > +++ opensm/main.c   2009-11-06 09:42:34.698963811 -0800
> > @@ -434,15 +434,19 @@ static ib_net64_t get_port_guid(IN osm_o
> >i + 1, cl_ntoh64(attr_array[i].port_guid),
> >attr_array[i].lid,
> >ib_get_port_state_str(attr_array[i].link_state));
> > -   printf("\nEnter choice (1-%u): ", i);
> > +   printf("\n\t0: Exit\n");
> > +   printf("\nEnter choice (0-%u): ", i);
> > fflush(stdout);
> > if (scanf("%u", &choice) <= 0) {
> > char junk[128];
> > if (scanf("%s", junk) <= 0)
> > printf("\nError: Cannot scan!\n");
> > -   } else if (choice && choice <= num_ports)
> > +   }
> > +   else if (choice == 0)
> > +   return (0);
> > +   else if (choice <= num_ports)
> > break;
> > -   printf("\nError: Lame choice!\n");
> > +   printf("\nError: Please try again.\n");
> 
> "Try again" is good, but it is useful to provide a reason too, so let's
> do it even more "professional":
> 
>   printf("\nError: Lame choice! Please try again.\n");

How about the following?

printf("\nError: Invalid choice! Please try again.\n");

--Sammy

> 
> Sasha
> 
> > }
> > choice--;
> > printf("Choice guid=0x%" PRIx64 "\n",
> > @@ -1039,6 +1043,9 @@ int main(int argc, char *argv[])
> > if (opt.guid == 0 || cl_hton64(opt.guid) == CL_HTON64(INVALID_GUID))
> > opt.guid = get_port_guid(&osm, opt.guid);
> >  
> > +   if (opt.guid == 0)
> > +   goto Exit;
> > +
> > status = osm_opensm_bind(&osm, opt.guid);
> > if (status != IB_SUCCESS) {
> > printf("\nError from osm_opensm_bind (0x%X)\n", status);
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > the body of a message to majord...@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] opensm/main.c: opensm cannot be killed while asking for port guid

2009-11-12 Thread Sasha Khapyorsky
On 12:41 Fri 06 Nov , Michael Reed wrote:
> opensm enters an uninterruptible loop when the user enters
> "-g 0" on the command line.  The only way to kill opensm is to
> background the process and send "kill -9".

The reason is that the port chooser is running in code section where all
signals are blocked. Which is bad in general, but resulted by not
perfect opensm/vendor/complid inter dependencies: sub threads are
activated (complib_init(), osm_opensm_init(), osm_opensm_bind()) with
blocked signals. The port chooser should run in the middle of this chain
after opensm vendor initialization (osm_opensm_init()), but before opensm
vendor binding (osm_opensm_bind()) - it requires a GUID value.

I don't immediately know how to improve this without significant
complib/vendor/opensm interfaces changing.

> This patch provides the user an out in get_port_guid() by
> introducing "0" as a valid selection.  This patch also changes
> the "Lame choice!" error message to something a bit more, uh,
> "professional". main() is modified to test the return code.
> This was probably a bug as a return of 0 is returned under a
> number of different circumstances by get_port_guid().
> 
> Applies to 1.15 RC2.

Please next time generate patch against the current master branch.

> Signed-off-by: Michael Reed 

Applied with minor change (see below). Thanks.

> 
> 
> --- /tmp/OFED-1.5-rc2/main.c  2009-11-06 08:56:59.089100487 -0800
> +++ opensm/main.c 2009-11-06 09:42:34.698963811 -0800
> @@ -434,15 +434,19 @@ static ib_net64_t get_port_guid(IN osm_o
>  i + 1, cl_ntoh64(attr_array[i].port_guid),
>  attr_array[i].lid,
>  ib_get_port_state_str(attr_array[i].link_state));
> - printf("\nEnter choice (1-%u): ", i);
> + printf("\n\t0: Exit\n");
> + printf("\nEnter choice (0-%u): ", i);
>   fflush(stdout);
>   if (scanf("%u", &choice) <= 0) {
>   char junk[128];
>   if (scanf("%s", junk) <= 0)
>   printf("\nError: Cannot scan!\n");
> - } else if (choice && choice <= num_ports)
> + }
> + else if (choice == 0)
> + return (0);
> + else if (choice <= num_ports)
>   break;
> - printf("\nError: Lame choice!\n");
> + printf("\nError: Please try again.\n");

"Try again" is good, but it is useful to provide a reason too, so let's
do it even more "professional":

printf("\nError: Lame choice! Please try again.\n");

Sasha

>   }
>   choice--;
>   printf("Choice guid=0x%" PRIx64 "\n",
> @@ -1039,6 +1043,9 @@ int main(int argc, char *argv[])
>   if (opt.guid == 0 || cl_hton64(opt.guid) == CL_HTON64(INVALID_GUID))
>   opt.guid = get_port_guid(&osm, opt.guid);
>  
> + if (opt.guid == 0)
> + goto Exit;
> +
>   status = osm_opensm_bind(&osm, opt.guid);
>   if (status != IB_SUCCESS) {
>   printf("\nError from osm_opensm_bind (0x%X)\n", status);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Sasha Khapyorsky
On 15:10 Thu 12 Nov , Sean Hefty wrote:
> >
> >I'm not asking to make high level life harder :). My point is to not
> >prevent from advanced developers to use available low level too,
> >especially when such preventing requires some extra efforts.
> 
> I haven't been following the details of this thread, but it's very common to
> expose only a portion of a data structure to the user, while keeping some of 
> it
> private.

Of course we can easily find such examples, and also will find others
not less useful where all internal data is exposed (libibumad for
example).

I think that if library interface is designed so that some typical
task/tools (high or low level) cannot be implemented easily than likely
this would be example of a poor interface regardless to how public/private
data structures were separated in this library.

> As long as the lower library allocates the structure and exchanges
> pointers, the interface can be maintained.

Just to be clear - the example of "low level" data triggered this round
of the discussion is node's direct path generated during subnet
discovery (another was discovery BFS graph, but this one "was lost"
already).

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Sean Hefty
>> Making things "private" allows us to change the library without breaking the
>> interface.
>
>I don't think
>
>> Furthermore, it simplifies the interface for users who want to
>> write code at a higher level.
>
>I'm not asking to make high level life harder :). My point is to not
>prevent from advanced developers to use available low level too,
>especially when such preventing requires some extra efforts.

I haven't been following the details of this thread, but it's very common to
expose only a portion of a data structure to the user, while keeping some of it
private.  As long as the lower library allocates the structure and exchanges
pointers, the interface can be maintained.

Trying to expose what should be internal data structures through a public
interface tosses encapsulation out the window, along with any benefits that it
provides.  In the end, all you get is a poorly designed interface.

Advanced developers can use lower level interfaces, like mad or umad.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH 2/2] fix potential segfault in ibnd_node_t destroy path

2009-11-12 Thread Sasha Khapyorsky
On 10:14 Fri 06 Nov , Al Chu wrote:
> Hey Sasha,
> 
> There is a potential for the ports array in ibnd_node_t to be NULL if
> you hit an error during scan and eventually destroy the fabric struct
> before returning to the user.
> 
> Al
> 
> -- 
> Albert Chu
> ch...@llnl.gov
> Computer Scientist
> High Performance Systems Division
> Lawrence Livermore National Laboratory

> From: Albert Chu 
> Date: Thu, 5 Nov 2009 15:18:27 -0800
> Subject: [PATCH] fix potential segfault in ibnd_node_t destroy path
> 
> 
> Signed-off-by: Albert Chu 

Applied with minor change (see below). Thanks.

> ---
>  infiniband-diags/libibnetdisc/src/ibnetdisc.c |8 +---
>  1 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/infiniband-diags/libibnetdisc/src/ibnetdisc.c 
> b/infiniband-diags/libibnetdisc/src/ibnetdisc.c
> index 62dff93..7ce9a54 100644
> --- a/infiniband-diags/libibnetdisc/src/ibnetdisc.c
> +++ b/infiniband-diags/libibnetdisc/src/ibnetdisc.c
> @@ -605,10 +605,12 @@ static void destroy_node(ibnd_node_t * node)
>  {
>   int p = 0;
>  
> - for (p = 0; p <= node->numports; p++) {
> - free(node->ports[p]);
> + if (node->ports) {
> + for (p = 0; p <= node->numports; p++) {
> + free(node->ports[p]);
> + }

Removing here unneeded braces around free().

Sasha

> + free(node->ports);
>   }
> - free(node->ports);
>   free(node);
>  }
>  
> -- 
> 1.5.4.5
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH 1/2] consistently type lids and lmcs libibnetdisc

2009-11-12 Thread Sasha Khapyorsky
On 10:14 Fri 06 Nov , Al Chu wrote:
> Hey Sasha,
> 
> The ibnd_port_t lid and lmc are typed uint16_t/uint8_t, but the
> ibnd_node_t smalid and smalmc are typed int/int.  This patch turns the
> ints into uint16_t/uint8_t.
> 
> Naturally, we can do the opposite, but it should be consistent one way
> or the other.
> 
> Al
> 
> -- 
> Albert Chu
> ch...@llnl.gov
> Computer Scientist
> High Performance Systems Division
> Lawrence Livermore National Laboratory

> From: Albert Chu 
> Date: Thu, 5 Nov 2009 14:58:11 -0800
> Subject: [PATCH] consistently type lids and lmcs libibnetdisc
> 
> 
> Signed-off-by: Albert Chu 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Sasha Khapyorsky
Hi Ira,

On 10:59 Thu 12 Nov , Ira Weiny wrote:
> 
> nodesdist was remove from the public interface.  Although an "advanced" user
> might have been able to use it, the data stored there was very scan specific.
> Removing it was a good idea for 2 reasons, 1) simplify the interface,

I don't see how this conceptually simplifies interface. I think that
it started from a wrong approach about having two data structure sets -
public and private. Now it requires cleaning again and again in order to
have simpler interface.

> 2) if
> the scan algorithm changed users might have to change the way they use the
> data; not good for compatibility.

Maybe, but let him to decide. Such usage is not mandatory.

> I agree there was some usefulness, sometimes.  However the path_portid can not
> be guaranteed to be valid.

Why not? Isn't it a valid on the last scan?

> Again there are multiple issues.  First I don't
> think we want to support to this to the users.

But you don't need to support it. Advanced use is developer's
responsibility.

> Second Al is working toward is
> the ability to cache the fabric information to be read back later.  Storing
> all this "scan" specific information is going to be extra work which is
> superfluous to the layout of the fabric.

Hard to say really without seeing any code, but you can simply keep all
this scan specific information over session and have a pointer field on
ibnd_fabric structure which refer this.

> > I cannot understand why are you trying to make things there as "private"
> > as technically possible (even on price of extra code size and
> > complexity). Finally it is an open source stuff, so let to users to use
> > it how they want and for their own responsibility. :)
> 
> Making things "private" allows us to change the library without breaking the
> interface.

I don't think

> Furthermore, it simplifies the interface for users who want to
> write code at a higher level.

I'm not asking to make high level life harder :). My point is to not
prevent from advanced developers to use available low level too,
especially when such preventing requires some extra efforts.

> My original design goals were 2 fold.  1. make
> a library which speeds up the functionality of the perl script tools.  2.
> Provide a C interface to a fast scan library which can be used to implement
> further tools which would have formerly been implemented via scripts around
> ibnetdiscover.

My purposes serve (2) very well. Isn't it?

> 
> Here is one use case we have been working off of.
> 
> One of our MPI developers here is not familiar with Infiniband. He has
> written many scripts around the current suite of tools for various research
> that he does.  The concepts of nodes, ports, and links are familiar to him but
> sending a "MAD" or differentiating between a GSI MAD vs SMP is not.  I want to
> give him information about nodes, links, ports, errors, data counters, routing
> tables, etc. without making him use the MAD layer, or worse yet, umad layer.

How having 'path_portid' in node structure enforces him to use the MAD
or umad (which is simpler than mad in general, IMO) layers? It doesn't.

> In this use case he does not care that libibnetdisc does a BFS and creates a
> nodesdist structure of some sort resulting from that algorithm.  Presenting a
> "fabric" with a list of "nodes" which further have some "ports" makes sense to
> a user like this.

Again, how having access to internal discovery stuff makes a life of
such users harder?

> This use case in addition to trying to cache the data makes a simpler, cleaner
> interface much more attractive.  :-D

And there is another use case (hypothetical yet) - one of our IB
developers is familiar with infiniband... Would such "high-level"-only
interface be so attractive for him?

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Sasha Khapyorsky
Hi Al,

On 09:51 Thu 12 Nov , Al Chu wrote:
> 
> > Really? I thought that it could be a useful data for "advanced" uses.
> 
> It was removed in commit 094f922a34d6378d6a3bd1d137f90d6530685f94.  It
> was a simpler version of a patch that Ira had proposed on the mailing
> list.

This means that I'm applying patches too quickly :)

Then 'dest' is likely useless without such array, but 'portid' is not.

> > I cannot understand why are you trying to make things there as 
> > "private" as technically possible (even on price of extra code size 
> > and complexity). Finally it is an open source stuff, so let to users 
> > to use it how they want and for their own responsibility. :)
> 
> At the core of this patch (as well as some other patches I've submitted
> on libibnetdiscover before), is cleaning up the interface of
> libibnetdisc to be just the "core" of libibnetdisc.  We could stick
> anything into the public structs that could have potential usefulness,
> but at some point I think we need to limit ourselves to only the core
> stuff.  Why not add the ibmad_port to the structs?  Or instead of
> putting just the guids or lids in the structs, why not also the pkeys,
> capability masks, or VL tables?

This can be done (if needed), but will require some efforts, and this is
not what I'm asking for. I'm just proposing to not remove potentially
useful things, that is all and this is for no price.

And cleaning interface is a good thing.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: strong ordering for data registered memory

2009-11-12 Thread Caitlin Bestler






>On Thu Nov 12 12:43 , David Brean  wrote:See section 4 in 
>the paper called "High Performance RDMA Based MPI Implementation over 
>InfiniBand" on the MVAPICH web page for description of one implementation that 
>polls on data buffers.  Specifically, look at text around the statement 
>"Although the approach uses the in-order implementation of hardware for RDMA 
>write which is not specified in the InfiniBand standard, this feature is very 
>likely to be kept by different hardware designers."  Although this paper is 
>describing a PCI-X implementation, the feature is also exists on PCIe.
>
>It's assumed that the host memory interconnect complies with statements 
>described in the "Update Ordering and Granularity Provided by a Write 
>Transaction" of the PCI spec.  This particular application depends on PCI 
>WRITE behavior, not READ.
>
>Does this help?
>

A simplified way of looking at this requirement is that the last N bytes of the 
RDMA Write payload require the same ordering guarantees as a CQE would have had.

This is of course ironic since the technique was developed to avoid the 
overhead of the CQ.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] RDMA/cxgb3: remove BUG_ON() on rearm failure.

2009-11-12 Thread Steve Wise
This failure, while indicating fatal problems with the device, shouldn't
kill the system.

Signed-off-by: Steve Wise 
---

 drivers/infiniband/hw/cxgb3/cxio_hal.c |1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c 
b/drivers/infiniband/hw/cxgb3/cxio_hal.c
index 72ed339..7c05bfa 100644
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -109,7 +109,6 @@ int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq 
*cq,
while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
udelay(1);
if (i++ > 100) {
-   BUG_ON(1);
printk(KERN_ERR "%s: stalled rnic\n",
   rdev_p->dev_name);
return -EIO;

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] libibmad: Add support for new PortInfo:McastPkeyTrapSuppressionEnabled field

2009-11-12 Thread Sasha Khapyorsky
On 15:35 Fri 06 Nov , Hal Rosenstock wrote:
> 
> Per published MgtWG erratum RefID 4576 - Add PortInfo component
> to allow SM to tell port to suppress Bad P_Key traps for multicast
> limited members
> 
> Signed-off-by: Hal Rosenstock 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opensm/complib: bug in cl_list_insert_array_head/tail functions

2009-11-12 Thread Sasha Khapyorsky
On 15:28 Sun 08 Nov , Yevgeny Kliteynik wrote:
> Fixing bug in cl_list_insert_array_head/tail functions.
> In case of failure, no cleanup is done.
> 
> Signed-off-by: Yevgeny Kliteynik 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: strong ordering for data registered memory

2009-11-12 Thread David Brean

See section 4 in the paper called "High Performance RDMA Based MPI Implementation over 
InfiniBand" on the MVAPICH web page for description of one implementation that polls on data 
buffers.  Specifically, look at text around the statement "Although the approach uses the 
in-order implementation of hardware for RDMA write which is not specified in the InfiniBand 
standard, this feature is very likely to be kept by different hardware designers."  Although 
this paper is describing a PCI-X implementation, the feature is also exists on PCIe.

It's assumed that the host memory interconnect complies with statements described in the 
"Update Ordering and Granularity Provided by a Write Transaction" of the PCI 
spec.  This particular application depends on PCI WRITE behavior, not READ.

Does this help?

Jason Gunthorpe wrote:

On Wed, Nov 11, 2009 at 05:44:59PM -0500, Richard Frank wrote:


Would anyone like to through out the list of HCAs that do this... I
can guess at a few...  and can ask the vendors directly.. if not.. .

It would be much nicer to not hardcode names of adapters.. but that won't
stop us.. :)


Isn't it more complex than this? AFAIK the PCI-E standard does not
specify the order which data inside a single transfer becomes visible,
only how different transfers relate. To work on the most agressive
PCI-E system the HCA would have to transfer the last XX bytes as a
seperate PCI-E transaction without relaxed ordering.

This is the sort of thing that might start to matter on QPI and HT
memory-interleaved configurations. A multi-cache line transfer will be
split up and completed on different chips - it may not be fully
coherent 100% of the time.

Jason

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: strong ordering for data registered memory

2009-11-12 Thread David Brean
Some of the SPARC servers from Sun support an IO memory mapping 
mechanism for marking pages for weak or strong ordering.  It has no 
dependency on the HCA.  Marking memory pages for data buffers as weak 
ordered delivers significantly better throughput on IB.


I assumed that there are many more applications using the user verbs 
that don't care about ordering for data buffers, but would need to be 
modified to set a "weak ordering" flag.


Nonetheless, I'm somewhat flexible and willing to go with bit definition 
that that verbs consumers prefer.


-David

Roland Dreier wrote:

 > I decided to minimize the impact of an API change on the class of
 > applications that use the current verbs interface because those
 > applications can safely run on platforms that deliver optimal
 > performance using weak ordering for data buffers.  New binaries aren't
 > required for this class of application.
 > 
 > I thought it would be more appropriate to put the burden of added

 > complexity on the class of applications that bypass the verbs to
 > access special features in the hardware.  In fact, those applications
 > are selective about memory regions that need this special handling and
 > would register lots of memory without the "strong ordering' bit.  How
 > applications determine that the platform is capable of performing the
 > request would be beyond the scope of the verbs, however, I suppose
 > that the verbs framework could check and return an error.
 > 
 > If there are applications that expect the hardware to support "strong

 > ordering" and don't check the hardware, then these might be a problem.
 > Do any of these exists?
 > 
 > By the way, if I had proposed this bit several years ago, then I would

 > have chosen a "weak ordering" flag.  Instead, I decided to try
 > protecting the existing base of verbs-based software.

I can't really follow this.  Right now Open MPI et al assume that if
they see a Mellanox adapter, they get the "last byte of RDMA becomes
visible last" behavior.  And there is not a way that I know of to turn
this off at all, let alone get any performance difference.  The
exception being the Cell processor system that started the previous
discussion, where weak ordering at the platform level helped things.

But given that current software does seem to rely on ordering, it seems
that opting into weak ordering would break fewer applications.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] infiniband-diags/libibnetdisc: Remove nearly duplicate "get_node_info" function

2009-11-12 Thread Sasha Khapyorsky
On 14:01 Tue 10 Nov , Ira Weiny wrote:
> 
> From: Ira Weiny 
> Date: Tue, 10 Nov 2009 13:51:41 -0800
> Subject: [PATCH] infiniband-diags/libibnetdisc: Remove nearly duplicate 
> "get_node_info" function
> 
>   Fold remaining functionality into "query_port_info"
> 
> Signed-off-by: Ira Weiny 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] infiniband-diags/libibnetdisc: clean up PortInfo queries

2009-11-12 Thread Sasha Khapyorsky
On 14:01 Tue 10 Nov , Ira Weiny wrote:
> Sasha,
> 
> This and the subsequent patch clean up the port info query code to be more
> straight forward.
> 
> First off I got rid of the "decode_port_info" function as it does very little
> and combined the smp_query_via call into a "query_port_info"
> 
> The second patch removes the "get_port_info" function as it became nearly
> redundant with query_port_info.
> 
> Ira
> 
> From: Ira Weiny 
> Date: Tue, 10 Nov 2009 11:22:03 -0800
> Subject: [PATCH] infiniband-diags/libibnetdisc: clean up PortInfo queries
> 
>   remove old "decode_port_info" and replace with new "query_port_info"
> 
> Signed-off-by: Ira Weiny 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2] infiniband-diags/ibqueryerrors: Add support for PortXmitDiscardDetails

2009-11-12 Thread Sasha Khapyorsky
On 18:59 Fri 06 Nov , Sasha Khapyorsky wrote:
> On 09:42 Wed 04 Nov , Hal Rosenstock wrote:
> > 
> > When --details selected, if PortCounters has transmit discards, then
> > query PortXmitDiscardDetails. On reset, if --details is selected, then
> > PortXmitDiscardDetails will also be reset.
> > 
> > Unfortunately, PortSamplesControl:OptionMask can't be trusted to determine
> > which PortXmitDiscardDetails counters are supported so an option was added
> > for this.
> 
> Interesting... Why so? Is it any bug in existing hw? Which one?

Could you elaborate?

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: OpenSM: reporting traps 129, 130, 131

2009-11-12 Thread Sasha Khapyorsky
On 11:38 Thu 12 Nov , Hal Rosenstock wrote:
> 
> We've cared for several years now (babbling port policy was introduced
> into the master on 7/6/09) but this issue still persists. Are you
> saying no reports for these traps until the trap rate is obeyed ?

If existing trap rate filtering in OpenSM is sufficient we can add
reporting (if somebody needs this). I just don't think that we need to
introduce temporary options.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] complib/fleximap: make compar callback to return int

2009-11-12 Thread Sasha Khapyorsky

To be consistent with another similar comparator functions (used in
qsort, scandir, etc.) make fleximap comparator function to return 'int'
instead of 'long'.

Signed-off-by: Sasha Khapyorsky 
---
 opensm/complib/cl_map.c  |8 
 opensm/include/complib/cl_fleximap.h |2 +-
 opensm/opensm/osm_subnet.c   |2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/opensm/complib/cl_map.c b/opensm/complib/cl_map.c
index b49b4b8..d851bf8 100644
--- a/opensm/complib/cl_map.c
+++ b/opensm/complib/cl_map.c
@@ -1148,7 +1148,7 @@ cl_fmap_item_t *cl_fmap_get(IN const cl_fmap_t * const 
p_map,
IN const void *const p_key)
 {
cl_fmap_item_t *p_item;
-   intn_t cmp;
+   int cmp;
 
CL_ASSERT(p_map);
CL_ASSERT(p_map->state == CL_INITIALIZED);
@@ -1175,7 +1175,7 @@ cl_fmap_item_t *cl_fmap_get_next(IN const cl_fmap_t * 
const p_map,
 {
cl_fmap_item_t *p_item;
cl_fmap_item_t *p_item_found;
-   intn_t cmp;
+   int cmp;
 
CL_ASSERT(p_map);
CL_ASSERT(p_map->state == CL_INITIALIZED);
@@ -1273,7 +1273,7 @@ cl_fmap_item_t *cl_fmap_insert(IN cl_fmap_t * const p_map,
   IN cl_fmap_item_t * const p_item)
 {
cl_fmap_item_t *p_insert_at, *p_comp_item;
-   intn_t cmp = 0;
+   int cmp = 0;
 
CL_ASSERT(p_map);
CL_ASSERT(p_map->state == CL_INITIALIZED);
@@ -1575,7 +1575,7 @@ void cl_fmap_delta(IN OUT cl_fmap_t * const p_map1,
   OUT cl_fmap_t * const p_new, OUT cl_fmap_t * const p_old)
 {
cl_fmap_item_t *p_item1, *p_item2;
-   intn_t cmp;
+   int cmp;
 
CL_ASSERT(p_map1);
CL_ASSERT(p_map2);
diff --git a/opensm/include/complib/cl_fleximap.h 
b/opensm/include/complib/cl_fleximap.h
index 0af8766..ec008cf 100644
--- a/opensm/include/complib/cl_fleximap.h
+++ b/opensm/include/complib/cl_fleximap.h
@@ -181,7 +181,7 @@ typedef struct _cl_fmap_item {
 *
 * SYNOPSIS
 */
-typedef intn_t
+typedef int
 (*cl_pfn_fmap_cmp_t) (IN const void *const p_key1,
  IN const void *const p_key2);
 /*
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index cac5e94..dd72a3a 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -397,7 +397,7 @@ static const opt_rec_t opt_tbl[] = {
{0}
 };
 
-static long compar_mgids(const void *m1, const void *m2)
+static int compar_mgids(const void *m1, const void *m2)
 {
return memcmp(m1, m2, sizeof(ib_gid_t));
 }
-- 
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] opensm: cleanup intn_t uses

2009-11-12 Thread Sasha Khapyorsky

In all cases where non standard (defined in complib) intn_t and uintn_t
types were used pointer boundary was not required. So clean it up -
replace by int and unsigned int types.

Signed-off-by: Sasha Khapyorsky 
---
 opensm/include/opensm/osm_mcast_tbl.h   |2 +-
 opensm/libvendor/osm_vendor_al.c|4 ++--
 opensm/libvendor/osm_vendor_ibumad_sa.c |6 +++---
 opensm/libvendor/osm_vendor_mlx_sa.c|6 +++---
 opensm/opensm/osm_mcast_tbl.c   |   14 +-
 opensm/opensm/osm_sa_multipath_record.c |5 ++---
 opensm/opensm/osm_sa_path_record.c  |6 ++
 7 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/opensm/include/opensm/osm_mcast_tbl.h 
b/opensm/include/opensm/osm_mcast_tbl.h
index 0745b5b..37e2c26 100644
--- a/opensm/include/opensm/osm_mcast_tbl.h
+++ b/opensm/include/opensm/osm_mcast_tbl.h
@@ -174,7 +174,7 @@ void osm_mcast_tbl_delete(IN osm_mcast_tbl_t ** pp_tbl);
 *
 * SYNOPSIS
 */
-int osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN uintn_t mlid_offset);
+int osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN unsigned mlid_offset);
 /*
 * PARAMETERS
 *
diff --git a/opensm/libvendor/osm_vendor_al.c b/opensm/libvendor/osm_vendor_al.c
index abf3395..3ac05c9 100644
--- a/opensm/libvendor/osm_vendor_al.c
+++ b/opensm/libvendor/osm_vendor_al.c
@@ -489,7 +489,7 @@ Exit:
 static ib_api_status_t
 __osm_vendor_get_ca_guids(IN osm_vendor_t * const p_vend,
  IN ib_net64_t ** const p_guids,
- IN uintn_t * const p_num_guids)
+ IN unsigned * const p_num_guids)
 {
ib_api_status_t status;
 
@@ -581,7 +581,7 @@ osm_vendor_get_all_port_attr(IN osm_vendor_t * const p_vend,
ib_api_status_t status;
 
uint32_t ca;
-   uintn_t ca_count;
+   unsigned ca_count;
uint32_t port_count = 0;
uint8_t port_num;
uint32_t total_ports = 0;
diff --git a/opensm/libvendor/osm_vendor_ibumad_sa.c 
b/opensm/libvendor/osm_vendor_ibumad_sa.c
index 1a7d5a2..e05a5ef 100644
--- a/opensm/libvendor/osm_vendor_ibumad_sa.c
+++ b/opensm/libvendor/osm_vendor_ibumad_sa.c
@@ -129,9 +129,9 @@ __osmv_sa_mad_rcv_cb(IN osm_madw_t * p_madw,
if (ib_get_attr_size(p_sa_mad->attr_offset)) {
/* we used the offset value to calculate the
   number of records in here */
-   query_res.result_cnt = (uintn_t)
-   ((p_madw->mad_size - IB_SA_MAD_HDR_SIZE) /
-ib_get_attr_size(p_sa_mad->attr_offset));
+   query_res.result_cnt =
+   (p_madw->mad_size - IB_SA_MAD_HDR_SIZE) /
+   ib_get_attr_size(p_sa_mad->attr_offset);
OSM_LOG(p_bind->p_log, OSM_LOG_DEBUG,
"Count = %u = %zu / %u (%zu)\n",
query_res.result_cnt,
diff --git a/opensm/libvendor/osm_vendor_mlx_sa.c 
b/opensm/libvendor/osm_vendor_mlx_sa.c
index 0a4e050..e47f072 100644
--- a/opensm/libvendor/osm_vendor_mlx_sa.c
+++ b/opensm/libvendor/osm_vendor_mlx_sa.c
@@ -141,9 +141,9 @@ __osmv_sa_mad_rcv_cb(IN osm_madw_t * p_madw,
"__osmv_sa_mad_rcv_cb: Count = 0\n");
}
else {
-   query_res.result_cnt = (uintn_t)
-   ((p_madw->mad_size - 
IB_SA_MAD_HDR_SIZE) /
-
ib_get_attr_size(p_sa_mad->attr_offset));
+   query_res.result_cnt =
+   (p_madw->mad_size - IB_SA_MAD_HDR_SIZE) 
/
+   ib_get_attr_size(p_sa_mad->attr_offset);
osm_log(p_bind->p_log, OSM_LOG_DEBUG,
"__osmv_sa_mad_rcv_cb: "
"Count = %u = %zu / %u (%zu)\n",
diff --git a/opensm/opensm/osm_mcast_tbl.c b/opensm/opensm/osm_mcast_tbl.c
index 14f8e7a..ee59275 100644
--- a/opensm/opensm/osm_mcast_tbl.c
+++ b/opensm/opensm/osm_mcast_tbl.c
@@ -88,9 +88,7 @@ void osm_mcast_tbl_destroy(IN osm_mcast_tbl_t * p_tbl)
 void osm_mcast_tbl_set(IN osm_mcast_tbl_t * p_tbl, IN uint16_t mlid_ho,
   IN uint8_t port)
 {
-   uintn_t mlid_offset;
-   uintn_t mask_offset;
-   uintn_t bit_mask;
+   unsigned mlid_offset, mask_offset, bit_mask;
int16_t block_num;
 
CL_ASSERT(p_tbl && p_tbl->p_mask_tbl);
@@ -108,7 +106,7 @@ void osm_mcast_tbl_set(IN osm_mcast_tbl_t * p_tbl, IN 
uint16_t mlid_ho,
p_tbl->max_block_in_use = (uint16_t) block_num;
 }
 
-int osm_mcast_tbl_realloc(IN osm_mcast_tbl_t * p_tbl, IN uintn_t mlid_offset)
+int osm_mcast_tbl_realloc(IN osm_mcast_tbl

Re: [PATCH] osm_subnet.c

2009-11-12 Thread Sasha Khapyorsky
On 09:09 Mon 05 Oct , Smith, Stan wrote:
> >>
> >> --- a/opensm/opensm/osm_subnet.c2009-10-01
> >> 12:45:52.0 -0700 +++ b/opensm/opensm/osm_subnet.c
> >> 2009-10-01 14:24:18.0 -0700 @@ -397,7 +397,7 @@
> >>
> >>  /**
> >>
> >> **/
> >> -static long compar_mgids(const void *m1, const void *m2) +static
> >> intn_t compar_mgids(const void *m1, const void *m2)
> >
> > Any disagreement about changing a prototype of this method
> > (cl_pfn_fmap_cmp_t) in complib to use standard type (long) instead of
> > "homemade" and less clear one ('intn_t')?
> >
> > Sasha
> >
> >>  {
> >> return memcmp(m1, m2, sizeof(ib_gid_t));
> >>  }
> 
> How is the processor architecture 'natural' size intn_t less clear than 
> 'long'?

'intn_t' is not a standard type, but a hack defined in complib, unlike
this 'long' is a very basic C language type (btw I'm fine with 'int' for
this particular case).

> In some worlds sizeof(long) != sizeof(void*).

This is bad. Fortunately for this case we don't need a "natural" int. And
for case where it is really needed a standard types would be more
appropriate in general than complib's defined intn_t.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/core: export struct ib_port

2009-11-12 Thread Dave Olson
On Thu, 12 Nov 2009, Roland Dreier wrote:

| 
|  > |  > While this is true for SLtoVL, we create other files which are
|  > |  > device specific under the port directory too.
|  > |  > It seems like we might need to introduce a callback into the driver to
|  > |  > create the port specific sysfs files.
|  > | 
|  > | Umm, you could have said there were other things initially!
|  > 
|  > Those have been there "forever" in qib without requiring the change
|  > in the core sysfs code.  It's only sysfs group entries that require
|  > the patch to expose ib_port.
| 
| OK, I'm confused.  Ralph originally said:
| 
|  > It [struct ib_port] is used by the new ib_qib driver to expose the SL
|  > to VL table since the user level MPI library (libpsm) constructs
|  > packets including the IB header.
| 
| So if that's the only use, then I'd be in favor of just exposing the
| standard, generic SL-to-VL table info for all IB devices.  If there are
| other per-port device-specific things, then let's give a clean way for
| devices to add per-port attributes without having to know the internals
| of how the RDMA core implements sysfs stuff.

The other per-port stuff doesn't require ib_port, because it's not
using the same parent kobject.   They are separate directories with
separate files, not groups attached to the port kobject.

Ralph is correct, but the answer is somewhat misleading, in his response to
your reply.  The new SL2VL table is the only thing in the HCA driver
that needs ib_port to be exposed so far, and "likely" to be the only
thing ever.

| Yes, I agree that callbacks aren't really the best way.  I was
| suggesting passing in the per-port attributes as part of the ib_device
| structure in ib_register_device().

For us, that would be inconvenient, although we could probably make
it work.   And it's not just an attribute list, we'd then have to
export functions somehow (via a pointer table, presumably).  It all
seems pretty inflexible, and a lot more work, for no real gain.

Dave Olson
dave.ol...@qlogic.com
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ofw] Re: [PATCH] osm_subnet.c

2009-11-12 Thread Sean Hefty
>   int compar(const void *, const void *);
>
>, or to not bother at all and leave it as 'long'.
>
>Hmm, I would prefer to change to 'int' to prevent future confusing.
>Thoughts?

int seems to make the most sense to me.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ofw] Re: [PATCH] osm_subnet.c

2009-11-12 Thread Sasha Khapyorsky
On 09:20 Mon 05 Oct , Sean Hefty wrote:
> >>  /**
> >>   **/
> >> -static long compar_mgids(const void *m1, const void *m2)
> >> +static intn_t compar_mgids(const void *m1, const void *m2)
> >
> >Any disagreement about changing a prototype of this method
> >(cl_pfn_fmap_cmp_t) in complib to use standard type (long) instead of
> >"homemade" and less clear one ('intn_t')?
> >
> >Sasha
> >
> >>  {
> >> return memcmp(m1, m2, sizeof(ib_gid_t));
> >>  }
> 
> If the function is simply a wrapper around a single call to memcmp, why keep 
> it
> at all?

The function is used as comparator callback in complib's fleximap
implementation (just similar to qsort()'s compar parameter and others)
and signed integer (= , >  or < 0) should be returned. Using pointer
sized int as return value is not needed here. We can do it to match
others comparators (qsort, scandir, etc.) prototype exactly:

int compar(const void *, const void *);

, or to not bother at all and leave it as 'long'.

Hmm, I would prefer to change to 'int' to prevent future confusing.
Thoughts?

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] IB/core: export struct ib_port

2009-11-12 Thread Roland Dreier

 > |  > While this is true for SLtoVL, we create other files which are
 > |  > device specific under the port directory too.
 > |  > It seems like we might need to introduce a callback into the driver to
 > |  > create the port specific sysfs files.
 > | 
 > | Umm, you could have said there were other things initially!
 > 
 > Those have been there "forever" in qib without requiring the change
 > in the core sysfs code.  It's only sysfs group entries that require
 > the patch to expose ib_port.

OK, I'm confused.  Ralph originally said:

 > It [struct ib_port] is used by the new ib_qib driver to expose the SL
 > to VL table since the user level MPI library (libpsm) constructs
 > packets including the IB header.

So if that's the only use, then I'd be in favor of just exposing the
standard, generic SL-to-VL table info for all IB devices.  If there are
other per-port device-specific things, then let's give a clean way for
devices to add per-port attributes without having to know the internals
of how the RDMA core implements sysfs stuff.

 > | Anyway, rather than a callback, I guess we could just add a place to
 > | attach a set of port attributes to the structure that gets passed into
 > | ib_register_device() maybe?
 > 
 > Seems like major overkill to have callbacks, when all we need is to
 > get the structure that "owns" (is the parent kobject of) the directory.

Yes, I agree that callbacks aren't really the best way.  I was
suggesting passing in the per-port attributes as part of the ib_device
structure in ib_register_device().

 > | And maybe we could clean up the existing code that does
 > | device_create_file() to use a list of device attributes also...
 > 
 > Seems to be a rather different issue, to me.

Yes, it's independent.  But if we're passing in per-port attributes, we
might as well take the opportunity to make the API rational and pass in
per-device attributes too.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opensm - use C99 transportable data type for pointer storage

2009-11-12 Thread Sasha Khapyorsky
On 17:48 Mon 09 Nov , Stan C. Smith wrote:
> 
> In order to skip the #ifndef __WIN__ around #include , inttypes.h 
> was used.
> 'inttypes.h' includes stdint.h and exists in the WinOF svn tree.
> OFED opensm builds without problems on EL 5.3.
> 
> Signed-off-by: stan smith 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Ira Weiny
On Thu, 12 Nov 2009 18:31:05 +0200
Sasha Khapyorsky  wrote:

> On 10:34 Fri 06 Nov , Al Chu wrote:
> > > 
> > > Why do you want to remove this? port->path_portid can be useful for
> > > logging, specific querying, etc.. Even node->dnext can be helpful for
> > > some "advanced" use too.
> > 
> > The 'nodesdist' array (which is what the 'dnext' pointer is used for) is
> > only used during the scan and is no longer available publicly.
> 
> Really? I thought that it could be a useful data for "advanced" uses.

nodesdist was remove from the public interface.  Although an "advanced" user
might have been able to use it, the data stored there was very scan specific.
Removing it was a good idea for 2 reasons, 1) simplify the interface, 2) if
the scan algorithm changed users might have to change the way they use the
data; not good for compatibility.

> 
> > So the
> > 'dnext' pointer doesn't serve a purpose being in the public ibnd_node_t
> > struct.
> > 
> > Post scan, the 'path_portid' was ony used in ibnd_update_node(), which
> > is now removed.
> 
> You are saying about libibnetdisc itself. My example was about an
> application which uses this. For instance after discovery application
> may want to query some ports for its own purpose. What is wrong with
> that?

I agree there was some usefulness, sometimes.  However the path_portid can not
be guaranteed to be valid.  Again there are multiple issues.  First I don't
think we want to support to this to the users.  Second Al is working toward is
the ability to cache the fabric information to be read back later.  Storing
all this "scan" specific information is going to be extra work which is
superfluous to the layout of the fabric.

> 
> > In addition, Ira and I felt it is one of the fields
> > that shouldn't have been exported out of libibnetdisc, it is far too
> > "scan specific" and shouldn't be public.
> 
> I cannot understand why are you trying to make things there as "private"
> as technically possible (even on price of extra code size and
> complexity). Finally it is an open source stuff, so let to users to use
> it how they want and for their own responsibility. :)

Making things "private" allows us to change the library without breaking the
interface.  Furthermore, it simplifies the interface for users who want to
write code at a higher level.  My original design goals were 2 fold.  1. make
a library which speeds up the functionality of the perl script tools.  2.
Provide a C interface to a fast scan library which can be used to implement
further tools which would have formerly been implemented via scripts around
ibnetdiscover.

Here is one use case we have been working off of.

One of our MPI developers here is not familiar with Infiniband.  He has
written many scripts around the current suite of tools for various research
that he does.  The concepts of nodes, ports, and links are familiar to him but
sending a "MAD" or differentiating between a GSI MAD vs SMP is not.  I want to
give him information about nodes, links, ports, errors, data counters, routing
tables, etc. without making him use the MAD layer, or worse yet, umad layer.
In this use case he does not care that libibnetdisc does a BFS and creates a
nodesdist structure of some sort resulting from that algorithm.  Presenting a
"fabric" with a list of "nodes" which further have some "ports" makes sense to
a user like this.

This use case in addition to trying to cache the data makes a simpler, cleaner
interface much more attractive.  :-D

Ira

[snip]

-- 
Ira Weiny
Math Programmer/Computer Scientist
Lawrence Livermore National Lab
925-423-8008
wei...@llnl.gov
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opensm - use C99 transportable data type for pointer storage

2009-11-12 Thread Sasha Khapyorsky
On 09:32 Thu 12 Nov , Smith, Stan wrote:
> Hello,
>   Can you help me understand why this patch always seems to fall into a black 
> hole with no feedback?

Bad luck and my wishes to understand what is going on there. However the
second is less critical in this particular case - I can do it after
applying the patch.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ofw] RE: [PATCH] opensm - use C99 transportable data type forpointer storage

2009-11-12 Thread Smith, Stan
Hefty, Sean wrote:
>> Did you send it before morning coffee or after ?  :)
>>
>> Maybe, because it seems obvious.
>> All are agree with the elimination of unnecessary #ifndef's.
>> As to me, go ahead, commit it.
>
> It needs to be merged into the mgmt.git tree.
>
  #include 
 +#include 

  #ifdef __cplusplus
  #  define BEGIN_C_DECLS extern "C" {
 @@ -49,7 +50,7 @@
  #endif   /* __cplusplus */

  BEGIN_C_DECLS
 -#define st_ptr_t unsigned long
 +#define st_ptr_t uintptr_t
>
> Does anyone know the reason for st_ptr_t at all?  Why not use
> uintptr_t everywhere and avoid the define?

This is a good question. I avoided the issue due to the fact this patch has 
been submitted before with no feedback so KIS is the game plan for now.

Once the uintptr_t concept has been accepted, then a 2nd step would be to 
replace st_ptr_t with uintptr_t and remove the #define.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ofw] RE: [PATCH] opensm - use C99 transportable data type forpointer storage

2009-11-12 Thread Sean Hefty
>Did you send it before morning coffee or after ?  :)
>
>Maybe, because it seems obvious.
>All are agree with the elimination of unnecessary #ifndef's.
>As to me, go ahead, commit it.

It needs to be merged into the mgmt.git tree.

>> >  #include 
>> > +#include 
>> >
>> >  #ifdef __cplusplus
>> >  #  define BEGIN_C_DECLS extern "C" {
>> > @@ -49,7 +50,7 @@
>> >  #endif   /* __cplusplus */
>> >
>> >  BEGIN_C_DECLS
>> > -#define st_ptr_t unsigned long
>> > +#define st_ptr_t uintptr_t

Does anyone know the reason for st_ptr_t at all?  Why not use uintptr_t
everywhere and avoid the define?


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [ofw] RE: [PATCH] opensm - use C99 transportable data type forpointer storage

2009-11-12 Thread Leonid Keller
Did you send it before morning coffee or after ?  :)

Maybe, because it seems obvious. 
All are agree with the elimination of unnecessary #ifndef's.
As to me, go ahead, commit it.

> -Original Message-
> From: ofw-boun...@lists.openfabrics.org 
> [mailto:ofw-boun...@lists.openfabrics.org] On Behalf Of Smith, Stan
> Sent: Thursday, November 12, 2009 7:32 PM
> To: 'Sasha Khapyorsky'
> Cc: linux-rdma@vger.kernel.org; o...@lists.openfabrics.org
> Subject: [ofw] RE: [PATCH] opensm - use C99 transportable 
> data type forpointer storage
> 
> Hello,
>   Can you help me understand why this patch always seems to 
> fall into a black hole with no feedback?
> 
> Thank you,
> 
> Stan.
> 
> 
> Stan C. Smith wrote:
> > In order to skip the #ifndef __WIN__ around #include , 
> > inttypes.h was used. 'inttypes.h' includes stdint.h and 
> exists in the 
> > WinOF svn tree.
> > OFED opensm builds without problems on EL 5.3.
> >
> > Signed-off-by: stan smith 
> >
> > diff --git a/opensm/include/opensm/st.h 
> b/opensm/include/opensm/st.h 
> > index 30cc308..ad6c289 100644
> > --- a/opensm/include/opensm/st.h
> > +++ b/opensm/include/opensm/st.h
> > @@ -39,6 +39,7 @@
> >  #define ST_INCLUDED
> >
> >  #include 
> > +#include 
> >
> >  #ifdef __cplusplus
> >  #  define BEGIN_C_DECLS extern "C" {
> > @@ -49,7 +50,7 @@
> >  #endif   /* __cplusplus */
> >
> >  BEGIN_C_DECLS
> > -#define st_ptr_t unsigned long
> > +#define st_ptr_t uintptr_t
> >  typedef st_ptr_t st_data_t;
> >
> >  #define ST_DATA_T_DEFINED
> 
> ___
> ofw mailing list
> o...@lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Al Chu
Hey Sasha,

> Really? I thought that it could be a useful data for "advanced" uses.

It was removed in commit 094f922a34d6378d6a3bd1d137f90d6530685f94.  It
was a simpler version of a patch that Ira had proposed on the mailing
list.

> I cannot understand why are you trying to make things there as 
> "private" as technically possible (even on price of extra code size 
> and complexity). Finally it is an open source stuff, so let to users 
> to use it how they want and for their own responsibility. :)

At the core of this patch (as well as some other patches I've submitted
on libibnetdiscover before), is cleaning up the interface of
libibnetdisc to be just the "core" of libibnetdisc.  We could stick
anything into the public structs that could have potential usefulness,
but at some point I think we need to limit ourselves to only the core
stuff.  Why not add the ibmad_port to the structs?  Or instead of
putting just the guids or lids in the structs, why not also the pkeys,
capability masks, or VL tables?

Al

On Thu, 2009-11-12 at 18:31 +0200, Sasha Khapyorsky wrote:
> On 10:34 Fri 06 Nov , Al Chu wrote:
> > > 
> > > Why do you want to remove this? port->path_portid can be useful for
> > > logging, specific querying, etc.. Even node->dnext can be helpful for
> > > some "advanced" use too.
> > 
> > The 'nodesdist' array (which is what the 'dnext' pointer is used for) is
> > only used during the scan and is no longer available publicly.
> 
> Really? I thought that it could be a useful data for "advanced" uses.
> 
> > So the
> > 'dnext' pointer doesn't serve a purpose being in the public ibnd_node_t
> > struct.
> > 
> > Post scan, the 'path_portid' was ony used in ibnd_update_node(), which
> > is now removed.
> 
> You are saying about libibnetdisc itself. My example was about an
> application which uses this. For instance after discovery application
> may want to query some ports for its own purpose. What is wrong with
> that?
> 
> > In addition, Ira and I felt it is one of the fields
> > that shouldn't have been exported out of libibnetdisc, it is far too
> > "scan specific" and shouldn't be public.
> 
> I cannot understand why are you trying to make things there as "private"
> as technically possible (even on price of extra code size and
> complexity). Finally it is an open source stuff, so let to users to use
> it how they want and for their own responsibility. :)
> 
> Sasha
-- 
Albert Chu
ch...@llnl.gov
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opensm/osm_state_mgr.c: do not probe remote side of port 0

2009-11-12 Thread Sasha Khapyorsky
On 11:37 Tue 10 Nov , Yevgeny Kliteynik wrote:
> Do not probe remote side of port 0 of the switch
> that SM is running on - spare the timeout error.
> 
> Signed-off-by: Yevgeny Kliteynik 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] opensm - use C99 transportable data type for pointer storage

2009-11-12 Thread Smith, Stan
Hello,
  Can you help me understand why this patch always seems to fall into a black 
hole with no feedback?

Thank you,

Stan.


Stan C. Smith wrote:
> In order to skip the #ifndef __WIN__ around #include ,
> inttypes.h was used. 'inttypes.h' includes stdint.h and exists in the
> WinOF svn tree.
> OFED opensm builds without problems on EL 5.3.
>
> Signed-off-by: stan smith 
>
> diff --git a/opensm/include/opensm/st.h b/opensm/include/opensm/st.h
> index 30cc308..ad6c289 100644
> --- a/opensm/include/opensm/st.h
> +++ b/opensm/include/opensm/st.h
> @@ -39,6 +39,7 @@
>  #define ST_INCLUDED
>
>  #include 
> +#include 
>
>  #ifdef __cplusplus
>  #  define BEGIN_C_DECLS extern "C" {
> @@ -49,7 +50,7 @@
>  #endif   /* __cplusplus */
>
>  BEGIN_C_DECLS
> -#define st_ptr_t unsigned long
> +#define st_ptr_t uintptr_t
>  typedef st_ptr_t st_data_t;
>
>  #define ST_DATA_T_DEFINED

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] infiniband-diags/ibqueryerrors: Reformat the output of the xmtdiscard details.

2009-11-12 Thread Sasha Khapyorsky
On 14:04 Tue 10 Nov , Ira Weiny wrote:
> 
> From: Ira Weiny 
> Date: Tue, 3 Nov 2009 13:58:03 -0800
> Subject: [PATCH] infiniband-diags/ibqueryerrors: Reformat the output of the 
> xmtdiscard details.
> 
> Old:
> Errors for 0x2c9020040fec8 "Infiniscale-IV Mellanox Technologies"
>GUID 0x2c9020040fec8 port 27: [XmtDiscards == 8]
>GUID 0x2c9020040fec8 port 27: [PortNeighborMTUDiscards == 8]
> 
> New:
> Errors for 0x2c9020040fec8 "Infiniscale-IV Mellanox Technologies"
>GUID 0x2c9020040fec8 port 27: [XmtDiscards == 8] [PortNeighborMTUDiscards 
> == 8]
> 
> Signed-off-by: Ira Weiny 

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] libibmad: Add support for PortInfo:McastPkeyTrapSuppressionEnabled

2009-11-12 Thread Sasha Khapyorsky
On 14:35 Fri 06 Nov , Hal Rosenstock wrote:
> On Fri, Nov 6, 2009 at 1:02 PM, Sasha Khapyorsky  wrote:
> > On 09:10 Fri 06 Nov     , Hal Rosenstock wrote:
> >>
> >> Per published MgtWG erratum RefID 4576
> >
> > Please describe the change.
> 
> Not sure what else you are looking for here.

Not just me. Many people may look over change log. And they are not
necessary must be a full time IBTA members (and/or even not full time
linux-rdma list observers). Likely that for non-IBTA members words
"MgtWG" and "RefID 4576" will not say a lot about the change.

And for me it will be required to log into members area of IBTA web site
(which is slow) and to search in the tracker for RefID 4576. It is just
in order to review a two lines patch.

> Is it a description of
> the new field or a description of the patch which is trivial:
> "Add support for new field to PortInfo"

What is this field, what should it indicate, what is
McastPkeyTrap, what is an issue with it, etc.. Nothing of this is
covered.

> > Reference to private tracker's RefID number
> > is not enough.
> 
> It's not private as I've tried to explain before.

You said this, but you never provided any link to where it is published.
Could you?

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: OpenSM: reporting traps 129, 130, 131

2009-11-12 Thread Hal Rosenstock
On Thu, Nov 12, 2009 at 11:38 AM, Hal Rosenstock
 wrote:
> On Thu, Nov 12, 2009 at 10:42 AM, Sasha Khapyorsky  
> wrote:
>> On 08:46 Thu 12 Nov     , Hal Rosenstock wrote:
>>> >
>>> > Any particular reason why there's no reporting of these traps?
>>>
>>> AFAIK it's been this way for at least the last 5 or 6 years.
>>
>> Yes, this is what I found too tracking down git/svn history.
>>
>>> A practical consideration in changing this is that these traps are the
>>> ones for which babbling port was implemented since they do not obey
>>> the trap rate so there's a large downside to adding the reports for
>>> these. If that is to be done, then perhaps this should be done based
>>> on some option.
>>
>> I don't think that we need new option for this, instead we should care
>> that OpenSM is not overloaded by such trap floods.
>
> We've cared for several years now (babbling port policy was introduced
> into the master on 7/6/09)

I meant 7/6/07

> but this issue still persists. Are you
> saying no reports for these traps until the trap rate is obeyed ?
>
> -- Hal
>
>>
>> Sasha
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: OpenSM: reporting traps 129, 130, 131

2009-11-12 Thread Hal Rosenstock
On Thu, Nov 12, 2009 at 10:42 AM, Sasha Khapyorsky  wrote:
> On 08:46 Thu 12 Nov     , Hal Rosenstock wrote:
>> >
>> > Any particular reason why there's no reporting of these traps?
>>
>> AFAIK it's been this way for at least the last 5 or 6 years.
>
> Yes, this is what I found too tracking down git/svn history.
>
>> A practical consideration in changing this is that these traps are the
>> ones for which babbling port was implemented since they do not obey
>> the trap rate so there's a large downside to adding the reports for
>> these. If that is to be done, then perhaps this should be done based
>> on some option.
>
> I don't think that we need new option for this, instead we should care
> that OpenSM is not overloaded by such trap floods.

We've cared for several years now (babbling port policy was introduced
into the master on 7/6/09) but this issue still persists. Are you
saying no reports for these traps until the trap rate is obeyed ?

-- Hal

>
> Sasha
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [infiniband-diags] [PATCH] [2/2] split out scan specific data from ibnd_node_t

2009-11-12 Thread Sasha Khapyorsky
On 10:34 Fri 06 Nov , Al Chu wrote:
> > 
> > Why do you want to remove this? port->path_portid can be useful for
> > logging, specific querying, etc.. Even node->dnext can be helpful for
> > some "advanced" use too.
> 
> The 'nodesdist' array (which is what the 'dnext' pointer is used for) is
> only used during the scan and is no longer available publicly.

Really? I thought that it could be a useful data for "advanced" uses.

> So the
> 'dnext' pointer doesn't serve a purpose being in the public ibnd_node_t
> struct.
> 
> Post scan, the 'path_portid' was ony used in ibnd_update_node(), which
> is now removed.

You are saying about libibnetdisc itself. My example was about an
application which uses this. For instance after discovery application
may want to query some ports for its own purpose. What is wrong with
that?

> In addition, Ira and I felt it is one of the fields
> that shouldn't have been exported out of libibnetdisc, it is far too
> "scan specific" and shouldn't be public.

I cannot understand why are you trying to make things there as "private"
as technically possible (even on price of extra code size and
complexity). Finally it is an open source stuff, so let to users to use
it how they want and for their own responsibility. :)

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] librdmacm/mckey: add notifications on events

2009-11-12 Thread Sean Hefty
>The librdmacm examples serve for multiple purposes, among them user education
>on how to write rdmacm based apps and as a vehicle to test/validate/reproduce
>features/bugs/issues, for example a follow program claimed that she isn't sure
>to get a multicast error event on her application when a port goes down, so
>with my patch to mckey we were able to see that this event is generated and we
>can now do better testing. In the future mckey can be further enhanced to
>rejoin,etc on either of the events, makes sense?

ok - thanks - I'll merge it in

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: OpenSM: reporting traps 129, 130, 131

2009-11-12 Thread Sasha Khapyorsky
On 08:46 Thu 12 Nov , Hal Rosenstock wrote:
> >
> > Any particular reason why there's no reporting of these traps?
> 
> AFAIK it's been this way for at least the last 5 or 6 years.

Yes, this is what I found too tracking down git/svn history.

> A practical consideration in changing this is that these traps are the
> ones for which babbling port was implemented since they do not obey
> the trap rate so there's a large downside to adding the reports for
> these. If that is to be done, then perhaps this should be done based
> on some option.

I don't think that we need new option for this, instead we should care
that OpenSM is not overloaded by such trap floods.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: OpenSM: reporting traps 129, 130, 131

2009-11-12 Thread Hal Rosenstock
On Sun, Nov 8, 2009 at 9:37 AM, Yevgeny Kliteynik
 wrote:
> Hi Sasha,
>
> I noticed that OpenSM doesn't send InformInfo on traps 129/130/131.
> This is what osm_trap_rcv.c is doing:
>
> 322: static void trap_rcv_process_request(IN osm_sm_t * sm,
> 323:                                 IN const osm_madw_t * p_madw)
> ...
> 435:            if (ib_notice_is_generic(p_ntci) &&
> 436:                (p_ntci->g_or_v.generic.trap_num == CL_HTON16(129) ||
> 437:                 p_ntci->g_or_v.generic.trap_num == CL_HTON16(130) ||
> 438:                 p_ntci->g_or_v.generic.trap_num == CL_HTON16(131))) {
> 439:                    /* If this is a trap 129, 130, or 131 - then this is
> a
> 440:                     * trap signaling a change on a physical port.
> 441:                     * Mark the physp_change_trap flag as TRUE.
> 442:                     */
> 443:                    physp_change_trap = TRUE;
> ...
>
> 539:    /* If we reached here due to trap 129/130/131 - do not need to do
> 540:       the notice report. Just goto exit. We know this is the case
> 541:       if physp_change_trap is TRUE. */
> 542:    if (physp_change_trap == TRUE)
> 543:            goto Exit;
>
>
> Any particular reason why there's no reporting of these traps?

AFAIK it's been this way for at least the last 5 or 6 years.

A practical consideration in changing this is that these traps are the
ones for which babbling port was implemented since they do not obey
the trap rate so there's a large downside to adding the reports for
these. If that is to be done, then perhaps this should be done based
on some option.

-- Hal

>
> -- Yevgeny
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: saquery on ibsim

2009-11-12 Thread Sasha Khapyorsky
On 18:30 Thu 12 Nov , Keshetti Mahesh wrote:
> 
> Is there any limitation in ibsim which is not letting the saquery program
> retrieve all path records available ?

Yes, ibsim doesn't support large (> 256 bytes) RMPP packets yet.

Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: saquery on ibsim

2009-11-12 Thread Keshetti Mahesh
Thanks Hal for the quick reply.

> ibsim does not support RMPP.
>

Is there any plan to add RMPP support to ibsim in the future ?

-- 
Keshetti Mahesh
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: saquery on ibsim

2009-11-12 Thread Hal Rosenstock
On Thu, Nov 12, 2009 at 8:00 AM, Keshetti Mahesh
 wrote:
> I am observing difference in behavior of 'saquery' on real cluster and
> ibsim. Whenever I query for path records on cluster using
>
> #saquery -p
>
> it returns path records of all possible source-destination combination.
> But when I run the same command on a simulated cluster (ibsim) it
> returns only 2-3 path records even though there are many nodes(> 25).
>
> Is there any limitation in ibsim which is not letting the saquery program
> retrieve all path records available ?

ibsim does not support RMPP.

-- Hal

>
> --
> Keshetti Mahesh
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


saquery on ibsim

2009-11-12 Thread Keshetti Mahesh
I am observing difference in behavior of 'saquery' on real cluster and
ibsim. Whenever I query for path records on cluster using

#saquery -p

it returns path records of all possible source-destination combination.
But when I run the same command on a simulated cluster (ibsim) it
returns only 2-3 path records even though there are many nodes(> 25).

Is there any limitation in ibsim which is not letting the saquery program
retrieve all path records available ?

--
Keshetti Mahesh
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opensm/osm_state_mgr.c: force heavy sweep when fabric consists of single switch

2009-11-12 Thread Yevgeny Kliteynik

Eli Dorfman (Voltaire) wrote:

Yevgeny Kliteynik wrote:

Eli Dorfman (Voltaire) wrote:

Yevgeny Kliteynik wrote:

Eli Dorfman (Voltaire) wrote:

Yevgeny Kliteynik wrote:

Eli Dorfman (Voltaire) wrote:

Yevgeny Kliteynik wrote:

Yevgeny Kliteynik wrote:

Line Holen wrote:

On 11/ 4/09 04:54 PM, Yevgeny Kliteynik wrote:

Line Holen wrote:

On 11/ 4/09 10:47 AM, Yevgeny Kliteynik wrote:

Sasha Khapyorsky wrote:

On 12:26 Tue 03 Nov , Yevgeny Kliteynik wrote:

Always do heavy sweep when there is only one node in the
fabric, and this node is a switch, and SM runs on top of it -
there may be a race when OSM starts running before the
external ports are ports are up, or if they went through
reset while SM was starting.
In this race switch brings up the ports and turns on the
PSC bit, but OSM might get PortInfo before SwitchInfo, and it
might see all ports as down, but PSC bit on. If that happens,
OSM turns off PSC bit, and it will never see external ports
again - it won't perform any heavy sweep, only light sweep

Could such race happen when there are more than one node in a
fabric?

I think that my description of the race was misleading.
The race can happen on *any* fabric when SM runs on switch.
But when it does happen, SM thinks that the whole subnet
is just one switch - that's what it managed to discover.
I've actually seen it happening.
So the patch fixes this particular case.

So the next question that you would probably ask is can
this race happen on some *other* switch and not the one
SM is running on?

Well, I don't know. I have a hunch that it can't, but I
couldn't prove it to myself yet.

The race on the managed switch is a special case because
SM always sees port 0, and always gets responses to its
SMP queries. On any other switch, if the ports were reset,
SM won't get any response until the ports are up again.

Perhaps there might be a case where SM got some port as down,
and by the time SM got SwitchInfo with PSC bit the port
was already up, so SM won't start discovery beyond this
port. But this race would be fixed on the next heavy sweep,
when SM will discover this port that it missed the previous
time, whereas race on managed switch is fatal - SM won't
ever do any heavy sweep.

-- Yevgeny

At least for the 3.2 branch there is a general race
regardless of
where the SM is running. I haven't checked the current
master, but
I cannot recall seeing any patches related to this so I assume
the race is still there.

There is a window between SM discovering a switch and
clearing PSC
for the same switch. The SM will not detect a state change on
the
switch ports during this time.

If the port changes state during that period, the switch issues
new trap 128, which (I think) should cause SM to re-discover the
fabric once this discovery cycle is over. Is this correct?


I think the switch shall send a trap whenever it sets the PSC bit.
Once set I believe it will not send another trap until it is
reset.
Or do I misinterpret the spec ?

I may be wrong, but I thought that this is how things work:
- port state changes
- switch turns on PSC bit and starts sending traps
- SM gets the trap, sends trap repress
- switch gets trap repress and stops sending traps
- PSC is still on
- port state changes again (the same or any other port)
- switch turns on PSC bit (which doesn't matter as PSC is
  already on) and starts sending traps again
- etc...

Anyway, I'll double-check this issue.

Yep, verified.
Switch sends traps regardless the PSC bit status.
Also, the spec doesn't link them together:

  o14-5.1.1: If a switch supports Traps (PortInfo:
  CapabilityMask.IsTrap-Supported is one), its SMA
  shall send trap 128 to the SM indicated by the 
PortInfo:MasterSMLID

under any condition that   would cause SwitchInfo:PortStateChange to
be set
  to one. (See 14.2.5.4 SwitchInfo on page 827.)


Trap will be sent according to the SMLID. After first bring up the
SMLID is not set yet and trap will not be sent.
In that case the opensm would discover the change only by PSC bit.
For IS3 chips the PSC bit and/or trap were set only after one or more
ports changed their state, so I don't understand how can the SM
discover PSC bit set while all ports are down. Or is this a change in
IS4?

It can happen when SM runs on the switch, not not host.
In this case if all ports are going down, SM will see
them all down and it will see PSC bit on.

So this patch is only for SM running on a switch which is the only
node in the fabric?
I don't see the race when there is more than one switch - please
explain.

Quoting from above:

  The race can happen on *any* fabric when SM runs on switch.
  But when it does happen, SM thinks that the whole subnet
  is just one switch - that's what it managed to discover.

I saw that but I don't understand how this can happen.
If PSC bit is set after *every* port state change and
SM clears PSC bit before reading PortInfo from the switch,

osm_node_info_rcv.c, ni_rcv_process_switch():
I see in the code that SM rece

Re: ipath now and then (was [PATCH] IB/core: export struct ib_port)

2009-11-12 Thread Or Gerlitz
Ralph Campbell wrote:
> I don't understand what you are suggesting.
> The kernel module name ib_ipath and/or directory name
> drivers/infiniband/hw/ipath could be reused for some
> other purpose certainly.

In a 2nd thought, its better that you go and remove the hw/ipath directory, I 
assume the qib code could be made to serve software iboe in the same manner 
ipath can, just make sure to keep the IB L2 handling in separate files from the 
L3/L4 ones...

Or
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] librdmacm/mckey: add notifications on events

2009-11-12 Thread Or Gerlitz
Sean Hefty wrote:
> mckey is intended to be a fairly simple send/receive multicast test program.
> What's the reasoning behind adding the event handling?

The librdmacm examples serve for multiple purposes, among them user education 
on how to write rdmacm based apps and as a vehicle to test/validate/reproduce 
features/bugs/issues, for example a follow program claimed that she isn't sure 
to get a multicast error event on her application when a port goes down, so 
with my patch to mckey we were able to see that this event is generated and we 
can now do better testing. In the future mckey can be further enhanced to 
rejoin,etc on either of the events, makes sense?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html