[PATCH 1/3 V2] opensm: connect switches in tree - added new option

2009-10-28 Thread Yevgeny Kliteynik
Adding new option: '--connect_switches'
This option should connect more switches with
down/up routes in up/down and fat-tree routing.

Changes from v1:
 - added option description in osm_subnet.h
 - added loading option from options file in osm_subnet.c

Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il
---
 opensm/include/opensm/osm_subnet.h |   10 --
 opensm/man/opensm.8.in |   11 +--
 opensm/opensm/main.c   |   15 +--
 opensm/opensm/osm_subnet.c |7 +++
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/opensm/include/opensm/osm_subnet.h 
b/opensm/include/opensm/osm_subnet.h
index b63c97e..095b294 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -189,6 +189,7 @@ typedef struct osm_subn_opt {
char *routing_engine_names;
boolean_t use_ucast_cache;
boolean_t connect_roots;
+   boolean_t connect_switches;
char *lid_matrix_dump_file;
char *lfts_file;
char *root_guid_file;
@@ -369,8 +370,13 @@ typedef struct osm_subn_opt {
 *
 *  connect_roots
 *  The option which will enforce root to root connectivity with
-*  up/down routing engine (even if this violates pure deadlock
-*  free up/down algorithm)
+*  up/down and fat-tree routing engines (even if this violates
+*  pure deadlock free up/down or fat-tree algorithm)
+*
+*  connect_switches
+*  The option which will enforce switch to switch connectivity
+*  with up/down and fat-tree routing engines (even if this
+*  violates pure deadlock free up/down or fat-tree algorithm)
 *
 *  use_ucast_cache
 *  When TRUE enables unicast routing cache.
diff --git a/opensm/man/opensm.8.in b/opensm/man/opensm.8.in
index 03002c0..d6fceef 100644
--- a/opensm/man/opensm.8.in
+++ b/opensm/man/opensm.8.in
@@ -19,6 +19,7 @@ opensm \- InfiniBand subnet manager and administration (SM/SA)
 [\-\-lash_start_vl vl number]
 [\-A | \-\-ucast_cache]
 [\-z | \-\-connect_roots]
+[\-\-connect_switches]
 [\-M file name | \-\-lid_matrix_file file name]
 [\-U file name | \-\-lfts_file file name]
 [\-S | \-\-sadb_file file name]
@@ -170,8 +171,14 @@ recalculations: one when the host goes down, and the other 
when
 the host comes back online.
 .TP
 \fB\-z\fR, \fB\-\-connect_roots\fR
-This option enforces a routing engine (currently up/down
-only) to make connectivity between root switches and in
+This option enforces routing engines (up/down and
+fat-tree) to make connectivity between root switches and in
+this way to be fully IBA complaint. In many cases this can
+violate pure deadlock free algorithm, so use it carefully.
+.TP
+\fB\-\-connect_switches\fR
+This option enforces routing engines (up/down and
+fat-tree) to make connectivity between all switches and in
 this way to be fully IBA complaint. In many cases this can
 violate pure deadlock free algorithm, so use it carefully.
 .TP
diff --git a/opensm/opensm/main.c b/opensm/opensm/main.c
index 2e28c83..8175887 100644
--- a/opensm/opensm/main.c
+++ b/opensm/opensm/main.c
@@ -189,8 +189,14 @@ static void show_usage(void)
printf(--sm_sl sl number\n
 Sets the SL to use to communicate with the SM/SA. 
Defaults to 0.\n\n);
printf(--connect_roots, -z\n
-This option enforces a routing engine (currently\n
-up/down only) to make connectivity between root 
switches\n
+This option enforces routing engines (up/down and \n
+fat-tree) to make connectivity between root 
switches\n
+and in this way be IBA compliant. In many cases,\n
+this can violate \pure\ deadlock free algorithm, 
so\n
+use it carefully.\n\n);
+   printf(--connect_switches\n
+This option enforces routing engines (up/down and \n
+fat-tree) to make connectivity between all the 
switches\n
 and in this way be IBA compliant. In many cases,\n
 this can violate \pure\ deadlock free algorithm, 
so\n
 use it carefully.\n\n);
@@ -610,6 +616,7 @@ int main(int argc, char *argv[])
{do_mesh_analysis, 0, NULL, 5},
{lash_start_vl, 1, NULL, 6},
{sm_sl, 1, NULL, 7},
+   {connect_switches, 0, NULL, 8},
{NULL, 0, NULL, 0}  /* Required at the end of the array */
};

@@ -983,6 +990,10 @@ int main(int argc, char *argv[])
opt.sm_sl = (uint8_t) temp;
printf( SMSL = %d\n, opt.sm_sl);
break;
+   case 8:
+   opt.connect_switches = TRUE;
+   printf( Connect switches option is on\n);

[PATCH] opensm/osm_sa.c: don't ignore failure in osm_mgrp_add_port()

2009-10-28 Thread Yevgeny Kliteynik
Hi Sasha,

Small fix in loading SA DB: don't ignore failure
in osm_mgrp_add_port() - require clients re-registration.

Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il
---
 opensm/opensm/osm_sa.c |7 ---
 1 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/opensm/opensm/osm_sa.c b/opensm/opensm/osm_sa.c
index a124a45..7b9340d 100644
--- a/opensm/opensm/osm_sa.c
+++ b/opensm/opensm/osm_sa.c
@@ -1002,9 +1002,10 @@ int osm_sa_db_file_load(osm_opensm_t * p_osm)
port = osm_get_port_by_guid(p_osm-subn, guid);
if (port 
cl_qmap_get(p_mgrp-mcm_port_tbl, guid) ==
-   cl_qmap_end(p_mgrp-mcm_port_tbl))
-   osm_mgrp_add_port(p_osm-subn, p_osm-log,
- p_mgrp, port, mcmr, proxy);
+   cl_qmap_end(p_mgrp-mcm_port_tbl) 
+   !osm_mgrp_add_port(p_osm-subn, p_osm-log,
+   p_mgrp, port, mcmr, proxy))
+   rereg_clients = 1;
} else if (!strncmp(p, Service Record:, 15)) {
ib_service_record_t s_rec;
uint32_t modified_time, lease_period;
-- 
1.5.1.4


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] ibutils/ibdm/doc: Fix some typos

2009-10-28 Thread Yevgeny Kliteynik

Thanks, applied.

-- Yevgeny

Hal Rosenstock wrote:

Signed-off-by: Hal Rosenstock hal.rosenst...@gmail.com
---
diff --git a/ibdm/doc/ibdmchk.1 b/ibdm/doc/ibdmchk.1
index 650c9ff..7537473 100644
--- a/ibdm/doc/ibdmchk.1
+++ b/ibdm/doc/ibdmchk.1
@@ -186,7 +186,7 @@ Use enhanced routing algorithm when \s-1LMC\s0  0 and 
report the resulting path
 .IX Item -r|--roots roots file A file with all the roots node names (one on each 
line).
 .SH VERIFICATION MODE DESCRIPTION
 .IX Header VERIFICATION MODE DESCRIPTION
-After the cluster is built and OpenSM is run (using flag \-D 0x43) it reports 
the subnet and \s-1FDB\s0 tables into the files osm\-subnet.lst, osm.fdbs and 
osm.fdbs in /var/log/ (or subnet.lst, osm.fdbs and osm.mcfdbs into /tmp in 
older OpenSM versions). ibdiagnet is also producing the same files in its 
outoput directroy.
+After the cluster is built and OpenSM is run (using flag \-D 0x43) it reports 
the subnet and \s-1FDB\s0 tables into the files osm\-subnet.lst, osm.fdbs and 
osm.fdbs in /var/log/ (or subnet.lst, osm.fdbs and osm.mcfdbs into /tmp in 
older OpenSM versions). ibdiagnet is also producing the same files in its 
output directory.
 Based on these files the utility checks all \s-1CA\s0 to \s-1CA\s0 
connectivity. Further analysis for credit deadlock potential is performed and 
reported.
 In case of an \s-1LMC\s0  0 it reports histograms for how many systems and 
nodes are common between the different paths for the same port pairs.
 .SH ARGUMENTS
diff --git a/ibdm/doc/ibdmchk.pod b/ibdm/doc/ibdmchk.pod
index e6a2232..dabfafa 100644
--- a/ibdm/doc/ibdmchk.pod
+++ b/ibdm/doc/ibdmchk.pod
@@ -59,7 +59,7 @@ A file with all the roots node names (one on each line).
 
 =head1 VERIFICATION MODE DESCRIPTION
 
-After the cluster is built and OpenSM is run (using flag -D 0x43) it reports the subnet and FDB tables into the files osm-subnet.lst, osm.fdbs and osm.fdbs in /var/log/ (or subnet.lst, osm.fdbs and osm.mcfdbs into /tmp in older OpenSM versions). ibdiagnet is also producing the same files in its outoput directroy.

+After the cluster is built and OpenSM is run (using flag -D 0x43) it reports 
the subnet and FDB tables into the files osm-subnet.lst, osm.fdbs and osm.fdbs 
in /var/log/ (or subnet.lst, osm.fdbs and osm.mcfdbs into /tmp in older OpenSM 
versions). ibdiagnet is also producing the same files in its output directory.
 Based on these files the utility checks all CA to CA connectivity. Further 
analysis for credit deadlock potential is performed and reported.
 In case of an LMC  0 it reports histograms for how many systems and nodes are 
common between the different paths for the same port pairs.
 



--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] infiniband-diags/ibqueryerrors: Use remap'ed node name in clear port error message

2009-10-28 Thread Sasha Khapyorsky
On 10:22 Tue 27 Oct , Ira Weiny wrote:
 
 From: Ira Weiny wei...@llnl.gov
 Date: Tue, 27 Oct 2009 10:22:36 -0700
 Subject: [PATCH] infiniband-diags/ibqueryerrors: Use remap'ed node name in 
 clear port error message
 
 
 Signed-off-by: Ira Weiny wei...@llnl.gov

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] link-local address fix for rdma_resolve_addr

2009-10-28 Thread Or Gerlitz

Jason Gunthorpe wrote:

Wow, seriously? You do understand the purpose of review, right?
I think I do, maybe not to the depth you and your arguments are, but 
again, repeating myself: my kind of simple argument is that your review 
is way beyond the --change-- suggested by a patch but rather of a whole 
logic, and you block a patch b/c you don't like the logic this patch 
integrates with. To some extent such practice is accepted, but you took 
it way beyond the acceptable limit. I don't accept your assertion that 
the whole logic is broken and it makes sense to me to have a patch from 
Dave to fix the IPv6 part of it. Next or in parallel you are welcome to 
send a patch fixing/re-writing the whole bind logic or even the whole 
rdma stack or the whole kernel.



And yes, actually, accounting for how rdma_bind() is different from bind() when 
doing route resolution is pretty much the main remaining problem

go and fix that

Or.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RDMA] Fixup IPv6 support and IPv4 routing corner cases for RDMA CM

2009-10-28 Thread Or Gerlitz
Jason Gunthorpe wrote:
 **COMPILE TESTED ONLY**

any reason why other people have to test for you?

 Convert the address resolution process for outgoing connections
 to be very similar to the way the TCP stack does the same operations.
 This fixes many corner case bugs that can crop up.

rdma_join_multicast(3) states that before  joining  a  multicast  group,  the  
rdma_cm_id  must  be  bound to an RDMA device by calling rdma_bind_addr or 
rdma_resolve_addr, please make sure that this flow isn't broken by your patch.

Or.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ofa-general][PATCH 3/4] SRP fail-over faster

2009-10-28 Thread David Dillow
On Sat, 2009-10-24 at 03:35 -0400, Vu Pham wrote:
 It's a big improvement from 3-5 minutes cutting down to 1s and now you
 talk about device_loss_timeout=0. I'll look at the trade-off to have
 it; however, to receive and process the async event (port error)
 already cost you a fair amount of cycles.

I agree that it is a great improvement over just sending packets blindly
to the link, and waiting for SCSI to time them out -- I've been using
the variant of the patches from OFED -- but it is harder to change
things once they are in the mainstream kernel, so I'd like to see it
done better.

And hey, maybe I'm just overly touchy about this. These should be rare
events, and there's nothing we can do about the commands sent prior to
being told about the link error. It's just that I don't want my file
system to stall the petaflop simulation platforms if I can avoid it --
and there's no reason to send any command down the wire once we've been
told there is no link or the target is not there. Maybe we don't need to
destroy the link immediately, but we need to let the SCSI mid-layer know
that things are failing.
-- 
Dave Dillow
National Center for Computational Science
Oak Ridge National Laboratory
(865) 241-6602 office


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] remove extraneous Windows include

2009-10-28 Thread Sasha Khapyorsky
On 10:36 Tue 27 Oct , Stan C. Smith wrote:
 
 Remove unneeded Windows include.
 
 signed-off-by: stan smith stan.sm...@intel.com

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opemsm code cleanup

2009-10-28 Thread Sasha Khapyorsky
On 10:43 Tue 27 Oct , Stan C. Smith wrote:
 
 Cleanup code by removing if statement which does nothing.
 
 Signed-off-by: stan smith stan.sm...@intel.com

Whitespace is mangled in the patch. Applied by hand. Thanks.

Sasha

 
 diff --git a/opensm/opensm/osm_vl15intf.c b/opensm/opensm/osm_vl15intf.c
 index 9e43a9c..ee9626f 100644
 --- a/opensm/opensm/osm_vl15intf.c
 +++ b/opensm/opensm/osm_vl15intf.c
 @@ -271,9 +271,6 @@ ib_api_status_t osm_vl15_init(IN osm_vl15_t * p_vl, IN 
 osm_vendor_t * p_vend,
  */
 status = cl_thread_init(p_vl-poller, vl15_poller, p_vl,
 opensm poller);
 -   if (status != IB_SUCCESS)
 -   goto Exit;
 -
  Exit:
 OSM_LOG_EXIT(p_log);
 return (status);
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-rdma in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] opensm/osm_sa.c: don't ignore failure in osm_mgrp_add_port()

2009-10-28 Thread Sasha Khapyorsky
On 11:54 Wed 28 Oct , Yevgeny Kliteynik wrote:
 Hi Sasha,
 
 Small fix in loading SA DB: don't ignore failure
 in osm_mgrp_add_port() - require clients re-registration.
 
 Signed-off-by: Yevgeny Kliteynik klit...@dev.mellanox.co.il

Applied. Thanks.

Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RDMA] Fixup IPv6 support and IPv4 routing corner cases for RDMA CM

2009-10-28 Thread Jason Gunthorpe
On Wed, Oct 28, 2009 at 10:05:19AM -0700, Sean Hefty wrote:
 Can you explain how rdma_resolve_addr is used in conjunction with
 multicast? I do not understand what the dest would be. Is it just a man
 page typo?
 
 A UD endpoint can communicate using multicast and to other UD
 endpoints.  A user could resolve a UD endpoint before joining a
 multicast group.

So the IP world analog would be:

fd = socket(AF_INET,SOCK_DGRAM);
connect(fd,'Some Unicast Address');
setsockopt(fd,IP_ADD_MEMBERSHIP,'Some Multicast Address');
sendto(fd,...,'Some Multicast Address');

?

I think that is still OK. The routines still bind the rdma cm_id to
the devices via rdma_translate_ip pretty much like they did before.

There is no support for Linux IP multicast routing though..

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [ofa-general][PATCH 3/4] SRP fail-over faster

2009-10-28 Thread Roland Dreier

  +if (timer_pending(target-qp_err_timer))
  +del_timer_sync(target-qp_err_timer);
  +
   spin_unlock_irq(target-scsi_host-host_lock);

As was pointed out, I don't think you can do del_timer_sync while
holding the lock, since the timer function takes the same lock.

But I don't know that just switching to del_timer without the sync works
here ... without the sync then the timeout function could still run any
time after the del_timer, even after everything gets freed.

BTW the test of timer_pending isn't needed here... del_timer does the
test internally anyway.

I do agree it would be very good to improve the SRP error handling.  I
have some concerns about the overall design here -- it seems that if we
handle connection failure and allow a new connection to proceed while
cleaning up asynchronously, then this opens the door to a lot of
complexity, and I don't see that complexity handled in this patchset.
For example, the new connection could fail too before the old one is
done cleaning up, etc, etc and we end up with an arbitrarily large queue
of things waiting to clean up.  Or maybe it really is simpler than that.

I think the best way to move this forward would be to post another
cleaned up version of your patch set.  Please try to reorganize things
so each patch is reasonably self contained.  Of course your patchset is
taking multiple steps to improve things.  But as much as possible,
please try to avoid combining two things into a single patch, and
conversely also try to avoid putting things into a patch that don't make
sense without a later patch.

Avoiding policy in the kernel as much as possible in terms of hard-coded
timeouts etc is a good goal too.

Also it would help to give each patch a separate descriptive subject,
and put as much detail in the changelogs as you can.

Thanks,
  Roland
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] please pull infiniband.git

2009-10-28 Thread Roland Dreier
Linus, please pull from

master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git for-linus

This tree is also available from kernel.org mirrors at:

git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git 
for-linus

This will just get one PCI device ID addition:

Eli Cohen (1):
  mlx4_core: Add a new supported 40 GigE device ID

 drivers/net/mlx4/main.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 5dd7225..291a505 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -1282,6 +1282,7 @@ static struct pci_device_id mlx4_pci_table[] = {
{ PCI_VDEVICE(MELLANOX, 0x6372) }, /* MT25458 ConnectX EN 10GBASE-T 
10GigE */
{ PCI_VDEVICE(MELLANOX, 0x675a) }, /* MT25458 ConnectX EN 
10GBASE-T+Gen2 10GigE */
{ PCI_VDEVICE(MELLANOX, 0x6764) }, /* MT26468 ConnectX EN 10GigE PCIe 
gen2*/
+   { PCI_VDEVICE(MELLANOX, 0x6746) }, /* MT26438 ConnectX EN 40GigE PCIe 
gen2 5GT/s */
{ PCI_VDEVICE(MELLANOX, 0x676e) }, /* MT26478 ConnectX2 40GigE PCIe 
gen2 */
{ 0, }
 };
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Adjusting minimum packet size or wait to merge requests in SRP

2009-10-28 Thread Chris Worley
It appears that SRP tries to coalesce and fragment initiator I/O
requests into 64KB packets, as that looks to be the size requested
to/from the device on the target side (and the I/O scheduler is
disabled on the target).

Is there a way to control this, where no coalescing occurs when
latency is an issue and requests are small, and no fragmentation
occurs when requests are large?

Or, am I totally wrong in my assumption that SRP is coalescing/fragmenting data?

Thanks,

Chris
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Adjusting minimum packet size or wait to merge requests in SRP

2009-10-28 Thread Bart Van Assche
On Wed, Oct 28, 2009 at 7:47 PM, Chris Worley worl...@gmail.com wrote:
 It appears that SRP tries to coalesce and fragment initiator I/O
 requests into 64KB packets, as that looks to be the size requested
 to/from the device on the target side (and the I/O scheduler is
 disabled on the target).

 Is there a way to control this, where no coalescing occurs when
 latency is an issue and requests are small, and no fragmentation
 occurs when requests are large?

 Or, am I totally wrong in my assumption that SRP is coalescing/fragmenting 
 data?

Regarding avoiding coalescing of I/O requests: which I/O scheduler is
being used on the initiator system and how has it been configured via
sysfs ?

Adjusting the constant MAX_RDMA_SIZE in scst/srpt/src/ib_srpt.h might
help to avoid fragmentation of large requests by the SRP protocol.
Please post a follow-up message to the mailing list with your findings
such that MAX_RDMA_SIZE can be converted from a compile-time constant
to a sysfs variable if this would be useful.

Bart.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path

2009-10-28 Thread Sean Hefty
But, I still think this API should return EINVAL if the cm_id is in
AF_INET/AF_INET6 mode. That is to say, this API only works with the
AF_IB idea we have been discussing.

I suggest this because using this API really does override the
capabilities of the AF_INET/6 in unexpected ways, as the discussion
drifted through it seemed like at least bonding, routing
and ND operations would/could be overridden.

If so then I'd say it should be part of an AF_IB patch.

Sean, what are your thoughts on applying it to AF_INET/6?

Even without any other kernel changes, this patch enables us to solve the
biggest scaling problem that we've measured, so I want to allow it regardless of
what the original addressing was.  Whether a path record comes from the SA, a
local cache, some wonky multicast protocol, or is made up is really independent
from how the GIDs were discovered.

- Sean

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Adjusting minimum packet size or wait to merge requests in SRP

2009-10-28 Thread Chris Worley
On Wed, Oct 28, 2009 at 1:14 PM, Bart Van Assche
bart.vanass...@gmail.com wrote:
 On Wed, Oct 28, 2009 at 7:47 PM, Chris Worley worl...@gmail.com wrote:
 It appears that SRP tries to coalesce and fragment initiator I/O
 requests into 64KB packets, as that looks to be the size requested
 to/from the device on the target side (and the I/O scheduler is
 disabled on the target).

 Is there a way to control this, where no coalescing occurs when
 latency is an issue and requests are small, and no fragmentation
 occurs when requests are large?

 Or, am I totally wrong in my assumption that SRP is coalescing/fragmenting 
 data?

 Regarding avoiding coalescing of I/O requests: which I/O scheduler is
 being used on the initiator system and how has it been configured via
 sysfs ?

There is no scheduler running on either target or initiator on the
drives in question (sorry I worded that incorrectly initially), or so
I've been told (this information is second-hand).  I did see iostat
output from the initiator in his case, where there were long waits and
service times that I'm guessing were due to some coalescing/merging.
There was also a hint in the iostat output that a scheduler was
enabled, as there were non-zero values (occasionally) under the
[rw]qm/s columns, which, if I understand iostat correctly, means there
is a scheduler merging results.

So you're saying there is no hold-off for merging on the initiator
side of the IB/SRP stack?

 Adjusting the constant MAX_RDMA_SIZE in scst/srpt/src/ib_srpt.h might
 help to avoid fragmentation of large requests by the SRP protocol.
 Please post a follow-up message to the mailing list with your findings
 such that MAX_RDMA_SIZE can be converted from a compile-time constant
 to a sysfs variable if this would be useful.

Will do.

Thanks,

Chris

 Bart.

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Adjusting minimum packet size or wait to merge requests in SRP

2009-10-28 Thread Roland Dreier

  It appears that SRP tries to coalesce and fragment initiator I/O
  requests into 64KB packets, as that looks to be the size requested
  to/from the device on the target side (and the I/O scheduler is
  disabled on the target).

There is no code in the SRP initiator that does anything to change IO
requests that I know of.  So I think this is happening somewhere higher
in the stack.

 - R.
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path

2009-10-28 Thread Jason Gunthorpe
On Wed, Oct 28, 2009 at 02:41:15PM -0700, Sean Hefty wrote:
 Does a DGID returning API already exist?
 
 yes - query_route returns the following information: SGID, DGID, pkey, source
 address, destination address, and path records (max of 2).  Not all of the
 information is valid, depending on the state of the rdma cm id.  The librdmacm
 already invokes this after rdma_resolve_addr completes.

Great, I didn't realize that was there. No further comments from me then

Reviewed-By: Jason Gunthorpe jguntho...@obsidianresearch.com

Jason
--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] uDAPL v2: ucm: increase timers during subsequent retries, add create_ah error checking

2009-10-28 Thread Davis, Arlin R

- increase timers during subsequent retries,
- check/process create_ah errors during connect phase,
- cleanup some debug messaging.

Signed-off-by: Arlin Davis arlin.r.da...@intel.com
---
 dapl/openib_ucm/cm.c |   81 ++---
 1 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 96ee382..07b8458 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -163,17 +163,16 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
*timer = cm-hca-ib_trans.cm_timer; 
/* wait longer each retry */
if ((time - cm-timer)/1000  
-   (cm-hca-ib_trans.rep_time * cm-retries)) {
+   (cm-hca-ib_trans.rep_time  cm-retries)) {
dapl_log(DAPL_DBG_TYPE_WARN,
  CM_REQ retry %d [lid, port, qpn]:
- %x %x %x - %x %x %x \n, 
-cm-retries,
-ntohs(cm-msg.saddr.ib.lid), 
-ntohs(cm-msg.sport),
-ntohl(cm-msg.saddr.ib.qpn), 
-ntohs(cm-msg.daddr.ib.lid), 
-ntohs(cm-msg.dport),
-ntohl(cm-msg.dqpn));
+ %x %x %x - %x %x %x Time(ms) %llu  
%llu\n, 
+cm-retries, ntohs(cm-msg.saddr.ib.lid), 
+ntohs(cm-msg.sport), 
ntohl(cm-msg.saddr.ib.qpn), 
+ntohs(cm-msg.daddr.ib.lid), 
ntohs(cm-msg.dport),
+ntohl(cm-msg.dqpn), (time - cm-timer)/1000, 
+cm-hca-ib_trans.rep_time  cm-retries);
+   cm-retries++;
dapl_os_unlock(cm-lock);
dapli_cm_connect(cm-ep, cm);
return;
@@ -182,10 +181,10 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
case DCM_RTU_PENDING: 
*timer = cm-hca-ib_trans.cm_timer;  
if ((time - cm-timer)/1000  
-   (cm-hca-ib_trans.rtu_time * cm-retries)) {
+   (cm-hca-ib_trans.rtu_time  cm-retries)) {
dapl_log(DAPL_DBG_TYPE_WARN,
  CM_REPLY retry %d [lid, port, qpn]:
- %x %x %x - %x %x %x r_pid %x,%d\n, 
+ %x %x %x - %x %x %x r_pid %x,%d Time(ms) 
%llu  %llu\n, 
 cm-retries,
 ntohs(cm-msg.saddr.ib.lid), 
 ntohs(cm-msg.sport),
@@ -194,7 +193,9 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
 ntohs(cm-msg.dport),
 ntohl(cm-msg.daddr.ib.qpn),  
 ntohl(*(DAT_UINT32*)cm-msg.resv),
-ntohl(*(DAT_UINT32*)cm-msg.resv)); 
+ntohl(*(DAT_UINT32*)cm-msg.resv), 
+(time - cm-timer)/1000, 
cm-hca-ib_trans.rtu_time  cm-retries);
+   cm-retries++;
dapl_os_unlock(cm-lock);
ucm_reply(cm);
return;
@@ -204,10 +205,10 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
*timer = cm-hca-ib_trans.cm_timer; 
/* wait longer each retry */
if ((time - cm-timer)/1000  
-   (cm-hca-ib_trans.rep_time)) {
+   (cm-hca-ib_trans.rtu_time  cm-retries)) {
dapl_log(DAPL_DBG_TYPE_WARN,
  CM_DREQ retry %d [lid, port, qpn]:
- %x %x %x - %x %x %x r_pid %x,%d\n, 
+ %x %x %x - %x %x %x r_pid %x,%d Time(ms) 
%llu  %llu\n, 
 cm-retries,
 ntohs(cm-msg.saddr.ib.lid), 
 ntohs(cm-msg.sport),
@@ -216,7 +217,9 @@ static void ucm_check_timers(dp_ib_cm_handle_t cm, int 
*timer)
 ntohs(cm-msg.dport),
 ntohl(cm-msg.dqpn), 
 ntohl(*(DAT_UINT32*)cm-msg.resv),
-ntohl(*(DAT_UINT32*)cm-msg.resv)); 
+ntohl(*(DAT_UINT32*)cm-msg.resv), 
+(time - cm-timer)/1000, 
cm-hca-ib_trans.rtu_time  cm-retries);
+   cm-retries++;
dapl_os_unlock(cm-lock);
dapli_cm_disconnect(cm);
 return;
@@ -448,8 +451,8 @@ retry_listenq:
} else {
  

[PATCH 3/5] uDAPL v2: ucm: change some debug message levels, check for valid UD REPLY during retries

2009-10-28 Thread Davis, Arlin R

Signed-off-by: Arlin Davis arlin.r.da...@intel.com
---
 dapl/openib_ucm/cm.c |   38 +-
 1 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 07b8458..b28e911 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -395,12 +395,22 @@ static void ucm_process_recv(ib_hca_transport_t *tp,
}
dapl_os_unlock(cm-lock);
break;
-   
+   case DCM_RELEASED:
+   /* UD reply retried, ignore */
+   if (ntohs(msg-op) != DCM_REP) {
+   dapl_log(DAPL_DBG_TYPE_WARN,
+ucm_recv: UNKNOWN operation
+- op %d, %s spsp %d sqpn %d\n, 
+   ntohs(msg-op), dapl_cm_state_str(cm-state),
+   ntohs(msg-sport), ntohl(msg-sqpn));
+   }
+   dapl_os_unlock(cm-lock);
+   break;
default:
dapl_log(DAPL_DBG_TYPE_WARN,
 ucm_recv: UNKNOWN state
-- op %d, st %d spsp %d sqpn %d\n, 
-   ntohs(msg-op), cm-state, 
+- op %d, %s spsp %d sqpn %d\n, 
+   ntohs(msg-op), dapl_cm_state_str(cm-state), 
ntohs(msg-sport), ntohl(msg-sqpn));
dapl_os_unlock(cm-lock);
break;
@@ -479,7 +489,7 @@ retry_listenq:
/* not match on listenq for valid request, send reject */
if (ntohs(msg-op) == DCM_REQ  !found)
ucm_reject(tp, msg);
-#if DAPL_DBG
+
if (!found) {
dapl_log(DAPL_DBG_TYPE_WARN,
 ucm_recv: NO MATCH op %s 0x%x %d i0x%x c0x%x
@@ -490,7 +500,7 @@ retry_listenq:
ntohs(msg-saddr.ib.lid), ntohs(msg-sport), 
ntohl(msg-saddr.ib.qpn));
}
-#endif
+
return found;
 }
 
@@ -813,8 +823,18 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
case DCM_DISC_PENDING:
/* DREQ timeout, resend until retries exhausted */
cm-msg.op = htons(DCM_DREQ);
-   if (cm-retries = cm-hca-ib_trans.retries)
+   if (cm-retries = cm-hca-ib_trans.retries) {
+   dapl_log(DAPL_DBG_TYPE_ERR, 
+CM_DREQ: RETRIES EXHAUSTED:
+0x%x %d 0x%x - 0x%x %d 0x%x\n,
+   htons(cm-msg.saddr.ib.lid), 
+   htonl(cm-msg.saddr.ib.qpn), 
+   htons(cm-msg.sport), 
+   htons(cm-msg.daddr.ib.lid), 
+   htonl(cm-msg.dqpn), 
+   htons(cm-msg.dport));
finalize = 1;
+   }
break;
case DCM_DISC_RECV:
/* DREQ received, send DREP and schedule event */
@@ -857,7 +877,7 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
}

if (cm-retries == cm-hca-ib_trans.retries) {
-   dapl_log(DAPL_DBG_TYPE_WARN, 
+   dapl_log(DAPL_DBG_TYPE_ERR, 
 CM_REQ: RETRIES EXHAUSTED:
  0x%x %d 0x%x - 0x%x %d 0x%x\n,
 htons(cm-msg.saddr.ib.lid), 
@@ -1289,8 +1309,8 @@ static int ucm_reply(dp_ib_cm_handle_t cm)
}
 
if (cm-retries == cm-hca-ib_trans.retries) {
-   dapl_log(DAPL_DBG_TYPE_WARN, 
- CM_REP: RETRIES EXHAUSTED
+   dapl_log(DAPL_DBG_TYPE_ERR, 
+ CM_REPLY: RETRIES EXHAUSTED
  0x%x %d 0x%x - 0x%x %d 0x%x\n,
 htons(cm-msg.saddr.ib.lid), 
 htons(cm-msg.sport), 
-- 
1.5.2.5

--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] uDAPL v2: Patch series for ucm, scm: fixes for issues discovered during scale-up, out testing

2009-10-28 Thread Arlin Davis

Linux testing completed with Intel MPI/HPCC benchmarks on 128 nodes, 1024 cores.
ucm, scm: address handles need to be destroyed when freeing Endpoints with UD QPs.

Signed-off-by: Arlin Davis arlin.r.da...@intel.com
---
 dapl/openib_scm/cm.c   |4 
 dapl/openib_ucm/cm.c   |6 ++
 dapl/openib_ucm/dapl_ib_util.h |1 +
 dapl/openib_ucm/device.c   |   16 ++--
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 453e32e..0d2d058 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -355,6 +355,10 @@ multi_cleanup:
dapl_os_lock(cr-lock);
hca_ptr = cr-hca;
cr-ep = NULL;
+   if (cr-ah) {
+   ibv_destroy_ah(cr-ah);
+   cr-ah = NULL;
+   }
cr-state = DCM_DESTROY;
dapl_os_unlock(cr-lock);
}
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index cc480c4..96ee382 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -679,6 +679,10 @@ static void ucm_ud_free(DAPL_EP *ep)
dapl_os_lock(cm-lock);
hca = cm-hca;
cm-ep = NULL;
+   if (cm-ah) {
+   ibv_destroy_ah(cm-ah);
+   cm-ah = NULL;
+   }
cm-state = DCM_DESTROY;
dapl_os_unlock(cm-lock);
}
@@ -1041,6 +1045,7 @@ ud_bail:
event = IB_CME_LOCAL_FAILURE;
goto bail;
}
+   cm-ah = xevent.remote_ah.ah; /* keep ref to destroy */
 
dapl_os_memcpy(xevent.remote_ah.ia_addr,
   cm-msg.daddr,
@@ -1218,6 +1223,7 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, 
ib_cm_msg_t *msg)
if (xevent.remote_ah.ah == NULL) 
goto bail;
 
+   cm-ah = xevent.remote_ah.ah; /* keep ref to destroy */
dapl_os_memcpy(xevent.remote_ah.ia_addr,
   cm-msg.daddr,
sizeof(union dcm_addr));
diff --git a/dapl/openib_ucm/dapl_ib_util.h b/dapl/openib_ucm/dapl_ib_util.h
index 27ff8dd..6273459 100644
--- a/dapl/openib_ucm/dapl_ib_util.h
+++ b/dapl/openib_ucm/dapl_ib_util.h
@@ -43,6 +43,7 @@ struct ib_cm_handle
struct dapl_hca *hca;
struct dapl_sp  *sp;
struct dapl_ep  *ep;
+   struct ibv_ah   *ah;
uint16_tp_size; /* accept p_data, for retries */
uint8_t p_data[DCM_MAX_PDATA_SIZE];
ib_cm_msg_t msg;
diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c
index 077446b..e890eef 100644
--- a/dapl/openib_ucm/device.c
+++ b/dapl/openib_ucm/device.c
@@ -434,14 +434,18 @@ static void ucm_service_destroy(IN DAPL_HCA *hca)
if (tp-rch)
ibv_destroy_comp_channel(tp-rch);
 
-dapl_log(DAPL_DBG_TYPE_UTIL,
- destroy_service: pd %p ctx %p handle 0x%x\n,
- tp-pd, tp-pd-context, tp-pd-handle);
-   if (tp-pd)
-   ibv_dealloc_pd(tp-pd);
+   if (tp-ah) {
+   int i;
 
-   if (tp-ah)
+   for (i = 0;i  0x; i++) {
+   if (tp-ah[i])
+   ibv_destroy_ah(tp-ah[i]);
+   }
dapl_os_free(tp-ah, (sizeof(*tp-ah) * 0x));
+   }
+
+   if (tp-pd)
+   ibv_dealloc_pd(tp-pd);
 
if (tp-sid)
dapl_os_free(tp-sid, (sizeof(*tp-sid) * 0x));
-- 
1.5.2.5


--
To unsubscribe from this list: send the line unsubscribe linux-rdma in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH libibverbs] Make ibv_get_device_list return codes via errno

2009-10-28 Thread Jason Gunthorpe
- Suppress fprintf to stderr in sundry cases like no verbs drivers
  loaded
- Fix double free in find_sysfs_devs if ibv_read_sysfs_file fails
  (unlikely)
- Update all example programs and man page

Code expecting this behavior but linked against an old libibverbs will
get the old fprintf and errno set to garbage (probably ESPIPE).

Signed-off-by: Jason Gunthorpe jguntho...@obsidianresearch.com
---
 examples/asyncwatch.c |2 +-
 examples/device_list.c|2 +-
 examples/devinfo.c|4 ++--
 examples/rc_pingpong.c|2 +-
 examples/srq_pingpong.c   |2 +-
 examples/uc_pingpong.c|2 +-
 examples/ud_pingpong.c|2 +-
 man/ibv_get_device_list.3 |   16 +++-
 src/device.c  |   21 +++--
 src/init.c|   38 ++
 10 files changed, 56 insertions(+), 35 deletions(-)

The double free thing could be split out if you want, I just caught it
while working on this.

Works like this:
$ build/examples/ibv_devinfo   
Failed to get IB devices list: Function not implemented
$ build/examples/ibv_devinfo   
No IB devices found

diff --git a/examples/asyncwatch.c b/examples/asyncwatch.c
index 16aee2c..e56b4dc 100644
--- a/examples/asyncwatch.c
+++ b/examples/asyncwatch.c
@@ -82,7 +82,7 @@ int main(int argc, char *argv[])
 
dev_list = ibv_get_device_list(NULL);
if (!dev_list) {
-   fprintf(stderr, No IB devices found\n);
+   perror(Failed to get IB devices list);
return 1;
}
 
diff --git a/examples/device_list.c b/examples/device_list.c
index 3ce8cbd..70c3af5 100644
--- a/examples/device_list.c
+++ b/examples/device_list.c
@@ -49,7 +49,7 @@ int main(int argc, char *argv[])
 
dev_list = ibv_get_device_list(num_devices);
if (!dev_list) {
-   fprintf(stderr, No IB devices found\n);
+   perror(Failed to get IB devices list);
return 1;
}
 
diff --git a/examples/devinfo.c b/examples/devinfo.c
index caa5d5f..33d1a9b 100644
--- a/examples/devinfo.c
+++ b/examples/devinfo.c
@@ -361,7 +361,7 @@ int main(int argc, char *argv[])
case 'l':
dev_list = orig_dev_list = 
ibv_get_device_list(num_of_hcas);
if (!dev_list) {
-   fprintf(stderr, Failed to get IB devices 
list);
+   perror(Failed to get IB devices list);
return -1;
}
 
@@ -387,7 +387,7 @@ int main(int argc, char *argv[])
 
dev_list = orig_dev_list = ibv_get_device_list(NULL);
if (!dev_list) {
-   fprintf(stderr, Failed to get IB device list\n);
+   perror(Failed to get IB devices list);
return -1;
}
 
diff --git a/examples/rc_pingpong.c b/examples/rc_pingpong.c
index d4115e4..fa969e0 100644
--- a/examples/rc_pingpong.c
+++ b/examples/rc_pingpong.c
@@ -593,7 +593,7 @@ int main(int argc, char *argv[])
 
dev_list = ibv_get_device_list(NULL);
if (!dev_list) {
-   fprintf(stderr, No IB devices found\n);
+   perror(Failed to get IB devices list);
return 1;
}
 
diff --git a/examples/srq_pingpong.c b/examples/srq_pingpong.c
index e47bae6..1e36c57 100644
--- a/examples/srq_pingpong.c
+++ b/examples/srq_pingpong.c
@@ -682,7 +682,7 @@ int main(int argc, char *argv[])
 
dev_list = ibv_get_device_list(NULL);
if (!dev_list) {
-   fprintf(stderr, No IB devices found\n);
+   perror(Failed to get IB devices list);
return 1;
}
 
diff --git a/examples/uc_pingpong.c b/examples/uc_pingpong.c
index 404b059..6f31247 100644
--- a/examples/uc_pingpong.c
+++ b/examples/uc_pingpong.c
@@ -581,7 +581,7 @@ int main(int argc, char *argv[])
 
dev_list = ibv_get_device_list(NULL);
if (!dev_list) {
-   fprintf(stderr, No IB devices found\n);
+   perror(Failed to get IB devices list);
return 1;
}
 
diff --git a/examples/ud_pingpong.c b/examples/ud_pingpong.c
index 8f3d50b..6f10212 100644
--- a/examples/ud_pingpong.c
+++ b/examples/ud_pingpong.c
@@ -580,7 +580,7 @@ int main(int argc, char *argv[])
 
dev_list = ibv_get_device_list(NULL);
if (!dev_list) {
-   fprintf(stderr, No IB devices found\n);
+   perror(Failed to get IB devices list);
return 1;
}
 
diff --git a/man/ibv_get_device_list.3 b/man/ibv_get_device_list.3
index 003fffb..16cc1a0 100644
--- a/man/ibv_get_device_list.3
+++ b/man/ibv_get_device_list.3
@@ -25,10 +25,24 @@ returned by
 .B ibv_get_device_list()\fR.
 .SH RETURN VALUE
 .B ibv_get_device_list()
-returns the array of available RDMA devices, or NULL if the request fails.
+returns the array of available RDMA devices, or sets
+.I errno
+and returns NULL if the request fails. If no 

[infiniband-diags] [PATCH] [2/2] remove 'dist' field from ibnd_node_t, which was virtually not used

2009-10-28 Thread Al Chu
Remove the 'dist' field from the ibnd_node_t struct and rework the code
accordingly.  It turns out this field was only used to pass a value from
create_node() to add_to_nodedist(), and create_node() is the only
function that calls add_to_nodedist().  In other words, it served pretty
much no purpose.

Al

-- 
Albert Chu
ch...@llnl.gov
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
From: Albert Chu ch...@llnl.gov
Date: Wed, 28 Oct 2009 16:18:39 -0700
Subject: [PATCH] remove 'dist' field from ibnd_node_t, which was virtually not 
used


Signed-off-by: Albert Chu ch...@llnl.gov
---
 .../libibnetdisc/include/infiniband/ibnetdisc.h|1 -
 infiniband-diags/libibnetdisc/src/ibnetdisc.c  |6 ++
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h 
b/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h
index 8303175..a8d290c 100644
--- a/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h
+++ b/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h
@@ -48,7 +48,6 @@ typedef struct ibnd_node {
struct ibnd_node *next; /* all node list in fabric */
 
ib_portid_t path_portid;/* path from from_node */
-   int dist;   /* num of hops from from_node */
int smalid;
int smalmc;
 
diff --git a/infiniband-diags/libibnetdisc/src/ibnetdisc.c 
b/infiniband-diags/libibnetdisc/src/ibnetdisc.c
index b25c3d0..047b705 100644
--- a/infiniband-diags/libibnetdisc/src/ibnetdisc.c
+++ b/infiniband-diags/libibnetdisc/src/ibnetdisc.c
@@ -387,9 +387,8 @@ static void add_to_type_list(ibnd_node_t * node, 
ibnd_fabric_t * fabric)
}
 }
 
-static void add_to_nodedist(ibnd_node_t * node, ibnd_scan_t * ibnd_scan)
+static void add_to_nodedist(ibnd_node_t * node, ibnd_scan_t * ibnd_scan, int 
dist)
 {
-   int dist = node-dist;
if (node-type != IB_NODE_SWITCH)
dist = MAXHOPS; /* special Ca list */
 
@@ -410,7 +409,6 @@ static ibnd_node_t *create_node(ibnd_fabric_t * fabric, 
ibnd_scan_t * ibnd_scan,
}
 
memcpy(node, temp, sizeof(*node));
-   node-dist = dist;
node-path_portid = *path;
 
add_to_nodeguid_hash(node, fabric-nodestbl);
@@ -420,7 +418,7 @@ static ibnd_node_t *create_node(ibnd_fabric_t * fabric, 
ibnd_scan_t * ibnd_scan,
fabric-nodes = (ibnd_node_t *) node;
 
add_to_type_list(node, fabric);
-   add_to_nodedist(node, ibnd_scan);
+   add_to_nodedist(node, ibnd_scan, dist);
 
return node;
 }
-- 
1.5.4.5



[infiniband-diags] [PATCH] [1/2] split out ibnd_fabric_t fields that are only used during a scan

2009-10-28 Thread Al Chu
Split out public parameters from ibnd_fabric_t that are useless because they
are only used during the ibnetdiscover scan.

Note that this patch has similarities to a previous patch from Ira,
however it is separate and independent of that patch series.

Al

-- 
Albert Chu
ch...@llnl.gov
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
From: Albert Chu ch...@llnl.gov
Date: Tue, 27 Oct 2009 16:16:14 -0700
Subject: [PATCH] split out ibnd_fabric_t fields that are only used during a scan


Signed-off-by: Albert Chu ch...@llnl.gov
---
 .../libibnetdisc/include/infiniband/ibnetdisc.h|6 --
 infiniband-diags/libibnetdisc/src/chassis.c|   68 
 infiniband-diags/libibnetdisc/src/chassis.h|2 +-
 infiniband-diags/libibnetdisc/src/ibnetdisc.c  |   60 +
 infiniband-diags/libibnetdisc/src/internal.h   |   10 +++
 5 files changed, 83 insertions(+), 63 deletions(-)

diff --git a/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h 
b/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h
index c55ce00..8303175 100644
--- a/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h
+++ b/infiniband-diags/libibnetdisc/include/infiniband/ibnetdisc.h
@@ -124,7 +124,6 @@ typedef struct ibnd_chassis {
 } ibnd_chassis_t;
 
 #define HTSZ 137
-#define MAXHOPS63
 
 /** =
  * Fabric
@@ -145,14 +144,9 @@ typedef struct ibnd_fabric {
/* internal use only */
ibnd_node_t *nodestbl[HTSZ];
ibnd_port_t *portstbl[HTSZ];
-   ibnd_node_t *nodesdist[MAXHOPS + 1];
-   ibnd_chassis_t *first_chassis;
-   ibnd_chassis_t *current_chassis;
-   ibnd_chassis_t *last_chassis;
ibnd_node_t *switches;
ibnd_node_t *ch_adapters;
ibnd_node_t *routers;
-   ib_portid_t selfportid;
 } ibnd_fabric_t;
 
 /** =
diff --git a/infiniband-diags/libibnetdisc/src/chassis.c 
b/infiniband-diags/libibnetdisc/src/chassis.c
index 4886cfc..5043f42 100644
--- a/infiniband-diags/libibnetdisc/src/chassis.c
+++ b/infiniband-diags/libibnetdisc/src/chassis.c
@@ -96,7 +96,7 @@ static ibnd_chassis_t *find_chassisnum(ibnd_fabric_t * fabric,
 {
ibnd_chassis_t *current;
 
-   for (current = fabric-first_chassis; current; current = current-next) 
{
+   for (current = fabric-chassis; current; current = current-next) {
if (current-chassisnum == chassisnum)
return current;
}
@@ -214,7 +214,7 @@ static ibnd_chassis_t *find_chassisguid(ibnd_fabric_t * 
fabric,
uint64_t chguid;
 
chguid = get_chassisguid(node);
-   for (current = fabric-first_chassis; current; current = current-next) 
{
+   for (current = fabric-chassis; current; current = current-next) {
if (current-chassisguid == chguid)
return current;
}
@@ -782,19 +782,19 @@ static void voltaire_portmap(ibnd_port_t * port)
port-ext_portnum = int2ext_map_slb8[chipnum][portnum];
 }
 
-static int add_chassis(ibnd_fabric_t * fabric)
+static int add_chassis(ibnd_scan_t *ibnd_scan)
 {
-   if (!(fabric-current_chassis = calloc(1, sizeof(ibnd_chassis_t {
+   if (!(ibnd_scan-current_chassis = calloc(1, sizeof(ibnd_chassis_t {
IBND_ERROR(OOM: failed to allocate chassis object\n);
return (-1);
}
 
-   if (fabric-first_chassis == NULL) {
-   fabric-first_chassis = fabric-current_chassis;
-   fabric-last_chassis = fabric-current_chassis;
+   if (ibnd_scan-first_chassis == NULL) {
+   ibnd_scan-first_chassis = ibnd_scan-current_chassis;
+   ibnd_scan-last_chassis = ibnd_scan-current_chassis;
} else {
-   fabric-last_chassis-next = fabric-current_chassis;
-   fabric-last_chassis = fabric-current_chassis;
+   ibnd_scan-last_chassis-next = ibnd_scan-current_chassis;
+   ibnd_scan-last_chassis = ibnd_scan-current_chassis;
}
return (0);
 }
@@ -818,33 +818,35 @@ static void add_node_to_chassis(ibnd_chassis_t * chassis, 
ibnd_node_t * node)
Returns:
0 on success, -1 on failure
 */
-int group_nodes(ibnd_fabric_t * fabric)
+int group_nodes(ibnd_fabric_t * fabric, ibnd_scan_t *ibnd_scan)
 {
ibnd_node_t *node;
int dist;
int chassisnum = 0;
ibnd_chassis_t *chassis;
+   ibnd_chassis_t *ch, *ch_next;
 
-   fabric-first_chassis = NULL;
-   fabric-current_chassis = NULL;
+   ibnd_scan-first_chassis = NULL;
+   ibnd_scan-current_chassis = NULL;
+   ibnd_scan-last_chassis = NULL;
 
/* first pass on switches and build for every Voltaire node */
/* an appropriate chassis record (slotnum and position) */