Re: [O-MPI devel] [OMPI svn] svn:open-mpi r8692

2006-01-14 Thread Graham E Fagg

Hi all,
 whatever this fixed/changed, I no longer get corrupted memory in the
tuned data segment hung off each communicator...! I'm still testing to
see if I get Tim P's error.

G

On Sat, 14 Jan 2006 bosi...@osl.iu.edu wrote:


Author: bosilca
Date: 2006-01-14 15:21:44 -0500 (Sat, 14 Jan 2006)
New Revision: 8692

Modified:
  trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
  trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.h
  trunk/ompi/mca/btl/tcp/btl_tcp_frag.c
  trunk/ompi/mca/btl/tcp/btl_tcp_frag.h
Log:
A better implementation for the TCP endpoint cache + few comments.


Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
===================================================================
--- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c   2006-01-14 20:19:01 UTC (rev 8691)
+++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c   2006-01-14 20:21:44 UTC (rev 8692)
@@ -79,7 +79,7 @@
endpoint->endpoint_nbo = false;
#if MCA_BTL_TCP_ENDPOINT_CACHE
endpoint->endpoint_cache= NULL;
-endpoint->endpoint_cache_pos= 0;
+endpoint->endpoint_cache_pos= NULL;
endpoint->endpoint_cache_length = 0;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
OBJ_CONSTRUCT(&endpoint->endpoint_frags, opal_list_t);
@@ -187,21 +187,20 @@
static inline void mca_btl_tcp_endpoint_event_init(mca_btl_base_endpoint_t* btl_endpoint, int sd)
{
#if MCA_BTL_TCP_ENDPOINT_CACHE
-btl_endpoint->endpoint_cache = (char*)malloc(mca_btl_tcp_component.tcp_endpoint_cache);
+btl_endpoint->endpoint_cache = (char*)malloc(mca_btl_tcp_component.tcp_endpoint_cache);
+btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

-opal_event_set(
-&btl_endpoint->endpoint_recv_event,
-btl_endpoint->endpoint_sd,
-OPAL_EV_READ|OPAL_EV_PERSIST,
-mca_btl_tcp_endpoint_recv_handler,
-btl_endpoint);
-opal_event_set(
-&btl_endpoint->endpoint_send_event,
-btl_endpoint->endpoint_sd,
-OPAL_EV_WRITE|OPAL_EV_PERSIST,
-mca_btl_tcp_endpoint_send_handler,
-btl_endpoint);
+opal_event_set( &btl_endpoint->endpoint_recv_event,
+   btl_endpoint->endpoint_sd,
+   OPAL_EV_READ|OPAL_EV_PERSIST,
+   mca_btl_tcp_endpoint_recv_handler,
+   btl_endpoint );
+opal_event_set( &btl_endpoint->endpoint_send_event,
+   btl_endpoint->endpoint_sd,
+   OPAL_EV_WRITE|OPAL_EV_PERSIST,
+   mca_btl_tcp_endpoint_send_handler,
+   btl_endpoint);
}


@@ -357,7 +356,9 @@
btl_endpoint->endpoint_sd = -1;
#if MCA_BTL_TCP_ENDPOINT_CACHE
free( btl_endpoint->endpoint_cache );
-btl_endpoint->endpoint_cache = NULL;
+btl_endpoint->endpoint_cache= NULL;
+btl_endpoint->endpoint_cache_pos= NULL;
+btl_endpoint->endpoint_cache_length = 0;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
}
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
@@ -619,13 +620,12 @@
}

#if MCA_BTL_TCP_ENDPOINT_CACHE
-btl_endpoint->endpoint_cache_pos = 0;
+assert( 0 == btl_endpoint->endpoint_cache_length );
data_still_pending_on_endpoint:
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
/* check for completion of non-blocking recv on the current fragment */
if(mca_btl_tcp_frag_recv(frag, sd) == false) {
btl_endpoint->endpoint_recv_frag = frag;
-OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
} else {
btl_endpoint->endpoint_recv_frag = NULL;
switch(frag->hdr.type) {
@@ -636,39 +636,37 @@
break;
}
default:
-{
-break;
-}
+break;
}
#if MCA_BTL_TCP_ENDPOINT_CACHE
if( 0 != btl_endpoint->endpoint_cache_length ) {
+   /* If the cache still contain some data we can reuse the same fragment
+* until we flush it completly.
+*/
MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
goto data_still_pending_on_endpoint;
}
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
MCA_BTL_TCP_FRAG_RETURN_MAX(frag);
-OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
}
-break;
-}
-case MCA_BTL_TCP_SHUTDOWN:
-{
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
+assert( 0 == btl_endpoint->endpoint_cache_length );
break;
}
+case MCA_BTL_TCP_SHUTDOWN:
+OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
+break;
default:
-{
-OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
-BTL_ERROR(("invalid socket state(%d)", btl_endpoint->e
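
For readers not familiar with this corner of the TCP BTL, the sketch below is illustrative only: the struct and helper names are hypothetical and do not match the actual OMPI source. It shows the general endpoint-cache pattern this patch refines -- read a large chunk from the socket once, then hand bytes out of the cache to successive fragments, tracking the drain point with a pointer into the buffer (as endpoint_cache_pos now is) rather than recomputing base plus offset.

/*
 * Illustrative sketch of an endpoint-cache receive path; names are
 * hypothetical, not the OMPI implementation.
 */
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

struct endpoint_cache {
    char   *cache;         /* start of the cache buffer              */
    char   *cache_pos;     /* next undelivered byte inside the cache */
    size_t  cache_length;  /* bytes still buffered and undelivered   */
    size_t  cache_size;    /* total allocated size of the cache      */
    int     sd;            /* connected socket descriptor            */
};

/* Copy up to 'len' bytes into 'dst', refilling the cache from the
 * socket only when it is empty.  Returns the number of bytes
 * delivered, 0 on EOF, or -1 on error. */
static ssize_t cache_recv(struct endpoint_cache *ep, void *dst, size_t len)
{
    if (0 == ep->cache_length) {              /* cache drained: refill it   */
        ssize_t rc = read(ep->sd, ep->cache, ep->cache_size);
        if (rc <= 0) return rc;
        ep->cache_pos    = ep->cache;         /* rewind to the buffer start */
        ep->cache_length = (size_t)rc;
    }
    if (len > ep->cache_length) len = ep->cache_length;
    memcpy(dst, ep->cache_pos, len);
    ep->cache_pos    += len;                  /* advance the drain pointer  */
    ep->cache_length -= len;
    return (ssize_t)len;
}

Keeping the position as a pointer lets each drain step feed it straight to memcpy, which seems consistent with what the hunks above do, though that reading of the motivation is a guess.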

Re: [O-MPI devel] Intel tests

2006-01-14 Thread Tim Prins
Graham,

with the trunk at r8695 I no longer get these errors.

Tim

Quoting Graham E Fagg :

> Hi Tim
>   I can get an error but not quite the same as yours. In my case I
> get a
> segfault as someone corrupts the memory attached to a communicator
> (data
> segment). Looks like a possible inplace error. Expect a fix shortly.
>
> G.
>
> On Tue, 10 Jan 2006, Tim Prins wrote:
>
> > Graham,
> >
> > It works properly if I select the basic coll component. Anyways,
> here is
> > the output you requested. The full output is about 140MB, so I
> killed it
> > before it  finished...
> >
> > Tim
> >
> > Quoting Graham E Fagg :
> >
> >> Hi Tim
> >>   nope, can you rerun with  mpirun -np 4 -mca coll_base_verbose 1
> >> 
> >> and email me the output?
> >> Thanks
> >> G
> >> On Tue, 10 Jan 2006, Tim Prins wrote:
> >>
> >>> Hi everyone,
> >>>
> >>> I have been playing around with Open-MPI, using it as a test bed
> >> for
> >>> another project I am working on, and have found that on the
> intel
> >> test
> >>> suite, ompi is failing the MPI_Allreduce_user_c,
> >>> MPI_Reduce_scatter_user_c, and MPI_Reduce_user_c tests (it
> prints
> >>> something like  MPITEST error (2): i=0, int value=4, expected 1,
> >> etc).
> >>> Are these known errors?
> >>>
> >>> BTW, this is on a x86_64 linux box running 4 processes locally,
> >> running
> >>> the trunk svn version 8667, with no additional mca parameters
> set.
> >>>
> >>> Thanks,
> >>>
> >>> TIm
> >>> ___
> >>> devel mailing list
> >>> de...@open-mpi.org
> >>> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> >>>
> >>
> >>
> >> Thanks,
> >>Graham.
> >>
> --
> >> Dr Graham E. Fagg   | Distributed, Parallel and
> Meta-Computing
> >> Innovative Computing Lab. PVM3.4, HARNESS, FT-MPI, SNIPE & Open
> MPI
> >> Computer Science Dept   | Suite 203, 1122 Volunteer Blvd,
> >> University of Tennessee | Knoxville, Tennessee, USA. TN
> 37996-3450
> >> Email: f...@cs.utk.edu  | Phone:+1(865)974-5790 |
> >> Fax:+1(865)974-8296
> >> Broken complex systems are always derived from working simple
> >> systems
> >>
> --
> >> ___
> >> devel mailing list
> >> de...@open-mpi.org
> >> http://www.open-mpi.org/mailman/listinfo.cgi/devel
> >>
> >
> >
> >
>
>
> Thanks,
>   Graham.
> --
> Dr Graham E. Fagg   | Distributed, Parallel and Meta-Computing
> Innovative Computing Lab. PVM3.4, HARNESS, FT-MPI, SNIPE & Open MPI
> Computer Science Dept   | Suite 203, 1122 Volunteer Blvd,
> University of Tennessee | Knoxville, Tennessee, USA. TN 37996-3450
> Email: f...@cs.utk.edu  | Phone:+1(865)974-5790 |
> Fax:+1(865)974-8296
> Broken complex systems are always derived from working simple
> systems
> --
>




Re: [O-MPI devel] Intel tests

2006-01-14 Thread David Daniel

Hi Graham,

On Jan 14, 2006, at 2:07 PM, Graham E Fagg wrote:

Hi all,
  whatever this fixed/changed, I no longer get corrupted memory in the
tuned data segment hung off each communicator...! I'm still testing to
see if I get Tim P's error.
G

On Sat, 14 Jan 2006 bosi...@osl.iu.edu wrote:


Author: bosilca
Date: 2006-01-14 15:21:44 -0500 (Sat, 14 Jan 2006)
New Revision: 8692

Modified:
  trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
  trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.h
  trunk/ompi/mca/btl/tcp/btl_tcp_frag.c
  trunk/ompi/mca/btl/tcp/btl_tcp_frag.h
Log:
A better implementation for the TCP endpoint cache + few comments.



On a 64-bit bproc/myrinet system I'm seeing Tim P's problem with the
current head of the trunk. See attached output.


David




$ ompi_info | head
Open MPI: 1.1a1svn01142006
   Open MPI SVN revision: svn01142006
Open RTE: 1.1a1svn01142006
   Open RTE SVN revision: svn01142006
OPAL: 1.1a1svn01142006
   OPAL SVN revision: svn01142006
  Prefix: /scratch/modules/opt/openmpi-trunk-nofortran-bproc64
Configured architecture: x86_64-unknown-linux-gnu
   Configured by: ddd
   Configured on: Sat Jan 14 17:22:16 MST 2006

$ make MPIRUN='mpirun -mca coll basic' MPI_Allreduce_user_c
(cd src ; make MPI_Allreduce_user_c)
make[1]: Entering directory `/home/ddd/intel_tests/src'
mpicc -g -Isrc   -c -o libmpitest.o libmpitest.c
mpicc -g -Isrc  -o MPI_Allreduce_user_c MPI_Allreduce_user_c.c libmpitest.o -lm
make[1]: Leaving directory `/home/ddd/intel_tests/src'
mpirun -mca coll basic -n 4 --  `pwd`/src/MPI_Allreduce_user_c
MPITEST info  (0): Starting MPI_Allreduce_user() test
MPITEST_results: MPI_Allreduce_user() all tests PASSED (7076)

$ make MPIRUN='mpirun' MPI_Allreduce_user_c
(cd src ; make MPI_Allreduce_user_c)
make[1]: Entering directory `/home/ddd/intel_tests/src'
make[1]: `MPI_Allreduce_user_c' is up to date.
make[1]: Leaving directory `/home/ddd/intel_tests/src'
mpirun -n 4 --  `pwd`/src/MPI_Allreduce_user_c
MPITEST info  (0): Starting MPI_Allreduce_user() test
MPITEST error (0): i=0, int value=4, expected 1
MPITEST error (0): i=1, int value=4, expected 1
MPITEST error (0): i=2, int value=4, expected 1
MPITEST error (0): i=3, int value=4, expected 1

...
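
For anyone who wants to poke at this without the Intel suite, here is a stripped-down test in the same spirit as MPI_Allreduce_user_c. It is hypothetical code, not the suite's actual source: the operator, buffer length, and check are made up for illustration. Every rank contributes 1 and reduces with a user-defined sum, so each element of the result should equal the communicator size; a wrong value prints in roughly the format seen above.

#include <mpi.h>
#include <stdio.h>

#define LEN 4

/* User-defined reduction: element-wise integer sum. */
static void my_sum(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
    int i;
    (void)dtype;                       /* this toy test only uses MPI_INT */
    for (i = 0; i < *len; ++i)
        ((int *)inout)[i] += ((int *)in)[i];
}

int main(int argc, char **argv)
{
    int i, size, sendbuf[LEN], recvbuf[LEN];
    MPI_Op op;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    for (i = 0; i < LEN; ++i) sendbuf[i] = 1;

    MPI_Op_create(my_sum, 1 /* commutative */, &op);
    MPI_Allreduce(sendbuf, recvbuf, LEN, MPI_INT, op, MPI_COMM_WORLD);

    for (i = 0; i < LEN; ++i)
        if (recvbuf[i] != size)
            printf("error: i=%d, int value=%d, expected %d\n",
                   i, recvbuf[i], size);

    MPI_Op_free(&op);
    MPI_Finalize();
    return 0;
}

Running it with and without "-mca coll basic", as in the transcript above, should show whether the tuned component's user-op path is the piece that misbehaves.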