Re: [O-MPI devel] [OMPI svn] svn:open-mpi r8692
Hi all,
 whatever this fixed/changed, I no longer get corrupted memory in the
tuned data segment hung off each communicator... ! I'm still testing to
see if I get TimP's error.

G

On Sat, 14 Jan 2006 bosi...@osl.iu.edu wrote:

Author: bosilca
Date: 2006-01-14 15:21:44 -0500 (Sat, 14 Jan 2006)
New Revision: 8692

Modified:
   trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
   trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.h
   trunk/ompi/mca/btl/tcp/btl_tcp_frag.c
   trunk/ompi/mca/btl/tcp/btl_tcp_frag.h
Log:
A better implementation for the TCP endpoint cache + few comments.

Modified: trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
===================================================================
--- trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c	2006-01-14 20:19:01 UTC (rev 8691)
+++ trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c	2006-01-14 20:21:44 UTC (rev 8692)
@@ -79,7 +79,7 @@
     endpoint->endpoint_nbo = false;
 #if MCA_BTL_TCP_ENDPOINT_CACHE
     endpoint->endpoint_cache        = NULL;
-    endpoint->endpoint_cache_pos    = 0;
+    endpoint->endpoint_cache_pos    = NULL;
     endpoint->endpoint_cache_length = 0;
 #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
     OBJ_CONSTRUCT(&endpoint->endpoint_frags, opal_list_t);
@@ -187,21 +187,20 @@
 static inline void mca_btl_tcp_endpoint_event_init(mca_btl_base_endpoint_t* btl_endpoint, int sd)
 {
 #if MCA_BTL_TCP_ENDPOINT_CACHE
-    btl_endpoint->endpoint_cache = (char*)malloc(mca_btl_tcp_component.tcp_endpoint_cache);
+    btl_endpoint->endpoint_cache     = (char*)malloc(mca_btl_tcp_component.tcp_endpoint_cache);
+    btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache;
 #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
-    opal_event_set(
-        &btl_endpoint->endpoint_recv_event,
-        btl_endpoint->endpoint_sd,
-        OPAL_EV_READ|OPAL_EV_PERSIST,
-        mca_btl_tcp_endpoint_recv_handler,
-        btl_endpoint);
-    opal_event_set(
-        &btl_endpoint->endpoint_send_event,
-        btl_endpoint->endpoint_sd,
-        OPAL_EV_WRITE|OPAL_EV_PERSIST,
-        mca_btl_tcp_endpoint_send_handler,
-        btl_endpoint);
+    opal_event_set( &btl_endpoint->endpoint_recv_event,
+                    btl_endpoint->endpoint_sd,
+                    OPAL_EV_READ|OPAL_EV_PERSIST,
+                    mca_btl_tcp_endpoint_recv_handler,
+                    btl_endpoint );
+    opal_event_set( &btl_endpoint->endpoint_send_event,
+                    btl_endpoint->endpoint_sd,
+                    OPAL_EV_WRITE|OPAL_EV_PERSIST,
+                    mca_btl_tcp_endpoint_send_handler,
+                    btl_endpoint);
 }
@@ -357,7 +356,9 @@
         btl_endpoint->endpoint_sd = -1;
 #if MCA_BTL_TCP_ENDPOINT_CACHE
         free( btl_endpoint->endpoint_cache );
-        btl_endpoint->endpoint_cache = NULL;
+        btl_endpoint->endpoint_cache        = NULL;
+        btl_endpoint->endpoint_cache_pos    = NULL;
+        btl_endpoint->endpoint_cache_length = 0;
 #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
     }
     btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
@@ -619,13 +620,12 @@
     }
 #if MCA_BTL_TCP_ENDPOINT_CACHE
-    btl_endpoint->endpoint_cache_pos = 0;
+    assert( 0 == btl_endpoint->endpoint_cache_length );
   data_still_pending_on_endpoint:
 #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
     /* check for completion of non-blocking recv on the current fragment */
     if(mca_btl_tcp_frag_recv(frag, sd) == false) {
         btl_endpoint->endpoint_recv_frag = frag;
-        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
     } else {
         btl_endpoint->endpoint_recv_frag = NULL;
         switch(frag->hdr.type) {
@@ -636,39 +636,37 @@
                 break;
             }
             default:
-            {
-                break;
-            }
+                break;
             }
 #if MCA_BTL_TCP_ENDPOINT_CACHE
             if( 0 != btl_endpoint->endpoint_cache_length ) {
+                /* If the cache still contain some data we can reuse the same fragment
+                 * until we flush it completly.
+                 */
                 MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
                 goto data_still_pending_on_endpoint;
             }
 #endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
             MCA_BTL_TCP_FRAG_RETURN_MAX(frag);
-            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
         }
-        break;
-    }
-    case MCA_BTL_TCP_SHUTDOWN:
-    {
         OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
+        assert( 0 == btl_endpoint->endpoint_cache_length );
         break;
     }
+    case MCA_BTL_TCP_SHUTDOWN:
+        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
+        break;
     default:
-    {
-        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
-        BTL_ERROR(("invalid socket state(%d)", btl_endpoint->e
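The core of the change, for anyone skimming the diff: endpoint_cache_pos
used to be an integer offset reset to 0, and is now a char* into the
cache buffer, initialized to the start of the buffer and cleared
together with it, while endpoint_cache_length tracks the bytes still
unread. A minimal standalone sketch of that pointer-based cache pattern
(illustrative names and logic only, not the actual btl_tcp code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical miniature of the endpoint cache: a staging buffer for
 * bytes read off the socket, drained fragment by fragment. */
struct endpoint_cache {
    char  *cache;         /* malloc'ed staging buffer                */
    char  *cache_pos;     /* next unread byte; NULL when unallocated */
    size_t cache_length;  /* bytes still unread                      */
};

static int cache_init(struct endpoint_cache *c, size_t size)
{
    c->cache = (char *)malloc(size);
    if (NULL == c->cache)
        return -1;
    c->cache_pos    = c->cache;  /* a pointer, not an integer offset */
    c->cache_length = 0;
    return 0;
}

/* Copy up to max unread bytes into dst and advance the position;
 * rewind to the start of the buffer once fully drained. */
static size_t cache_drain(struct endpoint_cache *c, char *dst, size_t max)
{
    size_t n = (c->cache_length < max) ? c->cache_length : max;
    memcpy(dst, c->cache_pos, n);
    c->cache_pos    += n;
    c->cache_length -= n;
    if (0 == c->cache_length)
        c->cache_pos = c->cache;
    return n;
}

static void cache_fini(struct endpoint_cache *c)
{
    free(c->cache);
    c->cache        = NULL;   /* mirror the cleanup in the diff above */
    c->cache_pos    = NULL;
    c->cache_length = 0;
}

int main(void)
{
    struct endpoint_cache c;
    char out[8];

    if (0 != cache_init(&c, 64))
        return 1;
    memcpy(c.cache, "abcdefgh", 8);   /* pretend a recv() filled the cache */
    c.cache_length = 8;

    size_t n = cache_drain(&c, out, 5);             /* reads "abcde" */
    n += cache_drain(&c, out + n, sizeof(out) - n); /* drains "fgh" and rewinds */
    printf("drained %zu bytes\n", n);

    cache_fini(&c);
    return 0;
}

Keeping the position as a pointer avoids base+offset arithmetic on
every read and makes the drained-cache invariant (the
assert( 0 == btl_endpoint->endpoint_cache_length ) added above)
straightforward to maintain.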
Re: [O-MPI devel] Intel tests
Graham,

With the trunk r8695 I no longer get these errors.

Tim

Quoting Graham E Fagg:

> Hi Tim
>  I can get an error but not quite the same as yours. In my case I get a
> segfault as someone corrupts the memory attached to a communicator (data
> segment). Looks like a possible in-place error. Expect a fix shortly.
>
> G.
>
> On Tue, 10 Jan 2006, Tim Prins wrote:
>
> > Graham,
> >
> > It works properly if I select the basic coll component. Anyways, here is
> > the output you requested. The full output is about 140 MB, so I killed it
> > before it finished...
> >
> > Tim
> >
> > Quoting Graham E Fagg:
> >
> >> Hi Tim
> >>  nope, can you rerun with mpirun -np 4 -mca coll_base_verbose 1
> >> and email me the output?
> >> Thanks
> >> G
> >> On Tue, 10 Jan 2006, Tim Prins wrote:
> >>
> >>> Hi everyone,
> >>>
> >>> I have been playing around with Open MPI, using it as a test bed for
> >>> another project I am working on, and have found that on the Intel test
> >>> suite, OMPI is failing the MPI_Allreduce_user_c,
> >>> MPI_Reduce_scatter_user_c, and MPI_Reduce_user_c tests (it prints
> >>> something like "MPITEST error (2): i=0, int value=4, expected 1", etc.).
> >>> Are these known errors?
> >>>
> >>> BTW, this is on an x86_64 Linux box running 4 processes locally, running
> >>> the trunk svn version 8667, with no additional mca parameters set.
> >>>
> >>> Thanks,
> >>>
> >>> Tim
>
> Thanks,
>         Graham.
> ----------------------------------------------------------------------
> Dr Graham E. Fagg       | Distributed, Parallel and Meta-Computing
> Innovative Computing Lab. PVM3.4, HARNESS, FT-MPI, SNIPE & Open MPI
> Computer Science Dept   | Suite 203, 1122 Volunteer Blvd,
> University of Tennessee | Knoxville, Tennessee, USA. TN 37996-3450
> Email: f...@cs.utk.edu  | Phone:+1(865)974-5790 | Fax:+1(865)974-8296
> Broken complex systems are always derived from working simple systems
> ----------------------------------------------------------------------
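For context, the three failing tests all drive reductions through
user-defined MPI operations, which is the code path the tuned component
apparently mishandled. A minimal reproducer in the same spirit
(a hypothetical sketch, not the Intel suite source; with each rank
contributing 1, a correct sum equals the number of ranks):

#include <mpi.h>
#include <stdio.h>

/* User-defined reduction function with the signature MPI_Op_create
 * expects: element-wise integer sum. */
static void user_sum(void *in, void *inout, int *len, MPI_Datatype *dtype)
{
    int *a = (int *)in;
    int *b = (int *)inout;
    (void)dtype;  /* only MPI_INT is used here */
    for (int i = 0; i < *len; i++)
        b[i] += a[i];
}

int main(int argc, char **argv)
{
    int rank, size, val = 1, result = 0;
    MPI_Op op;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* The second argument (1) declares the operation commutative. */
    MPI_Op_create(user_sum, 1, &op);
    MPI_Allreduce(&val, &result, 1, MPI_INT, op, MPI_COMM_WORLD);

    if (result != size)
        printf("rank %d: got %d, expected %d\n", rank, result, size);

    MPI_Op_free(&op);
    MPI_Finalize();
    return 0;
}

Run with mpirun -np 4 for the same configuration as the reports above;
adding -mca coll basic gives the second data point, as Tim did.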
Re: [O-MPI devel] Intel tests
Hi Graham,

On Jan 14, 2006, at 2:07 PM, Graham E Fagg wrote:

> Hi all,
>  whatever this fixed/changed, I no longer get corrupted memory in the
> tuned data segment hung off each communicator... ! I'm still testing
> to see if I get TimP's error.
>
> G
>
> On Sat, 14 Jan 2006 bosi...@osl.iu.edu wrote:
>
>> Author: bosilca
>> Date: 2006-01-14 15:21:44 -0500 (Sat, 14 Jan 2006)
>> New Revision: 8692
>>
>> Modified:
>>    trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.c
>>    trunk/ompi/mca/btl/tcp/btl_tcp_endpoint.h
>>    trunk/ompi/mca/btl/tcp/btl_tcp_frag.c
>>    trunk/ompi/mca/btl/tcp/btl_tcp_frag.h
>> Log:
>> A better implementation for the TCP endpoint cache + few comments.

On a 64-bit bproc/Myrinet system I'm seeing Tim P's problem with the
current head of the trunk. See attached output.

David

$ ompi_info | head
                Open MPI: 1.1a1svn01142006
   Open MPI SVN revision: svn01142006
                Open RTE: 1.1a1svn01142006
   Open RTE SVN revision: svn01142006
                    OPAL: 1.1a1svn01142006
       OPAL SVN revision: svn01142006
                  Prefix: /scratch/modules/opt/openmpi-trunk-nofortran-bproc64
 Configured architecture: x86_64-unknown-linux-gnu
           Configured by: ddd
           Configured on: Sat Jan 14 17:22:16 MST 2006

$ make MPIRUN='mpirun -mca coll basic' MPI_Allreduce_user_c
(cd src ; make MPI_Allreduce_user_c)
make[1]: Entering directory `/home/ddd/intel_tests/src'
mpicc -g -Isrc -c -o libmpitest.o libmpitest.c
mpicc -g -Isrc -o MPI_Allreduce_user_c MPI_Allreduce_user_c.c libmpitest.o -lm
make[1]: Leaving directory `/home/ddd/intel_tests/src'
mpirun -mca coll basic -n 4 -- `pwd`/src/MPI_Allreduce_user_c
MPITEST info (0): Starting MPI_Allreduce_user() test
MPITEST_results: MPI_Allreduce_user() all tests PASSED (7076)

$ make MPIRUN='mpirun' MPI_Allreduce_user_c
(cd src ; make MPI_Allreduce_user_c)
make[1]: Entering directory `/home/ddd/intel_tests/src'
make[1]: `MPI_Allreduce_user_c' is up to date.
make[1]: Leaving directory `/home/ddd/intel_tests/src'
mpirun -n 4 -- `pwd`/src/MPI_Allreduce_user_c
MPITEST info (0): Starting MPI_Allreduce_user() test
MPITEST error (0): i=0, int value=4, expected 1
MPITEST error (0): i=1, int value=4, expected 1
MPITEST error (0): i=2, int value=4, expected 1
MPITEST error (0): i=3, int value=4, expected 1
...
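Given Graham's in-place suspicion earlier in the thread, one quick way
to narrow down this class of bug is to compare the out-of-place and
MPI_IN_PLACE paths of MPI_Allreduce side by side. A hedged sketch (not
from the thread; built-in MPI_SUM used for simplicity):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Out-of-place: separate send and receive buffers. */
    int in = 1, out = 0;
    MPI_Allreduce(&in, &out, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* In-place: the receive buffer doubles as the send buffer. */
    int buf = 1;
    MPI_Allreduce(MPI_IN_PLACE, &buf, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* Both results should equal the communicator size; a mismatch in
     * only one of them points at that specific path in the coll
     * component. */
    if (out != size || buf != size)
        printf("rank %d: out-of-place=%d in-place=%d expected=%d\n",
               rank, out, buf, size);

    MPI_Finalize();
    return 0;
}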