Interestingly enough, in ompi_mpi_init, opal_argv_join is called without the array length, so I suppose it relies on the usual argc/argv convention of one extra trailing entry that is NULL (the C standard in fact guarantees argv[argc] == NULL). So try allocating 3 additional values, the last being NULL, and it may work.
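A minimal sketch of what I mean, adapted from your snippet below (untested; "-device 0" is just the pair of arguments your code appends):

// Allocate argc + 3 pointers: the argc originals, the two new
// arguments, and a terminating NULL. opal_argv_join has no length
// parameter, so it walks the array until it hits NULL; without the
// terminator it reads past the end of the allocation, which would
// match your segfault inside MPI_Init.
char **argv_new = new char*[ argc + 3 ];
for( int i = 0 ; i < argc ; i++ )
{
    argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
    strcpy( argv_new[i], argv[i] );
}
argv_new[ argc ]     = new char[ 32 ];
argv_new[ argc + 1 ] = new char[ 32 ];
strcpy( argv_new[ argc ], "-device" );
sprintf( argv_new[ argc + 1 ], "%d", 0 );
argv_new[ argc + 2 ] = NULL;   // the terminator a normal argv carries

argc += 2;
argv = argv_new;

MPI_Init( &argc, &argv );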
Cheers,

Matthieu

2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
> I tried the following code without CUDA, the error is still there:
>
> #include "mpi.h"
>
> #include <cstdlib>
> #include <cstring>
> #include <cmath>
>
> int main(int argc, char **argv)
> {
>     // override command line arguments to make sure cudaengine gets the
>     // correct one
>     char **argv_new = new char*[ argc + 2 ];
>     for( int i = 0 ; i < argc ; i++ )
>     {
>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>         strcpy( argv_new[i], argv[i] );
>     }
>     argv_new[ argc ] = new char[ 32 ];
>     argv_new[ argc+1 ] = new char[ 32 ];
>     strcpy( argv_new[argc], "-device" );
>     sprintf( argv_new[argc+1], "%d", 0 );
>
>     argc += 2;
>     argv = argv_new;
>
>     MPI_Init(&argc,&argv);
>
>     // do something...
>
>     MPI_Finalize();
>
>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>     delete [] argv;
> }
>
> At the end of the program the pointer stored in argv is exactly that of
> argv_new, so this should not be a problem. Manually inserting printf tells
> me that the fault occurred at MPI_Init. The code works fine if I use
> MPI_Init(NULL,NULL) instead. The same code also compiles and runs without
> a problem on my laptop with mpich2-1.4.
>
> Best,
> Yu-Hang
>
>
>
> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
> <matthieu.bruc...@gmail.com> wrote:
>>
>> Hi,
>>
>> Are you sure this is the correct code? This seems strange and not a good
>> idea:
>>
>> MPI_Init(&argc,&argv);
>>
>> // do something...
>>
>> for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>> delete [] argv;
>>
>> Did you mean argc_new and argv_new instead?
>> Do you have the same error without CUDA?
>>
>> Cheers,
>>
>> Matthieu
>>
>>
>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>> > Hi,
>> >
>> > I tried to augment the command line argument list by allocating my own
>> > list of strings and passing them to MPI_Init, yet I got a segmentation
>> > fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with
>> > MPICH2.
>> > The code is:
>> >
>> > #include "mpi.h"
>> > #include "cuda_runtime.h"
>> > #include <cstdlib>
>> > #include <cstring>
>> > #include <cmath>
>> >
>> > int main(int argc, char **argv)
>> > {
>> >     int device = 0;
>> >     int skip = 0;
>> >     bool skipmode = false;
>> >     bool specified = false;
>> >     for( int i = 0 ; i < argc ; i++ )
>> >     {
>> >         if ( strcmp( argv[i], "-device" ) == 0 )
>> >         {
>> >             i++;
>> >             if ( argv[i][0] == '-' )
>> >             {
>> >                 skipmode = true;
>> >                 skip = fabs( atoi( argv[i] ) );
>> >             }
>> >             else
>> >             {
>> >                 skipmode = false;
>> >                 device = atoi( argv[i] );
>> >             }
>> >             specified = true;
>> >         }
>> >     }
>> >
>> >     if ( !specified || skipmode )
>> >     {
>> >         char* var;
>> >         int dev_count, local_rank = 0;
>> >         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank = atoi(var);
>> >         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>> >         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) local_rank = atoi(var);
>> >         cudaGetDeviceCount( &dev_count );
>> >         if ( skipmode )
>> >         {
>> >             device = 0;
>> >             if ( device == skip ) local_rank++;
>> >             while( local_rank-- > 0 )
>> >             {
>> >                 device = (++device) % dev_count;
>> >                 if ( device == skip ) local_rank++;
>> >             }
>> >         }
>> >         else device = local_rank % dev_count;
>> >     }
>> >
>> >     // override command line arguments to make sure cudaengine gets the
>> >     // correct one
>> >     char **argv_new = new char*[ argc + 2 ];
>> >     for( int i = 0 ; i < argc ; i++ )
>> >     {
>> >         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>> >         strcpy( argv_new[i], argv[i] );
>> >     }
>> >     argv_new[ argc ] = new char[ 32 ];
>> >     argv_new[ argc+1 ] = new char[ 32 ];
>> >     strcpy( argv_new[argc], "-device" );
>> >     sprintf( argv_new[argc+1], "%d", device );
>> >     argc += 2;
>> >     argv = argv_new;
>> >
>> >     cudaSetDevice( device );
>> >
>> >     MPI_Init(&argc,&argv);
>> >
>> >     // do something...
>> >
>> >     MPI_Finalize();
>> >
>> >     cudaDeviceReset();
>> >     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>> >     delete [] argv;
>> > }
>> >
>> > When compiled using nvcc -ccbin mpic++, the error I got was:
>> >
>> > [jueying:16317] *** Process received signal ***
>> > [jueying:16317] Signal: Segmentation fault (11)
>> > [jueying:16317] Signal code: Address not mapped (1)
>> > [jueying:16317] Failing at address: 0x21
>> > [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
>> > [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
>> > [jueying:16317] [ 2] /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39) [0x7f460b993079]
>> > [jueying:16317] [ 3] /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347) [0x7f460c106a57]
>> > [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b) [0x7f460c12523b]
>> > [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
>> > [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5) [0x39e5621a05]
>> > [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
>> > [jueying:16317] *** End of error message ***
>> >
>> > Thanks for the help.
>> >
>> > Best regards,
>> > Yu-Hang Tang
>>
>>
>>
>> --
>> Information System Engineer, Ph.D.
>> Blog: http://matt.eifelle.com
>> LinkedIn: http://www.linkedin.com/in/matthieubrucher
>> Music band: http://liliejay.com/
>
>
>
> --
> Yu-Hang Tang
> Room 105, 37 Manning St
> Division of Applied Mathematics, Brown University
> Providence, RI 02912

--
Information System Engineer, Ph.D.
Blog: http://matt.eifelle.com
LinkedIn: http://www.linkedin.com/in/matthieubrucher
Music band: http://liliejay.com/