Hello,
It seems that argv[argc] should always be NULL according to the
standard. So OMPI failure is not actually a bug!
Could you please point to the exact document where this is explicitly
stated?
Otherwise, I'd assume this is a bug.
Kind regards,
Alex Granovsky
-----Original Message-----
From: Matthieu Brucher
Sent: Tuesday, November 12, 2013 8:56 PM
To: Open MPI Users
Subject: Re: [OMPI users] Segmentation fault in MPI_Init when passing
pointers allocated in main()
It seems that argv[argc] should always be NULL according to the
standard. So OMPI failure is not actually a bug!
Cheers,
2013/11/12 Matthieu Brucher <matthieu.bruc...@gmail.com>:
Interestingly enough, in ompi_mpi_init, opal_argv_join is called
without the array length, so I suppose that in the usual argc/argv
couple, you have an additional value to argv which may be NULL. So try
allocating 3 additional values, the last being NULL, and it may work.
Cheers,
Matthieu
2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
I tried the following code without CUDA, the error is still there:
#include "mpi.h"
#include <cstdlib>
#include <cstring>
#include <cmath>
int main(int argc, char **argv)
{
// override command line arguments to make sure cudaengine get the
correct one
char **argv_new = new char*[ argc + 2 ];
for( int i = 0 ; i < argc ; i++ )
{
argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
strcpy( argv_new[i], argv[i] );
}
argv_new[ argc ] = new char[ 32 ];
argv_new[ argc+1 ] = new char[ 32 ];
strcpy( argv_new[argc], "-device" );
sprintf( argv_new[argc+1], "%d", 0 );
argc += 2;
argv = argv_new;
MPI_Init(&argc,&argv);
// do something...
MPI_Finalize();
for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
delete [] argv;
}
At the end of the program the pointer stored in argv is exactly that of
argv_new so this should not be a problem. Manually inserting printf tells
me
that the fault occurred at MPI_Init. The code works fine if I use
MPI_Init(NULL,NULL) instead. The same code also compiles and runs without
a
problem on my laptop with mpich2-1.4.
Best,
Yu-Hang
On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
<matthieu.bruc...@gmail.com> wrote:
Hi,
Are you sure this is the correct code? This seems strange and not a good
idea:
MPI_Init(&argc,&argv);
// do something...
for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
delete [] argv;
Did you mean argc_new and argv_new instead?
Do you have the same error without CUDA?
Cheers,
Matthieu
2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
> Hi,
>
> I tried to augment the command line argument list by allocating my own
> list
> of strings and passing them to MPI_Init, yet I got a segmentation
> fault
> for
> both OpenMPI 1.6.3 and 1.7.2, while the code works fine with MPICH2.
> The
> code is:
>
> #include "mpi.h"
> #include "cuda_runtime.h"
> #include <cstdlib>
> #include <cstring>
> #include <cmath>
>
> int main(int argc, char **argv)
> {
> int device = 0;
> int skip = 0;
> bool skipmode = false;
> bool specified = false;
> for( int i = 0 ; i < argc ; i++ )
> {
> if ( strcmp( argv[i], "-device" ) == 0 )
> {
> i++;
> if ( argv[i][0] == '-' )
> {
> skipmode = true;
> skip = fabs( atoi( argv[i] ) );
> }
> else
> {
> skipmode = false;
> device = atoi( argv[i] );
> }
> specified = true;
> }
> }
>
> if ( !specified || skipmode )
> {
> char* var;
> int dev_count, local_rank = 0;
> if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank =
> atoi(var);
> else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL)
> local_rank = atoi(var);
> else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL)
> local_rank = atoi(var);
> cudaGetDeviceCount( &dev_count );
> if ( skipmode )
> {
> device = 0;
> if ( device == skip ) local_rank++;
> while( local_rank-- > 0 )
> {
> device = (++device) % dev_count;
> if ( device == skip ) local_rank++;
> }
> }
> else device = local_rank % dev_count;
> }
>
> // override command line arguments to make sure cudaengine get the
> correct one
> char **argv_new = new char*[ argc + 2 ];
> for( int i = 0 ; i < argc ; i++ )
> {
> argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
> strcpy( argv_new[i], argv[i] );
> }
> argv_new[ argc ] = new char[ 32 ];
> argv_new[ argc+1 ] = new char[ 32 ];
> strcpy( argv_new[argc], "-device" );
> sprintf( argv_new[argc+1], "%d", device );
> argc += 2;
> argv = argv_new;
>
> cudaSetDevice( device );
>
> MPI_Init(&argc,&argv);
>
> // do something...
>
> MPI_Finalize();
>
> cudaDeviceReset();
> for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
> delete [] argv;
> }
>
> When compiled using nvcc -ccbin mpic++, The error I got was:
>
> [jueying:16317] *** Process received signal ***
> [jueying:16317] Signal: Segmentation fault (11)
> [jueying:16317] Signal code: Address not mapped (1)
> [jueying:16317] Failing at address: 0x21
> [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
> [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
> [jueying:16317] [ 2]
> /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39)
> [0x7f460b993079]
> [jueying:16317] [ 3]
> /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347)
> [0x7f460c106a57]
> [jueying:16317] [ 4]
> /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b)
> [0x7f460c12523b]
> [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
> [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5)
> [0x39e5621a05]
> [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
> [jueying:16317] *** End of error message ***
>
> Thanks for the help.
>
> Best regards,
> Yu-Hang Tang
>
> _______________________________________________
> users mailing list
> us...@open-mpi.org
> http://www.open-mpi.org/mailman/listinfo.cgi/users
--
Information System Engineer, Ph.D.
Blog: http://matt.eifelle.com
LinkedIn: http://www.linkedin.com/in/matthieubrucher
Music band: http://liliejay.com/
_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users
--
Yu-Hang Tang
Room 105, 37 Manning St
Division of Applied Mathematics, Brown University
Providence, RI 02912
_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users
--
Information System Engineer, Ph.D.
Blog: http://matt.eifelle.com
LinkedIn: http://www.linkedin.com/in/matthieubrucher
Music band: http://liliejay.com/
--
Information System Engineer, Ph.D.
Blog: http://matt.eifelle.com
LinkedIn: http://www.linkedin.com/in/matthieubrucher
Music band: http://liliejay.com/
_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users