It seems that argv[argc] should always be NULL according to the C
standard, so the Open MPI failure is not actually a bug! Since
opal_argv_join is not told the array length, it presumably walks argv
looking for that NULL, and a hand-built vector without the terminator
sends it past the end of the array.
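
For the record, a corrected version of the test program quoted below
might look like the following. This is an untested sketch; the only
substantive change is allocating one extra slot and terminating the
vector with NULL, as the standard prescribes for the argv that main()
receives.

#include "mpi.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main(int argc, char **argv)
{
    // Allocate argc + 3 slots: the original arguments, the two new
    // ones, and a final slot for the NULL sentinel (the C standard
    // guarantees argv[argc] == NULL, and opal_argv_join relies on it).
    char **argv_new = new char*[ argc + 3 ];
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc   ] = new char[ 32 ];
    argv_new[ argc+1 ] = new char[ 32 ];
    strcpy( argv_new[argc],   "-device" );
    sprintf( argv_new[argc+1], "%d", 0 );
    argv_new[ argc+2 ] = NULL;   // the terminator that was missing

    argc += 2;
    argv = argv_new;

    MPI_Init( &argc, &argv );

    // do something...

    MPI_Finalize();

    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
    delete [] argv;
}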

Cheers,

2013/11/12 Matthieu Brucher <matthieu.bruc...@gmail.com>:
> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
> without the array length, so I suppose that in the usual argc/argv
> couple there is an additional entry at the end of argv which may be
> NULL. So try allocating 3 additional values, the last being NULL,
> and it may work.
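>
> For context, opal_argv_join presumably walks the vector until it
> hits a NULL pointer, something along these lines (an illustrative
> sketch, not the actual Open MPI source):
>
> #include <cstddef>
> #include <cstdlib>
> #include <cstring>
>
> // Join a NULL-terminated argument vector with single spaces. With
> // no length parameter, the NULL sentinel is the only thing that
> // stops the loops -- hence the crash when it is missing.
> char *argv_join( char **argv )
> {
>     size_t len = 0;
>     for( char **p = argv ; *p != NULL ; p++ ) len += strlen( *p ) + 1;
>     char *str = (char *) malloc( len + 1 );
>     str[0] = '\0';
>     for( char **p = argv ; *p != NULL ; p++ )
>     {
>         strcat( str, *p );
>         if ( *(p+1) != NULL ) strcat( str, " " );
>     }
>     return str;
> }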
>
> Cheers,
>
> Matthieu
>
> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>> I tried the following code without CUDA, and the error is still there:
>>
>> #include "mpi.h"
>>
>> #include <cstdlib>
>> #include <cstring>
>> #include <cmath>
>>
>> int main(int argc, char **argv)
>> {
>>     // override command line arguments to make sure cudaengine gets the correct one
>>     char **argv_new = new char*[ argc + 2 ];
>>     for( int i = 0 ; i < argc ; i++ )
>>     {
>>         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>         strcpy( argv_new[i], argv[i] );
>>     }
>>     argv_new[ argc   ] = new char[ 32 ];
>>     argv_new[ argc+1 ] = new char[ 32 ];
>>     strcpy( argv_new[argc],   "-device" );
>>     sprintf( argv_new[argc+1], "%d", 0 );
>>
>>     argc += 2;
>>     argv = argv_new;
>>
>>     MPI_Init(&argc,&argv);
>>
>>     // do something...
>>
>>     MPI_Finalize();
>>
>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>     delete [] argv;
>> }
>>
>> At the end of the program the pointer stored in argv is exactly that of
>> argv_new, so this should not be a problem. Manually inserting printf calls
>> tells me that the fault occurred in MPI_Init. The code works fine if I use
>> MPI_Init(NULL,NULL) instead. The same code also compiles and runs without a
>> problem on my laptop with mpich2-1.4.
>>
>> Best,
>> Yu-Hang
>>
>>
>>
>> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
>> <matthieu.bruc...@gmail.com> wrote:
>>>
>>> Hi,
>>>
>>> Are you sure this is the correct code? This seems strange and not a good
>>> idea:
>>>
>>>    MPI_Init(&argc,&argv);
>>>
>>>     // do something...
>>>
>>>     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>     delete [] argv;
>>>
>>> Did you mean argc_new and argv_new instead?
>>> Do you have the same error without CUDA?
>>>
>>> Cheers,
>>>
>>> Matthieu
>>>
>>>
>>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>> > Hi,
>>> >
>>> > I tried to augment the command line argument list by allocating my
>>> > own list of strings and passing them to MPI_Init, yet I got a
>>> > segmentation fault for both Open MPI 1.6.3 and 1.7.2, while the code
>>> > works fine with MPICH2. The code is:
>>> >
>>> > #include "mpi.h"
>>> > #include "cuda_runtime.h"
>>> > #include <cstdlib>
>>> > #include <cstring>
>>> > #include <cmath>
>>> >
>>> > int main(int argc, char **argv)
>>> > {
>>> >     int device = 0;
>>> >     int skip = 0;
>>> >     bool skipmode = false;
>>> >     bool specified = false;
>>> >     for( int i = 0 ; i < argc ; i++ )
>>> >     {
>>> >         if ( strcmp( argv[i], "-device" ) == 0 )
>>> >         {
>>> >             i++;
>>> >             if ( argv[i][0] == '-' )
>>> >             {
>>> >                 skipmode = true;
>>> >                 skip = fabs( atoi( argv[i] ) );
>>> >             }
>>> >             else
>>> >             {
>>> >                 skipmode = false;
>>> >                 device = atoi( argv[i] );
>>> >             }
>>> >             specified = true;
>>> >         }
>>> >     }
>>> >
>>> >     if ( !specified || skipmode )
>>> >     {
>>> >         char* var;
>>> >         int dev_count, local_rank = 0;
>>> >         if ( (var = getenv("SLURM_LOCALID")) != NULL) local_rank =
>>> > atoi(var);
>>> >         else if( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK"))  != NULL)
>>> > local_rank = atoi(var);
>>> >         else if( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL)
>>> > local_rank = atoi(var);
>>> >         cudaGetDeviceCount( &dev_count );
>>> >         if ( skipmode )
>>> >         {
>>> >             device = 0;
>>> >             if ( device == skip ) local_rank++;
>>> >             while( local_rank-- > 0 )
>>> >             {
>>> >                 device = (++device) % dev_count;
>>> >                 if ( device == skip ) local_rank++;
>>> >             }
>>> >         }
>>> >         else device = local_rank % dev_count;
>>> >     }
>>> >
>>> >     // override command line arguments to make sure cudaengine gets the correct one
>>> >     char **argv_new = new char*[ argc + 2 ];
>>> >     for( int i = 0 ; i < argc ; i++ )
>>> >     {
>>> >         argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>> >         strcpy( argv_new[i], argv[i] );
>>> >     }
>>> >     argv_new[ argc   ] = new char[ 32 ];
>>> >     argv_new[ argc+1 ] = new char[ 32 ];
>>> >     strcpy( argv_new[argc],   "-device" );
>>> >     sprintf( argv_new[argc+1], "%d", device );
>>> >     argc += 2;
>>> >     argv = argv_new;
>>> >
>>> >     cudaSetDevice( device );
>>> >
>>> >     MPI_Init(&argc,&argv);
>>> >
>>> >     // do something...
>>> >
>>> >     MPI_Finalize();
>>> >
>>> >     cudaDeviceReset();
>>> >     for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>> >     delete [] argv;
>>> > }
>>> >
>>> > When compiled using nvcc -ccbin mpic++, the error I got was:
>>> >
>>> > [jueying:16317] *** Process received signal ***
>>> > [jueying:16317] Signal: Segmentation fault (11)
>>> > [jueying:16317] Signal code: Address not mapped (1)
>>> > [jueying:16317] Failing at address: 0x21
>>> > [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
>>> > [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
>>> > [jueying:16317] [ 2]
>>> > /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39)
>>> > [0x7f460b993079]
>>> > [jueying:16317] [ 3]
>>> > /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347)
>>> > [0x7f460c106a57]
>>> > [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b)
>>> > [0x7f460c12523b]
>>> > [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
>>> > [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5)
>>> > [0x39e5621a05]
>>> > [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
>>> > [jueying:16317] *** End of error message ***
>>> >
>>> > Thanks for the help.
>>> >
>>> > Best regards,
>>> > Yu-Hang Tang
>>> >
>>>
>>>
>>>
>>
>>
>>
>>
>> --
>> Yu-Hang Tang
>> Room 105, 37 Manning St
>> Division of Applied Mathematics, Brown University
>> Providence, RI 02912
>>
>
>
>



-- 
Information System Engineer, Ph.D.
Blog: http://matt.eifelle.com
LinkedIn: http://www.linkedin.com/in/matthieubrucher
Music band: http://liliejay.com/
