On Nov 12, 2013, at 8:56 AM, Matthieu Brucher <matthieu.bruc...@gmail.com> 
wrote:

> It seems that argv[argc] should always be NULL according to the
> standard.

That is definitely true - the C standard (5.1.2.2.1) guarantees that 
argv[argc] is a null pointer.

> So the OMPI failure is not actually a bug!

I think that is true as well, though I suppose we could try to catch it 
(doubtful: what if it isn't NULL but garbage? After all, you would be reading 
past the end of the array).
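
For what it's worth, the failure mode makes sense: a routine like 
opal_argv_join that is handed argv without a length has no choice but to walk 
until it hits the NULL sentinel. A minimal sketch of that kind of loop 
(hypothetical - this is not the actual Open MPI source, and count_args is a 
made-up name):

// hypothetical NULL-sentinel walk, NOT the real opal_argv_join;
// without a terminating NULL this loop reads past the end of the array
int count_args( char **argv )
{
    int n = 0;
    while ( argv[n] != NULL ) n++;
    return n;
}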

Something else is also wrong here. You are never allowed to release the argv 
entries - those belong to the system - so the last loop in your program is 
wrong. You also create a new argv array (argv_new) but then point argv at it, 
which clobbers the system array again. On top of that, you changed the system 
value of argc instead of keeping your own variable.
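
Something like this untested sketch (with the device number hard-wired to 0, 
as in the test program below) would address all three points: it keeps the 
system argc/argv untouched, adds the NULL terminator that Open MPI walks for, 
and frees only what it allocated:

#include "mpi.h"
#include <cstdio>
#include <cstring>

int main( int argc, char **argv )
{
    // build a private, NULL-terminated argument array; leave argc/argv alone
    int argc_new = argc + 2;
    char **argv_new = new char*[ argc_new + 1 ];   // +1 for the NULL sentinel
    for( int i = 0 ; i < argc ; i++ )
    {
        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
        strcpy( argv_new[i], argv[i] );
    }
    argv_new[ argc   ] = new char[ 32 ];
    argv_new[ argc+1 ] = new char[ 32 ];
    strcpy( argv_new[ argc ], "-device" );
    sprintf( argv_new[ argc+1 ], "%d", 0 );
    argv_new[ argc_new ] = NULL;                   // the terminator OMPI walks for

    MPI_Init( &argc_new, &argv_new );

    // do something...

    MPI_Finalize();

    // free only what we allocated, never the system argv
    // (assumes MPI_Init left the array intact; it is allowed to modify it)
    for( int i = 0 ; argv_new[i] != NULL ; i++ ) delete [] argv_new[i];
    delete [] argv_new;
    return 0;
}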


> 
> Cheers,
> 
> 2013/11/12 Matthieu Brucher <matthieu.bruc...@gmail.com>:
>> Interestingly enough, in ompi_mpi_init, opal_argv_join is called
>> without the array length, so I suppose that in the usual argc/argv
>> pair, argv carries one additional trailing entry which must be NULL. So
>> try allocating 3 additional entries, the last being NULL, and it may work.
>> 
>> Cheers,
>> 
>> Matthieu
>> 
>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>> I tried the following code without CUDA, and the error is still there:
>>> 
>>> #include "mpi.h"
>>> 
>>> #include <cstdlib>
>>> #include <cstring>
>>> #include <cmath>
>>> 
>>> int main(int argc, char **argv)
>>> {
>>>    // override command line arguments to make sure cudaengine gets the correct one
>>>    char **argv_new = new char*[ argc + 2 ];
>>>    for( int i = 0 ; i < argc ; i++ )
>>>    {
>>>        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>>        strcpy( argv_new[i], argv[i] );
>>>    }
>>>    argv_new[ argc   ] = new char[ 32 ];
>>>    argv_new[ argc+1 ] = new char[ 32 ];
>>>    strcpy( argv_new[argc],   "-device" );
>>>    sprintf( argv_new[argc+1], "%d", 0 );
>>> 
>>>    argc += 2;
>>>    argv = argv_new;
>>> 
>>>    MPI_Init(&argc,&argv);
>>> 
>>>    // do something...
>>> 
>>>    MPI_Finalize();
>>> 
>>>    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>    delete [] argv;
>>> }
>>> 
>>> At the end of the program the pointer stored in argv is exactly that of
>>> argv_new, so this should not be a problem. Manually inserting printf tells me
>>> that the fault occurred at MPI_Init. The code works fine if I use
>>> MPI_Init(NULL,NULL) instead. The same code also compiles and runs without a
>>> problem on my laptop with mpich2-1.4.
>>> 
>>> Best,
>>> Yu-Hang
>>> 
>>> 
>>> 
>>> On Tue, Nov 12, 2013 at 11:18 AM, Matthieu Brucher
>>> <matthieu.bruc...@gmail.com> wrote:
>>>> 
>>>> Hi,
>>>> 
>>>> Are you sure this is the correct code? This seems strange and not a good
>>>> idea:
>>>> 
>>>>   MPI_Init(&argc,&argv);
>>>> 
>>>>    // do something...
>>>> 
>>>>    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>>    delete [] argv;
>>>> 
>>>> Did you mean argc_new and argv_new instead?
>>>> Do you have the same error without CUDA?
>>>> 
>>>> Cheers,
>>>> 
>>>> Matthieu
>>>> 
>>>> 
>>>> 2013/11/12 Tang, Yu-Hang <yuhang_t...@brown.edu>:
>>>>> Hi,
>>>>> 
>>>>> I tried to augment the command line argument list by allocating my own
>>>>> list of strings and passing them to MPI_Init, yet I got a segmentation
>>>>> fault for both OpenMPI 1.6.3 and 1.7.2, while the code works fine with
>>>>> MPICH2. The code is:
>>>>> 
>>>>> #include "mpi.h"
>>>>> #include "cuda_runtime.h"
>>>>> #include <cstdlib>
>>>>> #include <cstring>
>>>>> #include <cmath>
>>>>> 
>>>>> int main(int argc, char **argv)
>>>>> {
>>>>>    int device = 0;
>>>>>    int skip = 0;
>>>>>    bool skipmode = false;
>>>>>    bool specified = false;
>>>>>    for( int i = 0 ; i < argc ; i++ )
>>>>>    {
>>>>>        if ( strcmp( argv[i], "-device" ) == 0 )
>>>>>        {
>>>>>            i++;
>>>>>            if ( argv[i][0] == '-' )
>>>>>            {
>>>>>                skipmode = true;
>>>>>                skip = fabs( atoi( argv[i] ) );
>>>>>            }
>>>>>            else
>>>>>            {
>>>>>                skipmode = false;
>>>>>                device = atoi( argv[i] );
>>>>>            }
>>>>>            specified = true;
>>>>>        }
>>>>>    }
>>>>> 
>>>>>    if ( !specified || skipmode )
>>>>>    {
>>>>>        char* var;
>>>>>        int dev_count, local_rank = 0;
>>>>>        if      ( (var = getenv("SLURM_LOCALID"))             != NULL ) local_rank = atoi(var);
>>>>>        else if ( (var = getenv("MV2_COMM_WORLD_LOCAL_RANK"))  != NULL ) local_rank = atoi(var);
>>>>>        else if ( (var = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL ) local_rank = atoi(var);
>>>>>        cudaGetDeviceCount( &dev_count );
>>>>>        if ( skipmode )
>>>>>        {
>>>>>            device = 0;
>>>>>            if ( device == skip ) local_rank++;
>>>>>            while( local_rank-- > 0 )
>>>>>            {
>>>>>                device = ( device + 1 ) % dev_count;
>>>>>                if ( device == skip ) local_rank++;
>>>>>            }
>>>>>        }
>>>>>        else device = local_rank % dev_count;
>>>>>    }
>>>>> 
>>>>>    // override command line arguments to make sure cudaengine gets the correct one
>>>>>    char **argv_new = new char*[ argc + 2 ];
>>>>>    for( int i = 0 ; i < argc ; i++ )
>>>>>    {
>>>>>        argv_new[i] = new char[ strlen( argv[i] ) + 1 ];
>>>>>        strcpy( argv_new[i], argv[i] );
>>>>>    }
>>>>>    argv_new[ argc   ] = new char[ 32 ];
>>>>>    argv_new[ argc+1 ] = new char[ 32 ];
>>>>>    strcpy( argv_new[argc],   "-device" );
>>>>>    sprintf( argv_new[argc+1], "%d", device );
>>>>>    argc += 2;
>>>>>    argv = argv_new;
>>>>> 
>>>>>    cudaSetDevice( device );
>>>>> 
>>>>>    MPI_Init(&argc,&argv);
>>>>> 
>>>>>    // do something...
>>>>> 
>>>>>    MPI_Finalize();
>>>>> 
>>>>>    cudaDeviceReset();
>>>>>    for( int i = 0 ; i < argc ; i++ ) delete [] argv[i];
>>>>>    delete [] argv;
>>>>> }
>>>>> 
>>>>> When compiled using nvcc -ccbin mpic++, the error I got was:
>>>>> 
>>>>> [jueying:16317] *** Process received signal ***
>>>>> [jueying:16317] Signal: Segmentation fault (11)
>>>>> [jueying:16317] Signal code: Address not mapped (1)
>>>>> [jueying:16317] Failing at address: 0x21
>>>>> [jueying:16317] [ 0] /usr/lib64/libpthread.so.0() [0x39e5e0f000]
>>>>> [jueying:16317] [ 1] /usr/lib64/libc.so.6() [0x39e5760551]
>>>>> [jueying:16317] [ 2]
>>>>> /opt/openmpi/1.7.2/lib/libopen-pal.so.5(opal_argv_join+0x39)
>>>>> [0x7f460b993079]
>>>>> [jueying:16317] [ 3]
>>>>> /opt/openmpi/1.7.2/lib/libmpi.so.1(ompi_mpi_init+0x347)
>>>>> [0x7f460c106a57]
>>>>> [jueying:16317] [ 4] /opt/openmpi/1.7.2/lib/libmpi.so.1(MPI_Init+0x16b)
>>>>> [0x7f460c12523b]
>>>>> [jueying:16317] [ 5] ./lmp_jueying() [0x40c035]
>>>>> [jueying:16317] [ 6] /usr/lib64/libc.so.6(__libc_start_main+0xf5)
>>>>> [0x39e5621a05]
>>>>> [jueying:16317] [ 7] ./lmp_jueying() [0x40dd21]
>>>>> [jueying:16317] *** End of error message ***
>>>>> 
>>>>> Thanks for the help.
>>>>> 
>>>>> Best regards,
>>>>> Yu-Hang Tang
>>>>> 
>>>> 
>>>> 
>>>> 
>>> 
>>> 
>>> 
>>> 
>>> --
>>> Yu-Hang Tang
>>> Room 105, 37 Manning St
>>> Division of Applied Mathematics, Brown University
>>> Providence, RI 02912
>>> 
>> 
>> 
>> 
> 
> 
> 
> -- 
> Information System Engineer, Ph.D.
> Blog: http://matt.eifelle.com
> LinkedIn: http://www.linkedin.com/in/matthieubrucher
> Music band: http://liliejay.com/
