Hi again,
I have attached a very small example which raises the assertion.
The problem is arising from a process which does not have any element to
write in the file (and then in the MPI_File_set_view)...
You can see this "bug" with openmpi 1.6.3, 1.6.4 and 1.7.0 configured with:
./configure --enable-mem-debug --enable-mem-profile --enable-memchecker
--with-mpi-param-check --enable-debug
Just compile the given example (idx_null.cc) as-is with
mpicxx -o idx_null idx_null.cc
and run with 3 processes:
mpirun -n 3 idx_null
You can modify the example by commenting out "#define WITH_ZERO_ELEMNT_BUG"
to see that everything is going well when all processes have something
to write.
There is no "bug" if you use openmpi 1.6.3 (and higher) without the
debugging options.
Also, all is working well with mpich-3.0.3 configured with:
./configure --enable-g=yes
So, is this a wrong "assert" in openmpi?
Is there a real problem to use this code in a "release" mode?
Thanks,
Eric
On 04/05/2013 12:57 PM, Eric Chamberland wrote:
Hi all,
I have a well working (large) code which is using openmpi 1.6.3 (see
config.log here:
http://www.giref.ulaval.ca/~ericc/bug_openmpi/config.log_nodebug)
(I have used it for reading with MPI I/O with success over 1500 procs
with very large files)
However, when I use openmpi compiled with "debug" options:
./configure --enable-mem-debug --enable-mem-profile --enable-memchecker
--with-mpi-param-check --enable-debug --prefix=/opt/openmpi-1.6.3_debug
(see the other config.log here:
http://www.giref.ulaval.ca/~ericc/bug_openmpi/config.log_debug) the code
is aborting with an assertion on a very small example on 2 processors.
(the same very small example is working well without the debug mode)
Here is the assertion causing an abort:
===================================
openmpi-1.6.3/opal/datatype/opal_datatype.h:
static inline int32_t
opal_datatype_is_contiguous_memory_layout( const opal_datatype_t*
datatype, int32_t count )
{
if( !(datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) return 0;
if( (count == 1) || (datatype->flags & OPAL_DATATYPE_FLAG_NO_GAPS)
) return 1;
/* This is the assertion: */
assert( (OPAL_PTRDIFF_TYPE)datatype->size != (datatype->ub -
datatype->lb) );
return 0;
}
===================================
Can anyone tell me what this means?
It happens while writing a file with MPI I/O when I am calling for the
fourth time a "MPI_File_set_view"... with different types of
MPI_Datatype created with "MPI_Type_indexed".
I am trying to reproduce the bug with a very small example to be sent
here, but if anyone has a hint to give me...
(I would like: this assert is not good! just ignore it ;-) )
Thanks,
Eric
_______________________________________________
users mailing list
us...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/users
#include "mpi.h"
#include <cstdio>
#include <cstdlib>
using namespace std;
void abortOnError(int ierr) {
if (ierr != MPI_SUCCESS) {
printf("ERROR Returned by MPI: %d\n",ierr);
char* lCharPtr = new char[MPI_MAX_ERROR_STRING];
int lLongueur = 0;
MPI_Error_string(ierr,lCharPtr, &lLongueur);
printf("ERROR_string Returned by MPI: %s\n",lCharPtr);
MPI_Abort( MPI_COMM_WORLD, 1 );
}
}
// Reproducer switch for the zero-element case.
// When WITH_ZERO_ELEMNT_BUG is defined, rank 2 holds no data (and rank 1 takes
// a third element instead), so main() calls MPI_File_set_view with an empty
// indexed datatype on one process. Comment the #define out to give every rank
// at least one element to write.
#define WITH_ZERO_ELEMNT_BUG
int main(int argc, char *argv[])
{
int rank, size, i;
MPI_Datatype lTypeIndexIntWithExtent, lTypeIndexIntWithoutExtent;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (size != 3)
{
printf("Please run with 3 processes.\n");
MPI_Finalize();
return 1;
}
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int displacement[3];
int* buffer = 0;
int lTailleBuf = 0;
if (rank == 0)
{
lTailleBuf = 3;
displacement[0] = 0;
displacement[1] = 4;
displacement[2] = 5;
buffer = new int[lTailleBuf];
for (i=0; i<lTailleBuf; i++) buffer[i] = 10*(i+1);
}
if (rank == 1)
{
lTailleBuf = 2;
displacement[0] = 1;
displacement[1] = 2;
#ifdef WITH_ZERO_ELEMNT_BUG
displacement[2] = 3;
++lTailleBuf;
#endif
buffer = new int[lTailleBuf];
for (i=0; i<lTailleBuf; i++) buffer[i] = -(i+1);
}
// BUG: A rank without any "element"
if (rank == 2)
{
#ifdef WITH_ZERO_ELEMNT_BUG
lTailleBuf = 0;
#else
displacement[0] = 0;
lTailleBuf = 1;
buffer = new int[lTailleBuf];
for (i=0; i<lTailleBuf; i++) buffer[i] = 1000*(i+1);
#endif
}
MPI_File lFile;
abortOnError(MPI_File_open( MPI_COMM_WORLD, const_cast<char*>("temp"),
MPI_MODE_RDWR | MPI_MODE_CREATE, MPI_INFO_NULL, &lFile ));
MPI_Type_create_indexed_block(lTailleBuf, 1, displacement, MPI_INT,
&lTypeIndexIntWithoutExtent);
MPI_Type_commit(&lTypeIndexIntWithoutExtent);
// Here we compute the total number of int to write to resize the type:
// Ici, on veut s'échanger le nb total de int à écrire à chaque appel pcqu'on
doit calculer le bon "extent" du type.
// Ça revient à dire que chaque processus ne n'écrira qu'une petite partie du
fichier, mais devra avancer son pointeur
// local d'écriture suffisamment loin pour ne pas écrire par dessus les
données des autres
int lTailleGlobale = 0;
printf("[%d] Local size : %d \n",rank,lTailleBuf);
MPI_Allreduce( &lTailleBuf, &lTailleGlobale, 1, MPI_INT, MPI_SUM,
MPI_COMM_WORLD );
printf("[%d] MPI_AllReduce : %d \n",rank,lTailleGlobale);
//We now modify the extent of the type "type_without_extent"
MPI_Type_create_resized( lTypeIndexIntWithoutExtent, 0,
lTailleGlobale*sizeof(int), &lTypeIndexIntWithExtent );
MPI_Type_commit(&lTypeIndexIntWithExtent);
abortOnError(MPI_File_set_view( lFile, 0, MPI_INT, lTypeIndexIntWithExtent,
const_cast<char*>("native"), MPI_INFO_NULL));
for (int i =0; i<2;++i) {
abortOnError(MPI_File_write_all( lFile, buffer, lTailleBuf, MPI_INT,
MPI_STATUS_IGNORE));
MPI_Offset lOffset,lSharedOffset;
MPI_File_get_position(lFile, &lOffset);
MPI_File_get_position_shared(lFile, &lSharedOffset);
printf("[%d] Offset after write : %d int: Local: %ld Shared: %ld
\n",rank,lTailleBuf,lOffset,lSharedOffset);
}
abortOnError(MPI_File_close( &lFile ));
abortOnError(MPI_Type_free(&lTypeIndexIntWithExtent));
abortOnError(MPI_Type_free(&lTypeIndexIntWithoutExtent));
MPI_Finalize();
return 0;
}