Hello, I seem to have encountered a bug in MPI-IO, in which MPI_File_get_position_shared hangs when called by multiple processes in a communicator. It can be illustrated by the following simple test case, in which a file is created with C standard I/O and then opened with MPI-IO (defining or undefining MY_MPI_IO_BUG on line 5 enables or disables the bug). From the MPI-2 documentation, it seems that all processes should be able to call MPI_File_get_position_shared, but if more than one process calls it, it fails. Setting the shared pointer beforehand helps, but this should not be necessary, and the code still hangs (in more complete code, after writing data).
I encounter the same problem with Open MPI 1.2.6 and MPICH2 1.0.7, so I may have misread the documentation, but I suspect a ROMIO bug. Best regards, Yvan Fournier
/*============================================================================ * Parallel file I/O shared pointer bug test *============================================================================*/ #define MY_MPI_IO_BUG 1 /*---------------------------------------------------------------------------- * Standard C library headers *----------------------------------------------------------------------------*/ #include <assert.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <mpi.h> /*----------------------------------------------------------------------------*/ #ifdef __cplusplus extern "C" { #if 0 } /* Fake brace to force Emacs auto-indentation back to column 0 */ #endif #endif /* __cplusplus */ /*============================================================================ * Private function definitions *============================================================================*/ /*---------------------------------------------------------------------------- * Output MPI error message. * * This supposes that the default MPI errorhandler is not used * * parameters: * error_code <-- associated MPI error code * * returns: * 0 in case of success, system error code in case of failure *----------------------------------------------------------------------------*/ static void _mpi_io_error_message(int error_code) { char buffer[MPI_MAX_ERROR_STRING]; int buffer_len; MPI_Error_string(error_code, buffer, &buffer_len); printf("MPI IO error %d: %s", error_code, buffer); } /*---------------------------------------------------------------------------- * Return the position of the file pointer. * * When using MPI-IO with individual file pointers, we consider the file * pointer to be equal to the highest value of then individual file pointers. 
* * parameters: * fh <-- MPI IO file descriptor * * returns: * current position of the file pointer *----------------------------------------------------------------------------*/ MPI_Offset _mpi_file_tell(MPI_File fh) { int errcode = MPI_SUCCESS; MPI_Offset offset = 0, disp = 0, retval = 0; int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); #if defined(MY_MPI_IO_BUG) printf("rank %d: will call MPI_File_get_position_shared\n", rank); errcode = MPI_File_get_position_shared(fh, &offset); if (errcode == MPI_SUCCESS) { MPI_File_get_byte_offset(fh, offset, &disp); retval = disp; } printf("rank %d: offsets: %ld %ld\n", rank, (long)offset, (long)disp); #else long aux[2]; if (rank == 0) { printf("root rank will call MPI_File_get_position_shared\n"); errcode = MPI_File_get_position_shared(fh, &offset); if (errcode == MPI_SUCCESS) { MPI_File_get_byte_offset(fh, offset, &disp); retval = disp; } aux[0] = disp; aux[1] = retval; } MPI_Bcast(aux, 2, MPI_LONG, 0, MPI_COMM_WORLD); disp = aux[0]; retval = aux[1]; printf("rank %d: offsets: %ld %ld\n", rank, (long)offset, (long)disp); #endif if (errcode != MPI_SUCCESS) _mpi_io_error_message(errcode); return retval; } /*============================================================================ * Unit test *============================================================================*/ static void _create_test_data(void) { int i; FILE *f; char header[80]; char footer[80]; sprintf(header, "fvm test file"); for (i = strlen(header); i < 80; i++) header[i] = '\0'; sprintf(footer, "fvm test file end"); for (i = strlen(footer); i < 80; i++) footer[i] = '\0'; f = fopen("file_test_data", "w+"); fwrite(header, 1, 80, f); fwrite(footer, 1, 80, f); fclose(f); } /*---------------------------------------------------------------------------*/ int main (int argc, char *argv[]) { int rank = 0; int retval = MPI_SUCCESS; MPI_Offset offset; MPI_File fh = MPI_FILE_NULL; /* Initialization */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); if 
(rank == 0) _create_test_data(); /* Open file */ retval = MPI_File_open(MPI_COMM_WORLD, "file_test_data", MPI_MODE_RDONLY, MPI_INFO_NULL, &fh); if (retval != MPI_SUCCESS) _mpi_io_error_message(retval); #if 0 /* Hides the bug, but does not make it dissapear in more complex cases (i.e. when MPI_File_read_shared is used locally and MPI_File_read_ordered is used collectively) */ retval = MPI_File_seek_shared(fh, 1, MPI_SEEK_SET); if (retval != MPI_SUCCESS) _mpi_io_error_message(retval); #endif offset = _mpi_file_tell(fh); /* Close file */ retval = MPI_File_close(&fh); if (retval != MPI_SUCCESS) _mpi_io_error_message(retval); /* We are finished */ MPI_Finalize(); exit(EXIT_SUCCESS); }