Dear forum members,

This may be too specialized a problem, but perhaps somebody still has
some insights.

Our code (running on an IBM BlueGene/Q machine) reads in some data
using HDF5. This is done collectively on every core: each rank reads
the same data at the same time. It is not known a priori which
processor owns which part of the data; the ranks have to compute this
themselves and discard the data they don't own. The data file is ~9.4 MB
in a simple test case. Each element is a custom data type: a nested
struct with two 32-bit integers and two 64-bit doubles that form a
complex number, 192 bits (24 bytes) in total.
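
For reference, the element type boils down to the following; a minimal
sketch (the static_assert is C++11 and only illustrates that, with the
usual alignment rules, there is no padding):

typedef struct { double re, im; } complex_t;           /* 2 x 8 bytes */
typedef struct { int jx, jy; complex_t val; } surf_t;  /* 2 x 4 + 16 = 24 bytes */

/* 192 bits = 24 bytes per element; at ~9.4 MB the test file holds
   on the order of 4e5 elements */
static_assert(sizeof(surf_t) == 24, "unexpected padding in surf_t");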

If I use fewer than 1024 cores, there is no problem. For >=1024 cores,
however, I get a crash with the error

"Out of memory in file
/bgsys/source/srcV1R2M3.12428/comm/lib/dev/mpich2/src/mpi/romio/adio/ad_bg/ad_bg_rdcoll.c,
line 1073"

We use parallel HDF5 1.8.15; I've also tried 1.8.14. Another library
dependency is FFTW 3.3.3, but that should not really matter.

I traced the crash with TotalView to the call to H5Dread(). The
second-to-last frame in the crash trace is MPIDO_Alltoallv, the last one
is PAMI_Context_trylock_advancev. I don't have exact call sites or line
numbers, since the HDF5 library was not compiled with debug symbols.
[The file mentioned in the error message is not accessible to me.]

Is this an HDF5 problem, or a problem with IBM's MPI implementation?
Might it be an MPI buffer overflow? Or is there perhaps a problem with
data contiguity in the struct?
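
In case it is a buffering issue, one experiment would be to tune ROMIO's
collective buffer via MPI_Info hints on the file-access property list.
A sketch; whether the BG/Q ROMIO honors cb_buffer_size here, and what a
sensible value would be, are assumptions on my part:

MPI_Info info;
MPI_Info_create(&info);
MPI_Info_set(info, "cb_buffer_size", "4194304"); /* 4 MiB, arbitrary value */
hid_t fapl = H5Pcreate(H5P_FILE_ACCESS);
H5Pset_fapl_mpio(fapl, localComm, info);
/* ... H5Fopen() with fapl ... */
MPI_Info_free(&info);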

The problem disappears if I read the file in chunks of less than
192 KiB at a time. A more workable workaround is to replace collective
I/O with independent I/O, which also makes the problem disappear:
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); -->
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT);
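
(This is the H5Pset_dxpl_mpio call on the dataset-transfer property
list in readH5SurfFour below; see the commented-out line there.)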

Since this data file is quite small (usually a few hundred megabytes at
most), reading the file independently is not a huge performance problem
at this stage, but for very large simulations it might be.

In other, older parts of the code, we are (successfully!) reading in up
to 256 GiB of data in predefined data types (double, float) using
H5FD_MPIO_COLLECTIVE without any problem, so I suspect the problem is
connected with the user-defined data type in some way.
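
For contrast, the working pattern in the older code is essentially the
following (condensed, with hypothetical identifiers dset_id, memspace,
filespace, and buf; the hyperslab selection is omitted):

hid_t xfer = H5Pcreate(H5P_DATASET_XFER);
H5Pset_dxpl_mpio(xfer, H5FD_MPIO_COLLECTIVE);
/* predefined datatype, so no conversion of a compound type is involved */
herr_t err = H5Dread(dset_id, H5T_NATIVE_DOUBLE, memspace, filespace, xfer, buf);
H5Pclose(xfer);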

I attach some condensed code with all calls to the HDF5 library. I'm not
sure anyone is in a position to actually reproduce this problem, so the
main() routine and the data file are probably unnecessary; I'd be happy
to send those as well if need be.

Thanks in advance for any hints.

Best regards,
Wolf

-- 


#include <hdf5.h>
#include <mpi.h>
#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <string>

#ifndef JUQUEEN
  #define JUQUEEN /* force the bglockless: prefix below in this condensed example */
#endif

/* defined elsewhere in our code */
extern MPI_Comm localComm;                 /* communicator used for I/O */
typedef struct { double re, im; } Complex; /* stand-in for our Complex type */

/* declarations */
bool DoesFileExist(const char *);
bool DoesDatasetExist(hid_t, const std::string& rDatasetName);
int readH5SurfFour( const char *name, Complex * data );

int readH5SurfFour( const char *name, Complex * data )
{ // each processor reads the same chunk of data, and discards what does not belong there
  if (!DoesFileExist(name)) return(0); // first collectively check whether the file exists

  // ===============================================================
  // =================== read input from HDF5 ======================
  // ===============================================================
  
  /* HDF5 APIs definitions */
  hid_t	file_id, dset_id;	/* file and dataset identifiers */
  hid_t	filespace, memspace;	/* file and memory dataspace identifiers */
  hsize_t	chunk_dims[1];	/* chunk dimensions */
  hsize_t	count[1];	/* hyperslab selection parameters */
  hsize_t	stride[1];
  hsize_t	block[1];
  hsize_t	offset[1];
  hid_t	plist_id;		/* property list identifier */
  herr_t	status_h5;
  MPI_Info info  = MPI_INFO_NULL;

  /* Create a new datatype for complex numbers. */  
  typedef struct {
    double re;   /*real part */
    double im;   /*imaginary part */
  } complex_t;
  typedef struct {
    int jx; /* x index */
    int jy; /* y index */
    complex_t val; /* complex value at index (jx,jy) */
  } surf_t;
  hid_t complex_id = H5Tcreate (H5T_COMPOUND, sizeof(complex_t));
  H5Tinsert (complex_id, "real", HOFFSET(complex_t,re), H5T_NATIVE_DOUBLE);
  H5Tinsert (complex_id, "imag", HOFFSET(complex_t,im), H5T_NATIVE_DOUBLE);
  hid_t s_id = H5Tcreate (H5T_COMPOUND, sizeof(surf_t));
  H5Tinsert (s_id, "jx", HOFFSET(surf_t,jx), H5T_NATIVE_INT);
  H5Tinsert (s_id, "jy", HOFFSET(surf_t,jy), H5T_NATIVE_INT);
  H5Tinsert (s_id, "val", HOFFSET(surf_t,val), complex_id);

  /* Open file_id and dataset using the default properties. */
  plist_id = H5Pcreate(H5P_FILE_ACCESS);
  H5Pset_fapl_mpio(plist_id, localComm, info);
  
  std::string filename;
#ifdef JUQUEEN
  filename.append("bglockless:");
#endif // JUQUEEN
  filename.append(name);
  file_id = H5Fopen (filename.c_str(), H5F_ACC_RDONLY, plist_id); // filename.c_str() is the file name
  H5Pclose(plist_id);

  std::string dataset = "values";
  /* check whether the dataset exists */
  if (DoesDatasetExist(file_id, dataset))
  { dset_id = H5Dopen (file_id, dataset.c_str(), H5P_DEFAULT); } // dataset.c_str() is the dataset name
  else 
  { H5Fclose (file_id);
    H5Tclose (complex_id);
    H5Tclose(s_id);
    return (0); } // return false if dataset can't be found, don't crash
  
  filespace = H5Dget_space (dset_id);
  
  hsize_t dim[1];
  H5Sget_simple_extent_dims(filespace, dim, NULL);
  
  // each proc should be able to handle 48 MiB of data at a time, even on JUQUEEN:
  // one element is (2x 64-bit double + 2x 32-bit int) = 192 bit = 24 bytes
  int maxSize = 1024*2048; // elements per chunk (48 MiB); the "out of memory" crash disappears for maxSize = 8192 (192 KiB)
  int nChunks = (dim[0] + maxSize - 1)/maxSize; // ceiling division: chunks of at most maxSize elements
  if (nChunks < 1) nChunks = 1; // read at least once, even for an empty dataset
  ptrdiff_t size;
  
  for (int jj = 0; jj < nChunks; ++jj)
  {
    // last (or first, if only 1) chunk might not be maxSize
    if (jj == (nChunks-1)) { size = (dim[0]-jj*maxSize); } else { size = maxSize; } 
    surf_t *values = (surf_t *) calloc(size,sizeof(surf_t));
    
    plist_id = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
    // H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_INDEPENDENT); // NOTE: using this fixes the problem    
    
    /* Define and select the hyperslab to use for reading. */
    chunk_dims[0] = size; 
    memspace  = H5Screate_simple(1, chunk_dims, NULL); 
    
    count[0] = 1;		// one block in x-direction
    stride[0] = 1;		// the blocks have no gaps between them in x-direction
    block[0] = chunk_dims[0];	// contiguous x-size of block is chunk_dims[0] = size
    offset[0] = jj*maxSize;	// x-index [in data file] of (0,0) entry [in memory] of block
    
    status_h5 = H5Sselect_hyperslab (filespace, H5S_SELECT_SET, offset, stride, count, block); assert(status_h5 >= 0);

    /* Read data using the previously defined hyperslab. */
    status_h5 = H5Dread (dset_id, s_id, memspace, filespace, plist_id, values); assert(status_h5 >= 0);
    
    /************ computations to see who gets which part of the data here *****************/
    
    /* Close and release resources. */
    status_h5 = H5Pclose (plist_id);  assert(status_h5 >= 0);
    status_h5 = H5Sclose (memspace);  assert(status_h5 >= 0);
    free(values);
  } // end for (int jj = 0; jj < nChunks; ++jj)
  status_h5 = H5Dclose (dset_id);   assert(status_h5 >= 0);
  status_h5 = H5Fclose (file_id);   assert(status_h5 >= 0);
  status_h5 = H5Sclose (filespace); assert(status_h5 >= 0);    
  status_h5 = H5Tclose(s_id);       assert(status_h5 >= 0);
  status_h5 = H5Tclose(complex_id); assert(status_h5 >= 0);
  return (1);
}

bool DoesFileExist(const char* rFileName)
{
  MPI_Info info  = MPI_INFO_NULL;
  bool result=false;
  /* Set up file access property list with parallel I/O access */
  hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
  H5Pset_fapl_mpio(plist_id, localComm, info);
  // This is not a nice way of doing it because the error stack produces a load of 'HDF failed' output.
  // The "TRY" macros are a convenient way to temporarily turn the error stack off.
  H5E_BEGIN_TRY
  {
    hid_t file_id = H5Fopen (rFileName, H5F_ACC_RDONLY, plist_id);
    if (file_id >= 0) { H5Fclose(file_id); result = true; } // HDF5 returns a negative id on failure
  }
  H5E_END_TRY;
  H5Pclose(plist_id);
  return result;
}

// from: http://stackoverflow.com/questions/1686869/searching-a-hdf5-dataset
bool DoesDatasetExist(hid_t mFileId, const std::string& rDatasetName)
{
#if H5_VERS_MAJOR > 1 || (H5_VERS_MAJOR == 1 && H5_VERS_MINOR >= 8)
  // This is a nice method for testing existence, introduced in HDF5 1.8.0
  htri_t dataset_status = H5Lexists(mFileId, rDatasetName.c_str(), H5P_DEFAULT);
  return (dataset_status>0);
#else
  bool result=false;
  // This is not a nice way of doing it because the error stack produces a load of 'HDF failed' output.
  // The "TRY" macros are a convenient way to temporarily turn the error stack off.
  H5E_BEGIN_TRY
  {
    hid_t dataset_id = H5Dopen(mFileId, rDatasetName.c_str());
    if (dataset_id >= 0) // HDF5 returns a negative id on failure
    {
      H5Dclose(dataset_id);
      result = true;
    }
  }
  H5E_END_TRY;
  return result;
#endif
}
