The 7th question is as follows:

(7) The result of a communication that uses a derived datatype is incorrect
after taking a checkpoint.

Framework         : crcp
Component         : bkmrk
The source file   : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : traffic_message_append

Framework         : datatype
The source file   : ompi/datatype/datatype.h
The function name : ompi_ddt_type_size

Here's the code that causes the problem:

/*
 * Test payload described by the derived datatype.  One-byte members are
 * interleaved with four-byte members; the memory map later in this report
 * shows 3 padding bytes after x and after y on the reporter's platform
 * (20 bytes of storage for 14 bytes of data).
 */
struct dd {
  char  x;   /* block 0, described as MPI_BYTE  */
  float a;   /* block 1, described as MPI_FLOAT */
  char  y;   /* block 2, described as MPI_BYTE  */
  float b;   /* block 3, described as MPI_FLOAT */
  int   c;   /* block 4, described as MPI_INT   */
};
struct dd buf,ans_dd_buf;

  if (rank == 0) {
    buf.x = (char)1;
    buf.a = (float)4329.1003;
    buf.y = (char)2;
    buf.b = (float)8474.73;
    buf.c = (int)48;
  }
  else {
    buf.x = (char)0;
    buf.a = (float)0;
    buf.y = (char)0;
    buf.b = (float)0;
    buf.c = (int)0;
  }
  ans_dd_buf.x = (char)1;
  ans_dd_buf.a = (float)4329.1003;
  ans_dd_buf.y = (char)2;
  ans_dd_buf.b = (float)8474.73;
  ans_dd_buf.c = (int)48;

  /* item number per a block */
  b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;
  /* datatype per a block */
  dt[0]  = dt[2] = MPI_BYTE;
  dt[1]  = dt[3] = MPI_FLOAT;
  dt[4]  = MPI_INT;
  /* disp per a block */
  dp[0] = 0;
  MPI_Address(&buf.x,&st);
  MPI_Address(&buf.a,&cr);
  dp[1] = (cr - st);
  MPI_Address(&buf.y,&cr);
  dp[2] = (cr - st);
  MPI_Address(&buf.b,&cr);
  dp[3] = (cr - st);
  MPI_Address(&buf.c,&cr);
  dp[4] = (cr - st);
  cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc = MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  MPI_Barrier(MPI_COMM_WORLD);
  printf("   rank=%d pass-1 x->x =%d[%d] x->a=%d[%d] x->y=%d[%d] x->b=%d[%d] 
x->c=%d[%d]\n"
    ,rank
    ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),dp[0]
    ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),dp[1]
    ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),dp[2]
    ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),dp[3]
    ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),dp[4]
  );
  fflush(stdout);

  if (rank == 0) {
    MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);  /** take checkpoint at this point **/
    printf(" rank=%d sleep end   \n",rank); fflush(stdout);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else { /* rank 1 */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);  /** take checkpoint at this point **/
    printf(" rank=%d sleep end   \n",rank); fflush(stdout);
    MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }  /* The error 
occurs at this point */
  if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

* Take a checkpoint while rank 0 and rank 1 are inside the sleep() call.

* The derived datatype is constructed from the structure dd.

* I think the memory layout of the derived datatype is as follows:

              1111111111
    01234567890123456789
    --------------------
    X###AAAAY###BBBBCCCC
    --------------------

### means padding (unused space).

* The ddt_size member (commented as /** Quick reference to the size of the
  datatype */) of the ompi_crcp_bkmrk_pml_traffic_message_ref_t structure is
  obtained by calling ompi_ddt_type_size in the traffic_message_append function.

    if( NULL != datatype ) {
        ompi_ddt_type_size(datatype,
                           &ddt_size);

* I think that the value returned in ddt_size is wrong.
  The obtained value is 14. (Does it mean that the total size in memory is 14 bytes?)

  struct dd {
    char  x;    -> a character is 1 byte.
    float a;    -> a float     is 4 bytes.
    char  y;    -> a character is 1 byte.
    float b;    -> a float     is 4 bytes.
    int   c;    -> an integer  is 4 bytes.
  };

* But I think the value returned in ddt_size should be 20 bytes, considering
  the memory layout shown above.

* Rank 1 receives only 14 bytes of the message in bkmrk,
  so the wrong result is obtained.

* t_mpi_question-7-ng.c : the error occurs.
  Here's my debugging output.

  ft_event_post_drain_message:Irecv drain_msg_ref=c89200 rank=0 tag=1000 cnt=1 
ddt=14 to=c929b0 [datatype->size=1]
  wait_quiesce_drained: x=1 a=142658605493679655240073216.000000 y=4 b=0.000000 
c=32
    /* Only 14 bytes of data are received, which is incorrect; the values are wrong. */
  drain_message_check_recv:datatype->size=1 14 count=1 1
  ompi_ddt_copy_content_same_ddt:Start size=14 flag=102/4 count=1
    /* DT_FLAG_CONTIGUOUS is false. */


* t_mpi_question-7-ok.c : the error does not occur.
  Here's my debugging output.

  ft_event_post_drain_message:Irecv drain_msg_ref=a51280 rank=0 tag=1000 cnt=1 
ddt=20 to=a5b6b0 [datatype->size=1]
  wait_quiesce_drained: x=1 a=4329.100098 y=2 b=8474.730469 c=48
    /* 20bytes data is received correctly. */
  drain_message_check_recv:datatype->size=1 20 count=1 1
  ompi_ddt_copy_content_same_ddt:Start size=20 flag=186/4 count=1
    /* DT_FLAG_CONTIGUOUS is true. */

* difference list

-bash-3.2$ diff -c t_mpi_question-7-ng.c t_mpi_question-7-ok.c
*** t_mpi_question-7-ng.c       Fri Feb 26 13:07:05 2010
--- t_mpi_question-7-ok.c       Fri Feb 26 13:20:25 2010
***************
*** 8,16 ****
  #define ITEMNUM 5

  struct dd {
!   char  x;
    float a;
!   char  y;
    float b;
    int   c;
  };
--- 8,16 ----
  #define ITEMNUM 5

  struct dd {
!   int   x;
    float a;
!   int   y;
    float b;
    int   c;
  };
***************
*** 31,52 ****
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    if (rank == 0) {
!     buf.x = (char)1;
      buf.a = (float)4329.1003;
!     buf.y = (char)2;
      buf.b = (float)8474.73;
      buf.c = (int)48;
    }
    else {
!     buf.x = (char)0;
      buf.a = (float)0;
!     buf.y = (char)0;
      buf.b = (float)0;
      buf.c = (int)0;
    }
!   ans_dd_buf.x = (char)1;
    ans_dd_buf.a = (float)4329.1003;
!   ans_dd_buf.y = (char)2;
    ans_dd_buf.b = (float)8474.73;
    ans_dd_buf.c = (int)48;

--- 31,52 ----
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    if (rank == 0) {
!     buf.x = (int)1;
      buf.a = (float)4329.1003;
!     buf.y = (int)2;
      buf.b = (float)8474.73;
      buf.c = (int)48;
    }
    else {
!     buf.x = (int)0;
      buf.a = (float)0;
!     buf.y = (int)0;
      buf.b = (float)0;
      buf.c = (int)0;
    }
!   ans_dd_buf.x = (int)1;
    ans_dd_buf.a = (float)4329.1003;
!   ans_dd_buf.y = (int)2;
    ans_dd_buf.b = (float)8474.73;
    ans_dd_buf.c = (int)48;

***************
*** 54,60 ****
    b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

    /* datatype per a block */
!   dt[0]  = dt[2] = MPI_BYTE;
    dt[1]  = dt[3] = MPI_FLOAT;
    dt[4]  = MPI_INT;

--- 54,60 ----
    b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

    /* datatype per a block */
!   dt[0]  = dt[2] = MPI_INT;
    dt[1]  = dt[3] = MPI_FLOAT;
    dt[4]  = MPI_INT;


-bash-3.2$ cat t_mpi_question-7-ng.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

#define ITEMNUM 5

/*
 * Test payload for the failing ("ng") case.  One-byte members are
 * interleaved with four-byte members; per the memory map earlier in this
 * report, the reporter's platform inserts 3 padding bytes after x and after
 * y, so the struct occupies 20 bytes while carrying 14 bytes of data.
 */
struct dd {
  char  x;   /* block 0, described as MPI_BYTE  */
  float a;   /* block 1, described as MPI_FLOAT */
  char  y;   /* block 2, described as MPI_BYTE  */
  float b;   /* block 3, described as MPI_FLOAT */
  int   c;   /* block 4, described as MPI_INT   */
};

/*
 * Reproducer (failing case): send one `struct dd` described by a derived
 * datatype from rank 0 to rank 1, with a SLPTIME-second sleep between the
 * MPI_Isend on rank 0 and the MPI_Irecv on rank 1 so that a checkpoint can
 * be taken while the message is in flight.
 *
 * Must be run with exactly 2 processes: rank 0 sends a single message to
 * rank 1, and every non-zero rank posts a receive from rank 0.
 *
 * Returns 0 on success.  Calls MPI_Abort with -1 if the datatype cannot be
 * built (or the process count is wrong) and with 1 if any received member
 * differs from the expected value.
 */
int main(int ac,char **av)
{
  int rank,size,cc;
  MPI_Request req;
  MPI_Status sts;
  struct dd buf,ans_dd_buf;
  int b_l[ITEMNUM];
  MPI_Aint dp[ITEMNUM],st,cr;
  MPI_Datatype dt[ITEMNUM],newdt;

  MPI_Init(&ac,&av);

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  /* With 1 process the send has no peer; with more than 2, the extra
     ranks would wait forever on a message that rank 0 never sends. */
  if (size != 2) {
    if (rank == 0) {
      fprintf(stderr,"this test must be run with exactly 2 processes (got %d)\n",size);
      fflush(stderr);
    }
    MPI_Abort(MPI_COMM_WORLD,-1);
  }

  /* Rank 0 holds the payload; every other rank starts from zeros so that a
     short or misplaced receive is detectable. */
  if (rank == 0) {
    buf.x = (char)1;
    buf.a = (float)4329.1003;
    buf.y = (char)2;
    buf.b = (float)8474.73;
    buf.c = (int)48;
  }
  else {
    buf.x = (char)0;
    buf.a = (float)0;
    buf.y = (char)0;
    buf.b = (float)0;
    buf.c = (int)0;
  }
  /* Expected contents of buf on every rank after the transfer. */
  ans_dd_buf.x = (char)1;
  ans_dd_buf.a = (float)4329.1003;
  ans_dd_buf.y = (char)2;
  ans_dd_buf.b = (float)8474.73;
  ans_dd_buf.c = (int)48;

  /* item number per a block */
  b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

  /* datatype per a block */
  dt[0]  = dt[2] = MPI_BYTE;
  dt[1]  = dt[3] = MPI_FLOAT;
  dt[4]  = MPI_INT;

  /* disp per a block: byte offset of each member from buf.x.
     NOTE: MPI_Address and MPI_Type_struct are the MPI-1 names (deprecated
     since MPI-2); they are kept so the reproducer matches the report. */
  dp[0] = 0;
  MPI_Address(&buf.x,&st);

  MPI_Address(&buf.a,&cr);
  dp[1] = (cr - st);

  MPI_Address(&buf.y,&cr);
  dp[2] = (cr - st);

  MPI_Address(&buf.b,&cr);
  dp[3] = (cr - st);

  MPI_Address(&buf.c,&cr);
  dp[4] = (cr - st);

  cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc = MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

  MPI_Barrier(MPI_COMM_WORLD);

  /* Print each member offset computed from raw addresses next to the
     displacement handed to MPI.  MPI_Aint may be wider than int, so the
     displacements are cast to long and printed with %ld. */
  printf("   rank=%d pass-1 x->x =%d[%ld] x->a=%d[%ld] x->y=%d[%ld] x->b=%d[%ld] x->c=%d[%ld]\n"
    ,rank
    ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),(long)dp[0]
    ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),(long)dp[1]
    ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),(long)dp[2]
    ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),(long)dp[3]
    ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),(long)dp[4]
  );
  fflush(stdout);

  if (rank == 0) {
    /* Post the send before sleeping so the message is outstanding while
       the checkpoint is taken. */
    MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);  /* take the checkpoint during this sleep */
    printf(" rank=%d sleep end   \n",rank); fflush(stdout);

    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else {
    /* The receive is posted only after the sleep, i.e. after the
       checkpoint window. */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);  /* take the checkpoint during this sleep */
    printf(" rank=%d sleep end   \n",rank); fflush(stdout);

    MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }

  printf("   rank=%d pass-2 %d %f %d %f %d \n"
    ,rank,buf.x,buf.a,buf.y,buf.b,buf.c); fflush(stdout);
  /* Exact comparison is intended: the values are transferred bit-for-bit. */
  if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

  cc =  MPI_Finalize();
  if (rank ==0) {
    printf("   rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

-bash-3.2$ cat t_mpi_question-7-ok.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

#define ITEMNUM 5

/*
 * Test payload for the passing ("ok") case.  Every member is an int or a
 * float; the debug output in this report shows 20 bytes and a contiguous
 * datatype for this layout on the reporter's platform.
 */
struct dd {
  int   x;   /* block 0, described as MPI_INT   */
  float a;   /* block 1, described as MPI_FLOAT */
  int   y;   /* block 2, described as MPI_INT   */
  float b;   /* block 3, described as MPI_FLOAT */
  int   c;   /* block 4, described as MPI_INT   */
};

/*
 * Reproducer (passing case): send one `struct dd` described by a derived
 * datatype from rank 0 to rank 1, with a SLPTIME-second sleep between the
 * MPI_Isend on rank 0 and the MPI_Irecv on rank 1 so that a checkpoint can
 * be taken while the message is in flight.
 *
 * Must be run with exactly 2 processes: rank 0 sends a single message to
 * rank 1, and every non-zero rank posts a receive from rank 0.
 *
 * Returns 0 on success.  Calls MPI_Abort with -1 if the datatype cannot be
 * built (or the process count is wrong) and with 1 if any received member
 * differs from the expected value.
 */
int main(int ac,char **av)
{
  int rank,size,cc;
  MPI_Request req;
  MPI_Status sts;
  struct dd buf,ans_dd_buf;
  int b_l[ITEMNUM];
  MPI_Aint dp[ITEMNUM],st,cr;
  MPI_Datatype dt[ITEMNUM],newdt;

  MPI_Init(&ac,&av);

  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  MPI_Comm_size(MPI_COMM_WORLD,&size);

  /* With 1 process the send has no peer; with more than 2, the extra
     ranks would wait forever on a message that rank 0 never sends. */
  if (size != 2) {
    if (rank == 0) {
      fprintf(stderr,"this test must be run with exactly 2 processes (got %d)\n",size);
      fflush(stderr);
    }
    MPI_Abort(MPI_COMM_WORLD,-1);
  }

  /* Rank 0 holds the payload; every other rank starts from zeros so that a
     short or misplaced receive is detectable. */
  if (rank == 0) {
    buf.x = (int)1;
    buf.a = (float)4329.1003;
    buf.y = (int)2;
    buf.b = (float)8474.73;
    buf.c = (int)48;
  }
  else {
    buf.x = (int)0;
    buf.a = (float)0;
    buf.y = (int)0;
    buf.b = (float)0;
    buf.c = (int)0;
  }
  /* Expected contents of buf on every rank after the transfer. */
  ans_dd_buf.x = (int)1;
  ans_dd_buf.a = (float)4329.1003;
  ans_dd_buf.y = (int)2;
  ans_dd_buf.b = (float)8474.73;
  ans_dd_buf.c = (int)48;

  /* item number per a block */
  b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

  /* datatype per a block */
  dt[0]  = dt[2] = MPI_INT;
  dt[1]  = dt[3] = MPI_FLOAT;
  dt[4]  = MPI_INT;

  /* disp per a block: byte offset of each member from buf.x.
     NOTE: MPI_Address and MPI_Type_struct are the MPI-1 names (deprecated
     since MPI-2); they are kept so the reproducer matches the report. */
  dp[0] = 0;
  MPI_Address(&buf.x,&st);

  MPI_Address(&buf.a,&cr);
  dp[1] = (cr - st);

  MPI_Address(&buf.y,&cr);
  dp[2] = (cr - st);

  MPI_Address(&buf.b,&cr);
  dp[3] = (cr - st);

  MPI_Address(&buf.c,&cr);
  dp[4] = (cr - st);

  cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
  cc = MPI_Type_commit(&newdt);
  if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

  MPI_Barrier(MPI_COMM_WORLD);

  /* Print each member offset computed from raw addresses next to the
     displacement handed to MPI.  MPI_Aint may be wider than int, so the
     displacements are cast to long and printed with %ld. */
  printf("   rank=%d pass-1 x->x =%d[%ld] x->a=%d[%ld] x->y=%d[%ld] x->b=%d[%ld] x->c=%d[%ld]\n"
    ,rank
    ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),(long)dp[0]
    ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),(long)dp[1]
    ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),(long)dp[2]
    ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),(long)dp[3]
    ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),(long)dp[4]
  );
  fflush(stdout);

  if (rank == 0) {
    /* Post the send before sleeping so the message is outstanding while
       the checkpoint is taken. */
    MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);

    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);  /* take the checkpoint during this sleep */
    printf(" rank=%d sleep end   \n",rank); fflush(stdout);

    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }
  else {
    /* The receive is posted only after the sleep, i.e. after the
       checkpoint window. */
    printf(" rank=%d sleep start \n",rank); fflush(stdout);
    sleep(SLPTIME);  /* take the checkpoint during this sleep */
    printf(" rank=%d sleep end   \n",rank); fflush(stdout);

    MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
    MPI_Wait(&req,&sts);
    MPI_Type_free(&newdt);
  }

  printf("   rank=%d pass-2 %d %f %d %f %d \n"
    ,rank,buf.x,buf.a,buf.y,buf.b,buf.c); fflush(stdout);
  /* Exact comparison is intended: the values are transferred bit-for-bit. */
  if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
  if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

  cc =  MPI_Finalize();
  if (rank ==0) {
    printf("   rank=%d program end \n",rank); fflush(stdout);
  }
  return(0);
}

Reply via email to