Thank you for your bug reports on the crcp/bkmrk component. It is nice to have someone else going through the logic and checking it. Please keep reporting issues as you find them; the reproducer programs are useful. If you develop any patches that fix these bugs, let me know and I will be happy to look over them and include them in the Open MPI trunk, if appropriate.

Having briefly read through your previous messages, I suspect that many of your conclusions about the bugs are correct. Unfortunately, I will not be able to take a good look at these bug reports for another few weeks (probably early April). I wanted to reply so that you knew that the messages were not being ignored.

Thanks again,
Josh

On Mar 18, 2010, at 5:23 AM, Takayuki Seki wrote:


7th question is as follows:

(7) The result of a communication that uses a derived datatype is incorrect after taking a checkpoint.

Framework         : crcp
Component         : bkmrk
The source file   : ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
The function name : traffic_message_append

Framework         : datatype
The source file   : ompi/datatype/datatype.h
The function name : ompi_ddt_type_size

Here's the code that causes the problem:

struct dd {
 char  x;
 float a;
 char  y;
 float b;
 int   c;
};
struct dd buf,ans_dd_buf;

 if (rank == 0) {
   buf.x = (char)1;
   buf.a = (float)4329.1003;
   buf.y = (char)2;
   buf.b = (float)8474.73;
   buf.c = (int)48;
 }
 else {
   buf.x = (char)0;
   buf.a = (float)0;
   buf.y = (char)0;
   buf.b = (float)0;
   buf.c = (int)0;
 }
 ans_dd_buf.x = (char)1;
 ans_dd_buf.a = (float)4329.1003;
 ans_dd_buf.y = (char)2;
 ans_dd_buf.b = (float)8474.73;
 ans_dd_buf.c = (int)48;

 /* item number per a block */
 b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;
 /* datatype per a block */
 dt[0]  = dt[2] = MPI_BYTE;
 dt[1]  = dt[3] = MPI_FLOAT;
 dt[4]  = MPI_INT;
 /* disp per a block */
 dp[0] = 0;
 MPI_Address(&buf.x,&st);
 MPI_Address(&buf.a,&cr);
 dp[1] = (cr - st);
 MPI_Address(&buf.y,&cr);
 dp[2] = (cr - st);
 MPI_Address(&buf.b,&cr);
 dp[3] = (cr - st);
 MPI_Address(&buf.c,&cr);
 dp[4] = (cr - st);
 cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
 if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
 cc = MPI_Type_commit(&newdt);
 if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
 MPI_Barrier(MPI_COMM_WORLD);
printf(" rank=%d pass-1 x->x =%d[%d] x->a=%d[%d] x->y=%d[%d] x->b= %d[%d] x->c=%d[%d]\n"
   ,rank
   ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),dp[0]
   ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),dp[1]
   ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),dp[2]
   ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),dp[3]
   ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),dp[4]
 );
 fflush(stdout);

 if (rank == 0) {
   MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);
   printf(" rank=%d sleep start \n",rank); fflush(stdout);
   sleep(SLPTIME);  /** take checkpoint at this point **/
   printf(" rank=%d sleep end   \n",rank); fflush(stdout);
   MPI_Wait(&req,&sts);
   MPI_Type_free(&newdt);
 }
 else { /* rank 1 */
   printf(" rank=%d sleep start \n",rank); fflush(stdout);
   sleep(SLPTIME);  /** take checkpoint at this point **/
   printf(" rank=%d sleep end   \n",rank); fflush(stdout);
   MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
   MPI_Wait(&req,&sts);
   MPI_Type_free(&newdt);
 }
 if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); } /* The error occurs at this point */
 if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
 if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
 if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

* Take a checkpoint while rank 0 and rank 1 are inside the sleep function.

* Construct derived datatype from the structure dd.

* I think that image of memory mapping of the derived datatype is as follows:

             1111111111
   01234567890123456789
   --------------------
   X###AAAAY###BBBBCCCC
   --------------------

### means space.

* The ddt_size field (documented as /** Quick reference to the size of the datatype */) of the
 ompi_crcp_bkmrk_pml_traffic_message_ref_t structure is obtained by calling the
 ompi_ddt_type_size function in the traffic_message_append function.

   if( NULL != datatype ) {
       ompi_ddt_type_size(datatype,
                          &ddt_size);

* I think that the returned value of ddt_size is wrong.
The obtained value is 14. (Does this mean that the total size in memory is 14 bytes?)

 struct dd {
   char  x;    -> character is 1 byte.
   float a;    -> float     is 4 bytes.
   char  y;    -> character is 1 byte.
   float b;    -> float     is 4 bytes.
   int   c;    -> integer   is 4 bytes.
 };

* But the returned value of ddt_size should be 20bytes, considering the memory mapping.

* Rank 1 receives a message of only 14 bytes in bkmrk.
 As a result, the wrong values are obtained.

* t_mpi_question-7-ng.c : the error occurs.
 Here's my debugging output.

ft_event_post_drain_message:Irecv drain_msg_ref=c89200 rank=0 tag=1000 cnt=1 ddt=14 to=c929b0 [datatype->size=1]
 wait_quiesce_drained: x=1 a=142658605493679655240073216.000000 y=4 b=0.000000 c=32
   /* 14bytes data is received, it is incorrect. values are wrong. */
 drain_message_check_recv:datatype->size=1 14 count=1 1
 ompi_ddt_copy_content_same_ddt:Start size=14 flag=102/4 count=1
   /* DT_FLAG_CONTIGUOUS is false. */


* t_mpi_question-7-ok.c : the error does not occur.
 Here's my debugging output.

ft_event_post_drain_message:Irecv drain_msg_ref=a51280 rank=0 tag=1000 cnt=1 ddt=20 to=a5b6b0 [datatype->size=1]
 wait_quiesce_drained: x=1 a=4329.100098 y=2 b=8474.730469 c=48
   /* 20bytes data is received correctly. */
 drain_message_check_recv:datatype->size=1 20 count=1 1
 ompi_ddt_copy_content_same_ddt:Start size=20 flag=186/4 count=1
   /* DT_FLAG_CONTIGUOUS is true. */

* difference list

-bash-3.2$ diff -c t_mpi_question-7-ng.c t_mpi_question-7-ok.c
*** t_mpi_question-7-ng.c       Fri Feb 26 13:07:05 2010
--- t_mpi_question-7-ok.c       Fri Feb 26 13:20:25 2010
***************
*** 8,16 ****
 #define ITEMNUM 5

 struct dd {
!   char  x;
   float a;
!   char  y;
   float b;
   int   c;
 };
--- 8,16 ----
 #define ITEMNUM 5

 struct dd {
!   int   x;
   float a;
!   int   y;
   float b;
   int   c;
 };
***************
*** 31,52 ****
   MPI_Comm_size(MPI_COMM_WORLD,&size);

   if (rank == 0) {
!     buf.x = (char)1;
     buf.a = (float)4329.1003;
!     buf.y = (char)2;
     buf.b = (float)8474.73;
     buf.c = (int)48;
   }
   else {
!     buf.x = (char)0;
     buf.a = (float)0;
!     buf.y = (char)0;
     buf.b = (float)0;
     buf.c = (int)0;
   }
!   ans_dd_buf.x = (char)1;
   ans_dd_buf.a = (float)4329.1003;
!   ans_dd_buf.y = (char)2;
   ans_dd_buf.b = (float)8474.73;
   ans_dd_buf.c = (int)48;

--- 31,52 ----
   MPI_Comm_size(MPI_COMM_WORLD,&size);

   if (rank == 0) {
!     buf.x = (int)1;
     buf.a = (float)4329.1003;
!     buf.y = (int)2;
     buf.b = (float)8474.73;
     buf.c = (int)48;
   }
   else {
!     buf.x = (int)0;
     buf.a = (float)0;
!     buf.y = (int)0;
     buf.b = (float)0;
     buf.c = (int)0;
   }
!   ans_dd_buf.x = (int)1;
   ans_dd_buf.a = (float)4329.1003;
!   ans_dd_buf.y = (int)2;
   ans_dd_buf.b = (float)8474.73;
   ans_dd_buf.c = (int)48;

***************
*** 54,60 ****
   b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

   /* datatype per a block */
!   dt[0]  = dt[2] = MPI_BYTE;
   dt[1]  = dt[3] = MPI_FLOAT;
   dt[4]  = MPI_INT;

--- 54,60 ----
   b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

   /* datatype per a block */
!   dt[0]  = dt[2] = MPI_INT;
   dt[1]  = dt[3] = MPI_FLOAT;
   dt[4]  = MPI_INT;


-bash-3.2$ cat t_mpi_question-7-ng.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

#define ITEMNUM 5

/*
 * Record exchanged between ranks in this test program.
 *
 * Each char member is immediately followed by a float member, so a compiler
 * will normally insert alignment padding after x and after y. The exact
 * layout is implementation-defined (NOTE(review): confirm with offsetof on
 * the target platform).
 */
struct dd {
 char  x;   /* first single-byte field */
 float a;
 char  y;   /* second single-byte field */
 float b;
 int   c;
};

/*
 * Test driver: transfers one `struct dd` described by a derived datatype,
 * with a sleep() window between posting and completing the transfer.
 *
 * Flow:
 *   - rank 0 fills buf with the reference values, all other ranks zero it;
 *   - a struct datatype is built from the member displacements of buf;
 *   - rank 0 posts MPI_Isend to rank 1, sleeps SLPTIME seconds, then waits;
 *   - every other rank sleeps SLPTIME seconds, then posts MPI_Irecv from
 *     rank 0 and waits;
 *   - each rank compares buf with the reference values member by member and
 *     calls MPI_Abort(…, 1) on the first mismatch.
 *
 * Returns 0 after MPI_Finalize.
 *
 * NOTE(review): MPI_Address and MPI_Type_struct are kept as in the original
 * program so the test keeps exercising the same calls.
 */
int main(int ac,char **av)
{
    int rank,size,cc;
    MPI_Request req;
    MPI_Status sts;
    struct dd buf,ans_dd_buf;
    int b_l[ITEMNUM];
    MPI_Aint dp[ITEMNUM],st,cr;
    MPI_Datatype dt[ITEMNUM],newdt;

    MPI_Init(&ac,&av);

    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    /* Sender holds the real data; receivers start from zero so that a
     * partial transfer shows up in the comparison below. */
    if (rank == 0) {
        buf.x = (char)1;
        buf.a = (float)4329.1003;
        buf.y = (char)2;
        buf.b = (float)8474.73;
        buf.c = (int)48;
    }
    else {
        buf.x = (char)0;
        buf.a = (float)0;
        buf.y = (char)0;
        buf.b = (float)0;
        buf.c = (int)0;
    }

    /* Reference values every rank must end up with. */
    ans_dd_buf.x = (char)1;
    ans_dd_buf.a = (float)4329.1003;
    ans_dd_buf.y = (char)2;
    ans_dd_buf.b = (float)8474.73;
    ans_dd_buf.c = (int)48;

    /* number of items per block */
    b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

    /* datatype of each block */
    dt[0]  = dt[2] = MPI_BYTE;
    dt[1]  = dt[3] = MPI_FLOAT;
    dt[4]  = MPI_INT;

    /* displacement of each block, relative to the first member */
    dp[0] = 0;
    MPI_Address(&buf.x,&st);

    MPI_Address(&buf.a,&cr);
    dp[1] = (cr - st);

    MPI_Address(&buf.y,&cr);
    dp[2] = (cr - st);

    MPI_Address(&buf.b,&cr);
    dp[3] = (cr - st);

    MPI_Address(&buf.c,&cr);
    dp[4] = (cr - st);

    cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc = MPI_Type_commit(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Print the offsets computed from raw addresses next to the MPI
     * displacements. dp[] is MPI_Aint, which need not be int-sized, so it
     * is converted to long and printed with %ld instead of %d. */
    printf(" rank=%d pass-1 x->x =%d[%ld] x->a=%d[%ld] x->y=%d[%ld] x->b= %d[%ld] x->c=%d[%ld]\n"
        ,rank
        ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),(long)dp[0]
        ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),(long)dp[1]
        ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),(long)dp[2]
        ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),(long)dp[3]
        ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),(long)dp[4]
    );
    fflush(stdout);

    if (rank == 0) {
        /* Send is posted BEFORE the sleep, so it is outstanding during it. */
        MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);

        printf(" rank=%d sleep start \n",rank); fflush(stdout);
        sleep(SLPTIME);
        printf(" rank=%d sleep end   \n",rank); fflush(stdout);

        MPI_Wait(&req,&sts);
        MPI_Type_free(&newdt);
    }
    else {
        /* Receive is posted only AFTER the sleep. */
        printf(" rank=%d sleep start \n",rank); fflush(stdout);
        sleep(SLPTIME);
        printf(" rank=%d sleep end   \n",rank); fflush(stdout);

        MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
        MPI_Wait(&req,&sts);
        MPI_Type_free(&newdt);
    }

    printf("   rank=%d pass-2 %d %f %d %f %d \n"
        ,rank,buf.x,buf.a,buf.y,buf.b,buf.c); fflush(stdout);

    /* Exact comparison is intended: the values are expected to arrive as
     * an unmodified copy of what rank 0 stored. */
    if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

    cc =  MPI_Finalize();
    if (rank ==0) {
        printf("   rank=%d program end \n",rank); fflush(stdout);
    }
    return(0);
}

-bash-3.2$ cat t_mpi_question-7-ok.c
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "mpi.h"

#define SLPTIME 60

#define ITEMNUM 5

/*
 * Record exchanged between ranks in this test program.
 *
 * Every member is an int or a float. NOTE(review): on platforms where both
 * are the same size this layout would have no padding between members —
 * confirm with offsetof on the target platform.
 */
struct dd {
 int   x;
 float a;
 int   y;
 float b;
 int   c;
};

/*
 * Test driver: transfers one `struct dd` described by a derived datatype,
 * with a sleep() window between posting and completing the transfer.
 *
 * Flow:
 *   - rank 0 fills buf with the reference values, all other ranks zero it;
 *   - a struct datatype is built from the member displacements of buf;
 *   - rank 0 posts MPI_Isend to rank 1, sleeps SLPTIME seconds, then waits;
 *   - every other rank sleeps SLPTIME seconds, then posts MPI_Irecv from
 *     rank 0 and waits;
 *   - each rank compares buf with the reference values member by member and
 *     calls MPI_Abort(…, 1) on the first mismatch.
 *
 * Returns 0 after MPI_Finalize.
 *
 * NOTE(review): MPI_Address and MPI_Type_struct are kept as in the original
 * program so the test keeps exercising the same calls.
 */
int main(int ac,char **av)
{
    int rank,size,cc;
    MPI_Request req;
    MPI_Status sts;
    struct dd buf,ans_dd_buf;
    int b_l[ITEMNUM];
    MPI_Aint dp[ITEMNUM],st,cr;
    MPI_Datatype dt[ITEMNUM],newdt;

    MPI_Init(&ac,&av);

    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    /* Sender holds the real data; receivers start from zero so that a
     * partial transfer shows up in the comparison below. */
    if (rank == 0) {
        buf.x = (int)1;
        buf.a = (float)4329.1003;
        buf.y = (int)2;
        buf.b = (float)8474.73;
        buf.c = (int)48;
    }
    else {
        buf.x = (int)0;
        buf.a = (float)0;
        buf.y = (int)0;
        buf.b = (float)0;
        buf.c = (int)0;
    }

    /* Reference values every rank must end up with. */
    ans_dd_buf.x = (int)1;
    ans_dd_buf.a = (float)4329.1003;
    ans_dd_buf.y = (int)2;
    ans_dd_buf.b = (float)8474.73;
    ans_dd_buf.c = (int)48;

    /* number of items per block */
    b_l[0] = b_l[1] = b_l[2] = b_l[3] = b_l[4] = 1;

    /* datatype of each block */
    dt[0]  = dt[2] = MPI_INT;
    dt[1]  = dt[3] = MPI_FLOAT;
    dt[4]  = MPI_INT;

    /* displacement of each block, relative to the first member */
    dp[0] = 0;
    MPI_Address(&buf.x,&st);

    MPI_Address(&buf.a,&cr);
    dp[1] = (cr - st);

    MPI_Address(&buf.y,&cr);
    dp[2] = (cr - st);

    MPI_Address(&buf.b,&cr);
    dp[3] = (cr - st);

    MPI_Address(&buf.c,&cr);
    dp[4] = (cr - st);

    cc = MPI_Type_struct(ITEMNUM,&b_l[0],&dp[0],&dt[0],&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }
    cc = MPI_Type_commit(&newdt);
    if (cc != MPI_SUCCESS) { MPI_Abort(MPI_COMM_WORLD,-1); }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Print the offsets computed from raw addresses next to the MPI
     * displacements. dp[] is MPI_Aint, which need not be int-sized, so it
     * is converted to long and printed with %ld instead of %d. */
    printf(" rank=%d pass-1 x->x =%d[%ld] x->a=%d[%ld] x->y=%d[%ld] x->b= %d[%ld] x->c=%d[%ld]\n"
        ,rank
        ,( (int)((unsigned long)(&buf.x) - (unsigned long)(&buf.x)) ),(long)dp[0]
        ,( (int)((unsigned long)(&buf.a) - (unsigned long)(&buf.x)) ),(long)dp[1]
        ,( (int)((unsigned long)(&buf.y) - (unsigned long)(&buf.x)) ),(long)dp[2]
        ,( (int)((unsigned long)(&buf.b) - (unsigned long)(&buf.x)) ),(long)dp[3]
        ,( (int)((unsigned long)(&buf.c) - (unsigned long)(&buf.x)) ),(long)dp[4]
    );
    fflush(stdout);

    if (rank == 0) {
        /* Send is posted BEFORE the sleep, so it is outstanding during it. */
        MPI_Isend(&buf,1,newdt,1,1000,MPI_COMM_WORLD,&req);

        printf(" rank=%d sleep start \n",rank); fflush(stdout);
        sleep(SLPTIME);
        printf(" rank=%d sleep end   \n",rank); fflush(stdout);

        MPI_Wait(&req,&sts);
        MPI_Type_free(&newdt);
    }
    else {
        /* Receive is posted only AFTER the sleep. */
        printf(" rank=%d sleep start \n",rank); fflush(stdout);
        sleep(SLPTIME);
        printf(" rank=%d sleep end   \n",rank); fflush(stdout);

        MPI_Irecv(&buf,1,newdt,0,1000,MPI_COMM_WORLD,&req);
        MPI_Wait(&req,&sts);
        MPI_Type_free(&newdt);
    }

    printf("   rank=%d pass-2 %d %f %d %f %d \n"
        ,rank,buf.x,buf.a,buf.y,buf.b,buf.c); fflush(stdout);

    /* Exact comparison is intended: the values are expected to arrive as
     * an unmodified copy of what rank 0 stored. */
    if (ans_dd_buf.x != buf.x) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.a != buf.a) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.y != buf.y) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.b != buf.b) { MPI_Abort(MPI_COMM_WORLD,1); }
    if (ans_dd_buf.c != buf.c) { MPI_Abort(MPI_COMM_WORLD,1); }

    cc =  MPI_Finalize();
    if (rank ==0) {
        printf("   rank=%d program end \n",rank); fflush(stdout);
    }
    return(0);
}

_______________________________________________
devel mailing list
de...@open-mpi.org
http://www.open-mpi.org/mailman/listinfo.cgi/devel

Reply via email to