I'm trying out the checkpoint/restart feature of Open MPI, using Open MPI 1.4.1 and BLCR 0.8.2, but it doesn't work well. I have been looking into the source code and have some questions about checkpoint/restart. Could anyone answer them?
I will post them one at a time. My first question is as follows: (1) Clearing send_init_list and recv_init_list. Framework: crcp Component: bkmrk Source file: ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c Function: ft_event_finalize_exchange I found the comment /* Clear send_init_list */ in the ft_event_finalize_exchange function. However, the code that follows it appears to clear send_list rather than send_init_list. The source code is as follows: /* Clear send_init_list */ for(rm_item = opal_list_get_last(&peer_ref->send_list); rm_item != opal_list_get_begin(&peer_ref->send_list); Is this correct? send_list appears to have already been cleared by this point. The same applies to clearing recv_init_list. The comment is /* Clear recv_init_list */. However, the code that follows it appears to clear recv_list rather than recv_init_list. recv_list appears to have already been cleared by this point. The source code is as follows: /* Clear recv_init_list */ for(rm_item = opal_list_get_last(&peer_ref->recv_list); rm_item != opal_list_get_begin(&peer_ref->recv_list); Here's the code that causes the problem: #define BLOCKNUM 1 #define SLPTIM 60 if (rank == 0) { MPI_Send_init(&buf[0],BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&req1); MPI_Start(&req1); MPI_Wait(&req1,&sts1); MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d sleep1 start \n",rank); fflush(stdout); sleep(SLPTIM); /** take checkpoint(1st time) **/ printf(" rank=%d sleep1 end \n",rank); fflush(stdout); MPI_Start(&req1); printf(" rank=%d sleep2 start \n",rank); fflush(stdout); sleep(SLPTIM); /** take checkpoint(2nd time), and deadlock occurs. 
**/ printf(" rank=%d sleep2 end \n",rank); fflush(stdout); MPI_Wait(&req1,&sts1); MPI_Request_free(&req1); } else { /* rank 1 */ MPI_Recv_init(&buf[0],BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&req1); MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d sleep1 start \n",rank); fflush(stdout); sleep(SLPTIM); /** take checkpoint(1st time) **/ printf(" rank=%d sleep1 end \n",rank); fflush(stdout); MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d sleep2 start \n",rank); fflush(stdout); sleep(SLPTIM); /** take checkpoint(2nd time), and deadlock occurs. **/ printf(" rank=%d sleep2 end \n",rank); fflush(stdout); MPI_Start(&req1); MPI_Wait(&req1,&sts1); MPI_Request_free(&req1); } * Take a checkpoint twice. * Take each checkpoint while process 0 is in the MPI_Send function and process 1 is in the sleep function. * Deadlock occurs when the second checkpoint is taken. * Here's my debugging output. rank=1 pass-1 100 rank=1 sleep1 start /* 1st checkpoint */ rank=0 sleep1 start /* 1st checkpoint */ rank=1 sleep1 end rank=0 sleep1 end DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=1 /* MPI_Barrier */ DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=2 DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref BEFORE-UPDATE matched=0 done=1 num_left_unresolved=2 DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref AFTER-UPDATE matched=1 done=1 num_left_unresolved=1 DEBUG: num_left_unresolved=1 goto cleapup DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100 count=1 ddt_size=4 rank=1 pass-2 200 rank=1 sleep2 start /* 2nd checkpoint */ rank=0 sleep2 start /* 2nd checkpoint */ rank=1 sleep2 end rank=0 sleep2 end DEBUG:do_recv_msg_detail_check_drain p_num_sent(from sender)=3 /* The sender sent the wrong value ("3"). I think the correct value may be "1". 
*/ DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref BEFORE-UPDATE matched=1 done=1 num_left_unresolved=3 DEBUG:do_recv_msg_detail_check_drain posted_precv_msg_ref AFTER-UPDATE matched=2 done=1 num_left_unresolved=2 /* The wrong values are set in the receiver's recv_init_list, because recv_init_list was not cleared when the first checkpoint was taken. */ DEBUG: num_left_unresolved=2 goto cleapup DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100 count=1 ddt_size=4 DEBUG:ft_event_post_drain_message calls wrapped_pml_module->pml_irecv tag=100 count=1 ddt_size=4 /* An incorrect receive is issued. */ -bash-3.2$ cat t_mpi_question-1.c #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include "mpi.h" #define BLOCKNUM 1 #define SLPTIM 60 int main(int ac,char **av) { int i,k,rank,size,cc; int *buf; MPI_Status sts1; MPI_Request req1; MPI_Init(&ac,&av); MPI_Comm_rank(MPI_COMM_WORLD,&rank); MPI_Comm_size(MPI_COMM_WORLD,&size); if (size != 2) { MPI_Abort(MPI_COMM_WORLD,-1); } buf = (int *)malloc(sizeof(int)*BLOCKNUM); if (buf == NULL) { MPI_Abort(MPI_COMM_WORLD,-1); } MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) { MPI_Send_init(&buf[0],BLOCKNUM,MPI_INT,1,100,MPI_COMM_WORLD,&req1); for (i=0;i<BLOCKNUM;i++) { buf[i] = (100+i); } MPI_Start(&req1); MPI_Wait(&req1,&sts1); for (i=0;i<BLOCKNUM;i++) { buf[i] = (200+i); } MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d sleep1 start \n",rank); fflush(stdout); sleep(SLPTIM); printf(" rank=%d sleep1 end \n",rank); fflush(stdout); for (i=0;i<BLOCKNUM;i++) { buf[i] = (300+i); } MPI_Start(&req1); printf(" rank=%d sleep2 start \n",rank); fflush(stdout); sleep(SLPTIM); printf(" rank=%d sleep2 end \n",rank); fflush(stdout); MPI_Wait(&req1,&sts1); MPI_Request_free(&req1); } else { MPI_Recv_init(&buf[0],BLOCKNUM,MPI_INT,0,100,MPI_COMM_WORLD,&req1); for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; } MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d pass-1 %d \n",rank,buf[0]); fflush(stdout); 
printf(" rank=%d sleep1 start \n",rank); fflush(stdout); sleep(SLPTIM); printf(" rank=%d sleep1 end \n",rank); fflush(stdout); for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; } MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d pass-2 %d \n",rank,buf[0]); fflush(stdout); printf(" rank=%d sleep2 start \n",rank); fflush(stdout); sleep(SLPTIM); printf(" rank=%d sleep2 end \n",rank); fflush(stdout); for (i=0;i<BLOCKNUM;i++) { buf[i] = 0; } MPI_Start(&req1); MPI_Wait(&req1,&sts1); printf(" rank=%d pass-3 %d \n",rank,buf[0]); fflush(stdout); MPI_Request_free(&req1); } MPI_Barrier(MPI_COMM_WORLD); free(buf); MPI_Finalize(); if (rank == 0) { printf(" rank=%d Program End \n",rank); fflush(stdout); } return(0); }