Hello, I am trying to use the checkpoint-restart functionality of OpenMPI. Most of the time, checkpointing of an MPI application behaves correctly, but in some situations the MPI application hangs indefinitely after the checkpoint is taken. ompi-checkpoint terminates without error and I do get the snapshot reference, but the application does not resume (it seems to be busy-waiting somewhere in the MPI code). I have not been able to reproduce this problem reliably enough to find the exact scenario which leads to this issue. However, these things are common to all the scenarios that lead to the error: 1. The OpenIB BTL is used (using the TCP BTL does not produce this error). 2. The communication is of the form: Isends/Irecvs followed by Waitall(...)
I saw a ticket (#2397) which shows some bug fixes targeted for v1.7; I went through them, but I am not sure whether my problem is caused by those bugs. Are there any known issues specifically when the OpenIB BTL is used? I am using Open MPI version 1.5.3. Please find the output of ompi-info and config.log as attachments. I am also providing these back-traces of a single process taken at different times, in case it helps. All the MPI application processes are in the running state. Please let me know if additional information is required. Back trace 1 #0 mca_btl_sm_component_progress () at btl_sm_component.c:560 #1 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207 #2 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92 #3 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263 #4 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70 #5 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330 #6 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418 #7 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295 #8 0x0000000000402271 in cg_unit () at cg.f:502 #9 0x000000000040181b in MAIN__ () at cg.f:56 #10 0x0000000000406e8e in main () Back trace 2 #0 0x00002aaaaf710a8a in get_sw_cqe (cq=<value optimized out>, n=19) at src/cq.c:119 #1 0x00002aaaaf710f01 in next_cqe_sw (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:125 #2 
mlx4_poll_one (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:205 #3 mlx4_poll_cq (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:352 #4 0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102 #5 btl_openib_component_progress () at btl_openib_component.c:3540 #6 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207 #7 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92 #8 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263 #9 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70 #10 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330 #11 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418 #12 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295 #13 0x0000000000402271 in cg_unit () at cg.f:502 #14 0x000000000040181b in MAIN__ () at cg.f:56 #15 0x0000000000406e8e in main () Back trace 3 #0 mlx4_poll_cq (ibcq=0x32a7cc60, ne=1, wc=<value optimized out>) at src/cq.c:360 #1 0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102 #2 btl_openib_component_progress () at btl_openib_component.c:3540 #3 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207 #4 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at 
../opal/threads/condition.h:92 #5 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263 #6 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70 #7 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330 #8 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418 #9 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295 #10 0x0000000000402271 in cg_unit () at cg.f:502 #11 0x000000000040181b in MAIN__ () at cg.f:56 #12 0x0000000000406e8e in main () Back trace 4 #0 0x00002b09eb1d30f8 in opal_progress () at runtime/opal_progress.c:207 #1 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92 #2 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263 #3 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70 #4 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330 #5 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418 #6 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, 
reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295 #7 0x0000000000402271 in cg_unit () at cg.f:502 #8 0x000000000040181b in MAIN__ () at cg.f:56 #9 0x0000000000406e8e in main () Back trace 5 #0 0x0000003a6aa08cd6 in pthread_mutex_lock () from /lib64/libpthread.so.0 #1 0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102 #2 btl_openib_component_progress () at btl_openib_component.c:3540 #3 0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207 #4 0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92 #5 ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263 #6 0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70 #7 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330 #8 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418 #9 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295 #10 0x0000000000402271 in cg_unit () at cg.f:502 #11 0x000000000040181b in MAIN__ () at cg.f:56 #12 0x0000000000406e8e in main () Regards, Kishor
config.log.gz
Description: GNU Zip compressed data
ompi-info.gz
Description: GNU Zip compressed data