Hello,

I am trying to use the checkpoint/restart functionality of Open MPI. Most of
the time, checkpointing an MPI application behaves correctly, but in some
situations the application hangs indefinitely after the checkpoint is taken.
ompi-checkpoint terminates without error and I do get the snapshot reference,
but the application does not resume (it seems to be busy-waiting somewhere in
the MPI code). I have not been able to reproduce the problem on demand, so I
cannot pin down the exact scenario that triggers it.
However, two things are common to all the scenarios that lead to the error:
1. The OpenIB BTL is used (the TCP BTL does not produce this error).
2. The communication is of the form Isends/Irecvs followed by Waitall(...); a minimal sketch of this pattern follows below.
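
To make the pattern concrete, here is a small, self-contained C sketch of the
kind of exchange that hangs. This is only an illustration, not my actual
Fortran code from cg.f; the buffer size and the choice of peer rank are made
up. Note that a send/receive pair like this matches the Waitall with count=2
seen in the backtraces below.

#include <mpi.h>

#define N 1024

int main(int argc, char **argv)
{
    double sendbuf[N], recvbuf[N];
    MPI_Request reqs[2];
    int rank, size, i;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Fill the send buffer with something deterministic. */
    for (i = 0; i < N; i++)
        sendbuf[i] = (double)rank;

    int peer = rank ^ 1;  /* hypothetical partner; cg.f picks its peers differently */
    if (peer < size) {
        MPI_Irecv(recvbuf, N, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[0]);
        MPI_Isend(sendbuf, N, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[1]);
        /* After a checkpoint, the process sometimes never returns from this
           call; the backtraces below are all taken while it is spinning in
           opal_progress() underneath ompi_request_default_wait_all(). */
        MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
    }

    MPI_Finalize();
    return 0;
}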

I saw ticket #2397, which lists some bug fixes targeted for v1.7. I went
through them, but I am not sure whether my problem is caused by those bugs.
Are there any known issues specific to checkpointing when the OpenIB BTL is
used?

I am using Open MPI version 1.5.3.
Please find the output of ompi_info and my config.log attached.


In case it helps, I am providing backtraces of a single process taken at
different times. All of the MPI application processes are in the running
state. Please let me know if additional information is required.

Backtrace 1
#0  mca_btl_sm_component_progress () at btl_sm_component.c:560
#1  0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#2  0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#3  ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#4  0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#5  0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#6  0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#7  0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#8  0x0000000000402271 in cg_unit () at cg.f:502
#9  0x000000000040181b in MAIN__ () at cg.f:56
#10 0x0000000000406e8e in main ()

Backtrace 2
#0  0x00002aaaaf710a8a in get_sw_cqe (cq=<value optimized out>, n=19) at src/cq.c:119
#1  0x00002aaaaf710f01 in next_cqe_sw (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:125
#2  mlx4_poll_one (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:205
#3  mlx4_poll_cq (ibcq=0x32a7cde0, ne=1, wc=<value optimized out>) at src/cq.c:352
#4  0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102
#5  btl_openib_component_progress () at btl_openib_component.c:3540
#6  0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#7  0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#8  ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#9  0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#10 0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#11 0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#12 0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#13 0x0000000000402271 in cg_unit () at cg.f:502
#14 0x000000000040181b in MAIN__ () at cg.f:56
#15 0x0000000000406e8e in main ()

Backtrace 3
#0  mlx4_poll_cq (ibcq=0x32a7cc60, ne=1, wc=<value optimized out>) at src/cq.c:360
#1  0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102
#2  btl_openib_component_progress () at btl_openib_component.c:3540
#3  0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#4  0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#5  ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#6  0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#7  0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#8  0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#9  0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#10 0x0000000000402271 in cg_unit () at cg.f:502
#11 0x000000000040181b in MAIN__ () at cg.f:56
#12 0x0000000000406e8e in main ()

Backtrace 4
#0  0x00002b09eb1d30f8 in opal_progress () at runtime/opal_progress.c:207
#1  0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#2  ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#3  0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#4  0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#5  0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#6  0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#7  0x0000000000402271 in cg_unit () at cg.f:502
#8  0x000000000040181b in MAIN__ () at cg.f:56
#9  0x0000000000406e8e in main ()

Backtrace 5
#0  0x0000003a6aa08cd6 in pthread_mutex_lock () from /lib64/libpthread.so.0
#1  0x00002aaaad9d7b53 in opal_pointer_array_get_item () at ../../../../opal/threads/mutex_unix.h:102
#2  btl_openib_component_progress () at btl_openib_component.c:3540
#3  0x00002b09eb1d3105 in opal_progress () at runtime/opal_progress.c:207
#4  0x00002b09eb0f9b3f in opal_condition_wait (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at ../opal/threads/condition.h:92
#5  ompi_request_default_wait_all (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at request/req_wait.c:263
#6  0x00002b09eb126db6 in PMPI_Waitall (count=2, requests=0x326bd3b0, statuses=0x7fffbab81780) at pwaitall.c:70
#7  0x00002b09eac69c65 in MPI_Wait (request=0x7fffbab81838, status=0x7fffbab81820) at mrmpi_p2p.c:3330
#8  0x00002b09eac6a1aa in mpi_wait_ (request=0x7fffbab81948, status=0x7fffbab81920, ierror=0x7fffbab81958) at mrmpi_p2p.c:3418
#9  0x000000000040476c in conj_grad (colidx=0x608a40, rowstr=0x41c1f7c, x=0x1c4b6298, z=0x1c624608, a=0x14d43820, p=0x1c792978, q=0x1c900ce8, r=0x1ca6f058, w=0x1cbdd3c8, rnorm=@0x7fffbab81dd0, l2npcols=@0x7fffbab81e2c, reduce_exch_proc=0x7fffbab81d50, reduce_send_starts=0x7fffbab81cf0, reduce_send_lengths=0x7fffbab81d30, reduce_recv_starts=0x7fffbab81d10, reduce_recv_lengths=0x7fffbab81d70) at cg.f:1295
#10 0x0000000000402271 in cg_unit () at cg.f:502
#11 0x000000000040181b in MAIN__ () at cg.f:56
#12 0x0000000000406e8e in main ()

Regards,
Kishor

