Here’s what happens using a debug build:

[raijin7:22225] ompi_comm_peer_lookup: invalid peer index (2)
[raijin7:22225:0:22225] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x8)

/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_comm.h:
 [ mca_pml_ob1_peer_lookup() ]
     ...
      75             mca_pml_ob1_comm_proc_t* proc = OBJ_NEW(mca_pml_ob1_comm_proc_t);
      76             proc->ompi_proc = ompi_comm_peer_lookup (comm, rank);
      77             OBJ_RETAIN(proc->ompi_proc);
==>    78             opal_atomic_wmb ();
      79             pml_comm->procs[rank] = proc;
      80         }
      81         OPAL_THREAD_UNLOCK(&pml_comm->proc_lock);

==== backtrace ====
0 0x0000000000017505 mca_pml_ob1_peer_lookup()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_comm.h:78
1 0x0000000000019119 mca_pml_ob1_recv_frag_callback_match()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:361
2 0x00000000000052d7 mca_btl_vader_check_fboxes()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_fbox.h:208
3 0x00000000000077fd mca_btl_vader_component_progress()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_component.c:689
4 0x000000000002ff90 opal_progress()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/../../../../opal/runtime/opal_progress.c:228
5 0x000000000003b168 ompi_sync_wait_mt()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/opal/../../../../opal/threads/wait_sync.c:85
6 0x000000000005cd64 ompi_request_wait_completion()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/../../../../ompi/request/request.h:403
7 0x000000000005ce28 ompi_request_default_wait()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/../../../../ompi/request/req_wait.c:42
8 0x00000000001142d9 ompi_coll_base_sendrecv_zero()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_barrier.c:64
9 0x0000000000114763 ompi_coll_base_barrier_intra_recursivedoubling()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_barrier.c:215
10 0x0000000000004cad ompi_coll_tuned_barrier_intra_dec_fixed()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mca/coll/tuned/../../../../../../../ompi/mca/coll/tuned/coll_tuned_decision_fixed.c:212
11 0x00000000000831ac PMPI_Barrier()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-1/ompi/mpi/c/profile/pbarrier.c:63
12 0x0000000000044041 ompi_barrier_f()  
/short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/intel/debug-1/ompi/mpi/fortran/mpif-h/profile/pbarrier_f.c:76
13 0x00000000005c79de comms_barrier()  
/short/z00/aab900/onetep/src/comms_mod.F90:1543
14 0x00000000005c79de comms_bcast_logical_0()  
/short/z00/aab900/onetep/src/comms_mod.F90:10756
15 0x0000000001c21509 utils_devel_code_logical()  
/short/z00/aab900/onetep/src/utils_mod.F90:2646
16 0x0000000001309ddb multigrid_bc_for_dlmg()  
/short/z00/aab900/onetep/src/multigrid_methods_mod.F90:260
17 0x0000000001309ddb multigrid_initialise()  
/short/z00/aab900/onetep/src/multigrid_methods_mod.F90:174
18 0x0000000000f0c885 hartree_via_multigrid()  
/short/z00/aab900/onetep/src/hartree_mod.F90:181
19 0x0000000000a0c62a electronic_init_pot()  
/short/z00/aab900/onetep/src/electronic_init_mod.F90:1123
20 0x0000000000a14d62 electronic_init_denskern()  
/short/z00/aab900/onetep/src/electronic_init_mod.F90:334
21 0x0000000000a50136 energy_and_force_calculate()  
/short/z00/aab900/onetep/src/energy_and_force_mod.F90:1702
22 0x00000000014f46e7 onetep()  /short/z00/aab900/onetep/src/onetep.F90:277
23 0x000000000041465e main()  ???:0
24 0x000000000001ed1d __libc_start_main()  ???:0
25 0x0000000000414569 _start()  ???:0
===================
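
For what it’s worth, the "invalid peer index (2)" message together with the fault address of 0x8 looks consistent with ompi_comm_peer_lookup() returning NULL and the subsequent OBJ_RETAIN() dereferencing that NULL at a small member offset. That’s only a guess on my part. The self-contained sketch below uses hypothetical types and names (it is not the Open MPI code) purely to illustrate that pattern:

    /* Standalone sketch (hypothetical types, not Open MPI code) of the suspected
     * pattern: a lookup that returns NULL for an out-of-range peer index,
     * followed by an unchecked member dereference, faults at a small offset. */
    #include <stdio.h>

    struct peer {
        long         refcount;   /* offset 0 */
        struct peer *proc;       /* offset 8 on LP64, so reading it through a
                                    NULL base touches address 0x8 */
    };

    static struct peer *peer_lookup(struct peer **table, int size, int rank)
    {
        if (rank < 0 || rank >= size) {
            fprintf(stderr, "peer_lookup: invalid peer index (%d)\n", rank);
            return NULL;         /* the caller has to check for this */
        }
        return table[rank];
    }

    int main(void)
    {
        struct peer *table[2] = { NULL, NULL };
        struct peer *p = peer_lookup(table, 2, 2);   /* index 2 is out of range */

        if (p == NULL) {         /* without this check, p->proc would be read
                                    from address 0x8 */
            return 1;
        }
        printf("proc = %p\n", (void *)p->proc);
        return 0;
    }

Without the NULL check in main(), the read of p->proc would be from address 0x8, which matches the fault address the debug build reports above.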



> On 12 Jul 2018, at 1:36 pm, Ben Menadue <ben.mena...@nci.org.au> wrote:
> 
> Hi,
> 
> Perhaps related: we’re seeing this one with 3.1.1. I’ll see if I can get the 
> application to run against our --enable-debug build.
> 
> Cheers,
> Ben
> 
> [raijin7:1943 :0:1943] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x45)
> 
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:
>  [ append_frag_to_ordered_list() ]
>      ...
>      118      * account for this rollover or the matching will fail.
>      119      * Extract the items from the list to order them safely */
>      120     if( hdr->hdr_seq < prior->hdr.hdr_match.hdr_seq ) {
> ==>   121         uint16_t d1, d2 = prior->hdr.hdr_match.hdr_seq - hdr->hdr_seq;
>      122         do {
>      123             d1 = d2;
>      124             prior = (mca_pml_ob1_recv_frag_t*)(prior->super.super.opal_list_prev);
> 
> ==== backtrace ====
> 0 0x0000000000012d5f append_frag_to_ordered_list()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:121
> 1 0x0000000000013a06 mca_pml_ob1_recv_frag_callback_match()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/pml/ob1/../../../../../../../ompi/mca/pml/ob1/pml_ob1_recvfrag.c:390
> 2 0x00000000000044ef mca_btl_vader_check_fboxes()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_fbox.h:208
> 3 0x000000000000602f mca_btl_vader_component_progress()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/mca/btl/vader/../../../../../../../opal/mca/btl/vader/btl_vader_component.c:689
> 4 0x000000000002b554 opal_progress()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/../../../../opal/runtime/opal_progress.c:228
> 5 0x00000000000331cc ompi_sync_wait_mt()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/opal/../../../../opal/threads/wait_sync.c:85
> 6 0x000000000004a989 ompi_request_wait_completion()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/../../../../ompi/request/request.h:403
> 7 0x000000000004aa1d ompi_request_default_wait()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/../../../../ompi/request/req_wait.c:42
> 8 0x00000000000d3486 ompi_coll_base_sendrecv_actual()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_util.c:59
> 9 0x00000000000d0d2b ompi_coll_base_sendrecv()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_util.h:67
> 10 0x00000000000d14c7 ompi_coll_base_allgather_intra_recursivedoubling()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/../../../../../../ompi/mca/coll/base/coll_base_allgather.c:329
> 11 0x00000000000056dc ompi_coll_tuned_allgather_intra_dec_fixed()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mca/coll/tuned/../../../../../../../ompi/mca/coll/tuned/coll_tuned_decision_fixed.c:551
> 12 0x000000000006185d PMPI_Allgather()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/gcc/debug-0/ompi/mpi/c/profile/pallgather.c:122
> 13 0x000000000004362c ompi_allgather_f()  
> /short/z00/bjm900/build/openmpi-mofed4.2/openmpi-3.1.1/build/intel/debug-0/ompi/mpi/fortran/mpif-h/profile/pallgather_f.c:86
> 14 0x00000000005ed3cb comms_allgather_integer_0()  
> /short/z00/aab900/onetep/src/comms_mod.F90:14795
> 15 0x0000000001309fe1 multigrid_bc_for_dlmg()  
> /short/z00/aab900/onetep/src/multigrid_methods_mod.F90:270
> 16 0x0000000001309fe1 multigrid_initialise()  
> /short/z00/aab900/onetep/src/multigrid_methods_mod.F90:174
> 17 0x0000000000f0c885 hartree_via_multigrid()  
> /short/z00/aab900/onetep/src/hartree_mod.F90:181
> 18 0x0000000000a0c62a electronic_init_pot()  
> /short/z00/aab900/onetep/src/electronic_init_mod.F90:1123
> 19 0x0000000000a14d62 electronic_init_denskern()  
> /short/z00/aab900/onetep/src/electronic_init_mod.F90:334
> 20 0x0000000000a50136 energy_and_force_calculate()  
> /short/z00/aab900/onetep/src/energy_and_force_mod.F90:1702
> 21 0x00000000014f46e7 onetep()  /short/z00/aab900/onetep/src/onetep.F90:277
> 22 0x000000000041465e main()  ???:0
> 23 0x000000000001ed1d __libc_start_main()  ???:0
> 24 0x0000000000414569 _start()  ???:0
> ===================
> -------------------------------------------------------
> Primary job  terminated normally, but 1 process returned
> a non-zero exit code. Per user-direction, the job has been aborted.
> -------------------------------------------------------
> forrtl: error (78): process killed (SIGTERM)
> Image              PC                Routine            Line        Source
> onetep.nci         0000000001DCC6DE  Unknown               Unknown  Unknown
> libpthread-2.12.s  00002B6D46ED07E0  Unknown               Unknown  Unknown
> libmlx4-rdmav2.so  00002B6D570E3B18  Unknown               Unknown  Unknown
> --------------------------------------------------------------------------
> mpirun noticed that process rank 0 with PID 0 on node raijin7 exited on 
> signal 11 (Segmentation fault).
> --------------------------------------------------------------------------
> 
> 
> 
> 
>> On 12 Jul 2018, at 8:16 am, Nathan Hjelm via users <users@lists.open-mpi.org> wrote:
>> 
>> It might also be worth testing a master snapshot to see if that fixes the 
>> issue. There are a couple of fixes being backported from master to v3.0.x 
>> and v3.1.x now.
>> 
>> -Nathan
>> 
>> On Jul 11, 2018, at 03:16 PM, Noam Bernstein <noam.bernst...@nrl.navy.mil> wrote:
>> 
>>>> On Jul 11, 2018, at 11:29 AM, Jeff Squyres (jsquyres) via users <users@lists.open-mpi.org> wrote:
>>>> Ok, that would be great -- thanks.
>>>> 
>>>> Recompiling Open MPI with --enable-debug will turn on several 
>>>> debugging/sanity checks inside Open MPI, and it will also enable debugging 
>>>> symbols.  Hence, if you can get a failure with a debug Open MPI build, it 
>>>> might give you a core file that can be used to get a more detailed stack 
>>>> trace, poke around and see if there's a NULL pointer somewhere, etc.
>>> 
>>> I haven’t tried to get a core file yet, but it’s not producing any more 
>>> info from the runtime stack trace, despite configuring with --enable-debug:
>>> 
>>> Image              PC                Routine            Line        Source
>>> vasp.gamma_para.i  0000000002DCE8C1  Unknown               Unknown  Unknown
>>> vasp.gamma_para.i  0000000002DCC9FB  Unknown               Unknown  Unknown
>>> vasp.gamma_para.i  0000000002D409E4  Unknown               Unknown  Unknown
>>> vasp.gamma_para.i  0000000002D407F6  Unknown               Unknown  Unknown
>>> vasp.gamma_para.i  0000000002CDCED9  Unknown               Unknown  Unknown
>>> vasp.gamma_para.i  0000000002CE3DB6  Unknown               Unknown  Unknown
>>> libpthread-2.12.s  0000003F8E60F7E0  Unknown               Unknown  Unknown
>>> mca_btl_vader.so   00002B1AFA5FAC30  Unknown               Unknown  Unknown
>>> mca_btl_vader.so   00002B1AFA5FD00D  Unknown               Unknown  Unknown
>>> libopen-pal.so.40  00002B1AE884327C  opal_progress         Unknown  Unknown
>>> mca_pml_ob1.so     00002B1AFB855DCE  Unknown               Unknown  Unknown
>>> mca_pml_ob1.so     00002B1AFB858305  mca_pml_ob1_send      Unknown  Unknown
>>> libmpi.so.40.10.1  00002B1AE823A5DA  ompi_coll_base_al     Unknown  Unknown
>>> mca_coll_tuned.so  00002B1AFC6F0842  ompi_coll_tuned_a     Unknown  Unknown
>>> libmpi.so.40.10.1  00002B1AE81B66F5  PMPI_Allreduce        Unknown  Unknown
>>> libmpi_mpifh.so.4  00002B1AE7F2259B  mpi_allreduce_        Unknown  Unknown
>>> vasp.gamma_para.i  000000000042D1ED  m_sum_d_                 1300  mpi.F
>>> vasp.gamma_para.i  000000000089947D  nonl_mp_vnlacc_.R        1754  nonl.F
>>> vasp.gamma_para.i  0000000000972C51  hamil_mp_hamiltmu         825  hamil.F
>>> vasp.gamma_para.i  0000000001BD2608  david_mp_eddav_.R         419  davidson.F
>>> vasp.gamma_para.i  0000000001D2179E  elmin_.R                  424  electron.F
>>> vasp.gamma_para.i  0000000002B92452  vamp_IP_electroni        4783  main.F
>>> vasp.gamma_para.i  0000000002B6E173  MAIN__                   2800  main.F
>>> vasp.gamma_para.i  000000000041325E  Unknown               Unknown  Unknown
>>> libc-2.12.so       0000003F8E21ED1D  __libc_start_main     Unknown  Unknown
>>> vasp.gamma_para.i  0000000000413169  Unknown               Unknown  Unknown
>>> 
>>> This is the configure line that was supposedly used to create the library:
>>>   ./configure 
>>> --prefix=/usr/local/openmpi/3.1.1_debug/x86_64/ib/intel/11.1.080 
>>> --with-tm=/usr/local/torque --enable-mpirun-prefix-by-default 
>>> --with-verbs=/usr --with-verbs-libdir=/usr/lib64 --enable-debug
>>> 
>>> Is there any way I can confirm that the version of the Open MPI library I 
>>> think I’m using really was compiled with debugging?
>>> 
>>>  Noam
>>> 
>>> 
>>> Noam Bernstein, Ph.D.
>>> Center for Materials Physics and Technology
>>> U.S. Naval Research Laboratory
>>> T +1 202 404 8628  F +1 202 404 7546
>>> https://www.nrl.navy.mil

_______________________________________________
users mailing list
users@lists.open-mpi.org
https://lists.open-mpi.org/mailman/listinfo/users
