This looks like a segfault in mpirun itself -- can you file an issue on GitHub so 
that we can track this?

Thanks.


> On Jun 28, 2016, at 3:33 AM, Potnuri Bharat Teja <bha...@chelsio.com> wrote:
> 
> Hi All,
> I am seeing the following segfault with openmpi-master.
> 
> 
> [root@maneybhanjang ~]# /usr/mpi/gcc/openmpi-2.0-dev/bin/mpirun
> --allow-run-as-root --hostfile /root/mpd.hosts -np 8 --prefix
> /usr/mpi/gcc/openmpi-2.0-dev/ --map-by node --display-allocation
> --oversubscribe --mca btl openib,sm,self
> /usr/mpi/gcc/openmpi-2.0-dev/tests/IMB/IMB-MPI1
> 
> ======================   ALLOCATED NODES   ======================
> maneybhanjang: flags=0x01 slots=8 max_slots=0 slots_inuse=0 state=UP
> 10.193.184.162: flags=0x03 slots=4 max_slots=0 slots_inuse=0 state=UNKNOWN
> =================================================================
>               [maneybhanjang:28532] *** Process received signal ***
>               [maneybhanjang:28532] Signal: Segmentation fault (11)
>               [maneybhanjang:28532] Signal code: Invalid permissions (2)
>               [maneybhanjang:28532] Failing at address: 0x106ca70
>               [maneybhanjang:28532] [ 0]
>               /lib64/libpthread.so.0[0x3aea40f710]
>               [maneybhanjang:28532] [ 1] [0x106ca70]
>               [maneybhanjang:28532] *** End of error message ***
>               [tonglu:02068] *** Process received signal ***
>               [tonglu:02068] Signal: Segmentation fault (11)
>               [tonglu:02068] Signal code: Invalid permissions (2)
>               [tonglu:02068] Failing at address: 0x2478500
>               [tonglu:02068] [ 0] /lib64/libpthread.so.0[0x3ef5c0f710]
>               [tonglu:02068] [ 1] [0x2478500]
>               [tonglu:02068] *** End of error message ***
>               bash: line 1:  2068 Segmentation fault      (core
>               dumped) /usr/mpi/gcc/openmpi-2.0-dev/bin/orted
>               --hnp-topo-sig 0N:2S:0L3:4L2:8L1:8C:8H:x86_64 -mca ess
>               "env" -mca ess_base_jobid "3921674240" -mca
>               ess_base_vpid 1 -mca ess_base_num_procs "2" -mca
>               orte_hnp_uri
>               
> "3921674240.0;usock;tcp://10.193.184.161,102.1.1.161,102.2.2.161:43160"
>               --mca btl "openib,sm,self" -mca plm "rsh" -mca
>               rmaps_base_mapping_policy "node" -mca orte_display_alloc
>               "1" -mca rmaps_base_oversubscribe "1"
>               Segmentation fault (core dumped)
> [root@maneybhanjang ~]# dmesg
> mpirun[28532]: segfault at 106ca70 ip 000000000106ca70 sp 00007fffc00a7f28 
> error 15
> 
> The segfault is seen on the other peer too.
> [root@tonglu ~]# dmesg
> orted[2068]: segfault at 2478500 ip 0000000002478500 sp 00007fff521c2e68 
> error 15
> 
> Running gdb on the core dump points me to orted/pmix/pmix_server_gen.c:80.
> The backtrace follows.
> [root@maneybhanjang ~]# gdb /usr/mpi/gcc/openmpi-2.0-dev/bin/mpirun core.28532
> Program terminated with signal 11, Segmentation fault.
> #0  0x000000000106ca70 in ?? ()
> Missing separate debuginfos, use: debuginfo-install
> glibc-2.12-1.149.el6.x86_64 libgcc-4.4.7-11.el6.x86_64
> libudev-147-2.57.el6.x86_64
> (gdb) bt
> #0  0x000000000106ca70 in ?? ()
> #1  0x00002b217f7a43aa in _client_conn (sd=-1, args=4,
> cbdata=0x2b2188022260)
>    at orted/pmix/pmix_server_gen.c:80
> #2  0x00002b217fad5a7c in event_process_active_single_queue
>    (base=0xfcc730, flags=1)
>        at event.c:1370
> #3  event_process_active (base=0xfcc730, flags=1) at
>       event.c:1440
> #4  opal_libevent2022_event_base_loop (base=0xfcc730, flags=1)
>       at event.c:1644
> #5  0x00000000004014d3 in orterun (argc=16, argv=0x7fffc00a81e8)
>       at orterun.c:192
> #6  0x0000000000400f04 in main (argc=16, argv=0x7fffc00a81e8) at
>       main.c:13
> (gdb) frame
> #0  0x000000000106ca70 in ?? ()
> (gdb) up
> #1  0x00002b217f7a43aa in _client_conn (sd=-1, args=4,
> cbdata=0x2b2188022260) at orted/pmix/pmix_server_gen.c:80
>           80              cd->cbfunc(OPAL_SUCCESS, cd->cbdata);
> 
> 
> Here is the backtrace from the peer machine, pointing to the same line:
> 
> [root@tonglu ~]# gdb /usr/mpi/gcc/openmpi-2.0-dev/bin/orted core.2068
> Program terminated with signal 11, Segmentation fault.
> #0  0x0000000002478500 in ?? ()
> Missing separate debuginfos, use: debuginfo-install
> glibc-2.12-1.149.el6.x86_64 libgcc-4.4.7-11.el6.x86_64
> libudev-147-2.57.el6.x86_64 numactl-2.0.9-2.el6.x86_64
> (gdb) bt
> #0  0x0000000002478500 in ?? ()
> #1  0x00002af4511433ba in _client_conn (sd=-1, args=4,
> cbdata=0x2af458022260)
>    at orted/pmix/pmix_server_gen.c:80
> #2  0x00002af451474cac in event_process_active_single_queue
>    (base=0x2408e90, flags=1)
>        at event.c:1370
> #3  event_process_active (base=0x2408e90, flags=1) at
>       event.c:1440
> #4  opal_libevent2022_event_base_loop (base=0x2408e90, flags=1)
>       at event.c:1644
> #5  0x00002af451123c57 in orte_daemon (argc=33,
>       argv=0x7fff521c33d8)
>           at orted/orted_main.c:859
> #6  0x000000000040081a in main (argc=33,
>           argv=0x7fff521c33d8) at orted.c:60
> (gdb) frame
> #0  0x0000000002478500 in ?? ()
> (gdb) up
> #1  0x00002af4511433ba in _client_conn (sd=-1, args=4,
>    cbdata=0x2af458022260)
> at orted/pmix/pmix_server_gen.c:80
>               80              cd->cbfunc(OPAL_SUCCESS, cd->cbdata);
> 
> I am using the top of tree (ToT) of openmpi-master:
> commit 5795682aa56ce8f22e518462b22cfee49d407216
> Merge: 5d32282 1bb7788
> Author: Joshua Ladd <jladd.m...@gmail.com>
> Date:   Mon Jun 27 12:59:20 2016 -0400
> Merge pull request #1817 from shamisp/topic/oshmem_init
> OSHMEM: Removing erroneous initialization check
> 
> I am happy to provide any further information and would appreciate any 
> suggestions regarding the issue.
> 
> Thanks,
> Bharat.
> _______________________________________________
> devel mailing list
> de...@open-mpi.org
> Subscription: https://www.open-mpi.org/mailman/listinfo.cgi/devel
> Link to this post: 
> http://www.open-mpi.org/community/lists/devel/2016/06/19137.php


-- 
Jeff Squyres
jsquy...@cisco.com
For corporate legal information go to: 
http://www.cisco.com/web/about/doing_business/legal/cri/

Reply via email to