Hello all,
Any help would be appreciated on the problem below.
An Open MPI 1.4.2 run intermittently stalls at startup, before any output is written to the console. It does not happen every time; in this case it occurred on the 15th run of IMB-MPI1 version 3.2. Another test is running on the eth0 interface at the same time as this test. It is not immediately obvious why the test has been stalled for 9 hours, and it would presumably remain stalled indefinitely unless we intervene manually.
All of the processes are in an interruptible sleep state. However, the ranks that orte-ps reports in the Undef state have no process started. The sleeping processes include the one on n016 (reported by orte-ps as Launched rather than Running), which appears to be the problem node.
The job was launched with the command: /usr/bin/mpirun --mca pml ob1 -mca btl tcp,self -mca btl_tcp_if_include eth1 -np 80 -machinefile /cluster/pallas/x86_64-eth1/machines /tmp/IMB-MPI1 -npmin 80
The machinefile is a list of 80 nodes, n001-n080.
strace output from head node:
[root@n001 ~]# strace -p 7308 #mpirun process
Process 7308 attached - interrupt to quit
poll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}, {fd=13, events=POLLIN}, {fd=12, events=POLLIN}, {fd=14, events=POLLIN}, {fd=15, events=POLLIN}, {fd=17, events=POLLIN}, {fd=16, events=POLLIN}, {fd=18, events=POLLIN}, {fd=19, events=POLLIN}, {fd=20, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=25, events=POLLIN}, {fd=26, events=POLLIN}, {fd=27, events=POLLIN}, {fd=28, events=POLLIN}, {fd=29, events=POLLIN}, {fd=30, events=POLLIN}, {fd=31, events=POLLIN}, {fd=32, events=POLLIN}, {fd=33, events=POLLIN}, {fd=34, events=POLLIN}, {fd=35, events=POLLIN}, {fd=36, events=POLLIN}, {fd=37, events=POLLIN}, ...], 88, 1000) = 0 (Timeout)
poll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=9, events=POLLIN}, {fd=11, events=POLLIN}, {fd=13, events=POLLIN}, {fd=12, events=POLLIN}, {fd=14, events=POLLIN}, {fd=15, events=POLLIN}, {fd=17, events=POLLIN}, {fd=16, events=POLLIN}, {fd=18, events=POLLIN}, {fd=19, events=POLLIN}, {fd=20, events=POLLIN}, {fd=21, events=POLLIN}, {fd=22, events=POLLIN}, {fd=23, events=POLLIN}, {fd=24, events=POLLIN}, {fd=25, events=POLLIN}, {fd=26, events=POLLIN}, {fd=27, events=POLLIN}, {fd=28, events=POLLIN}, {fd=29, events=POLLIN}, {fd=30, events=POLLIN}, {fd=31, events=POLLIN}, {fd=32, events=POLLIN}, {fd=33, events=POLLIN}, {fd=34, events=POLLIN}, {fd=35, events=POLLIN}, {fd=36, events=POLLIN}, {fd=37, events=POLLIN}, ...], 88, 1000 <unfinished ...>
Process 7308 detached
[root@n001 ~]# strace -p 7389 #IMB-MPI1 process
Process 7389 attached - interrupt to quit
sched_yield() = 0
poll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}], 6, 1000) = 0 (Timeout)
sched_yield() = 0
poll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=7, events=POLLIN}, {fd=8, events=POLLIN}, {fd=9, events=POLLIN}], 6, 1000 <unfinished ...>
Process 7389 detached
lsof output from mpirun on n001:
mpirun 7308 root cwd DIR 0,24 4096 616382599 /cluster/pallas/x86_64-eth1 (10.16.0.1:/storage/cluster)
mpirun 7308 root rtd DIR 8,1 4096 2 /
mpirun 7308 root txt REG 8,1 60599 7063963 /usr/bin/orterun
mpirun 7308 root mem REG 8,1 139416 48628016 /lib64/ld-2.5.so
mpirun 7308 root mem REG 8,1 1717800 48628017 /lib64/libc-2.5.so
mpirun 7308 root mem REG 8,1 23360 48628018 /lib64/libdl-2.5.so
mpirun 7308 root mem REG 8,1 615136 48628023 /lib64/libm-2.5.so
mpirun 7308 root mem REG 8,1 145824 48628022 /lib64/libpthread-2.5.so
mpirun 7308 root mem REG 8,1 114352 48627895 /lib64/libnsl-2.5.so
mpirun 7308 root mem REG 8,1 92736 48628041 /lib64/libresolv-2.5.so
mpirun 7308 root mem REG 8,1 18152 48628034 /lib64/libutil-2.5.so
mpirun 7308 root DEL REG 8,1 7064031 /usr/lib64/libopen-rte.so.0.0.0.#prelink#.GqONGz
mpirun 7308 root DEL REG 8,1 7064027 /usr/lib64/libopen-pal.so.0.0.0.#prelink#.TE3OgZ
mpirun 7308 root mem REG 8,1 64448 7538208 /usr/lib64/openmpi/mca_paffinity_linux.so
mpirun 7308 root mem REG 8,1 18544 7538140 /usr/lib64/openmpi/mca_carto_auto_detect.so
mpirun 7308 root mem REG 8,1 51802 7538166 /usr/lib64/openmpi/mca_ess_hnp.so
mpirun 7308 root mem REG 8,1 92258 7538210 /usr/lib64/openmpi/mca_plm_rsh.so
mpirun 7308 root mem REG 8,1 114184 7538236 /usr/lib64/openmpi/mca_rml_oob.so
mpirun 7308 root mem REG 8,1 221031 7538202 /usr/lib64/openmpi/mca_oob_tcp.so
mpirun 7308 root mem REG 8,1 62331 7538238 /usr/lib64/openmpi/mca_routed_binomial.so
mpirun 7308 root mem REG 8,1 43713 7538176 /usr/lib64/openmpi/mca_grpcomm_bad.so
mpirun 7308 root mem REG 8,1 24891 7538162 /usr/lib64/openmpi/mca_errmgr_default.so
mpirun 7308 root mem REG 8,1 49797 7538200 /usr/lib64/openmpi/mca_odls_default.so
mpirun 7308 root mem REG 8,1 27404 7538232 /usr/lib64/openmpi/mca_rmaps_round_robin.so
mpirun 7308 root mem REG 8,1 53880 48627739 /lib64/libnss_files-2.5.so
mpirun 7308 root mem REG 8,1 101686 7538182 /usr/lib64/openmpi/mca_iof_hnp.so
mpirun 7308 root mem REG 8,1 70112 7538174 /usr/lib64/openmpi/mca_filem_rsh.so
mpirun 7308 root mem REG 8,1 23736 48627737 /lib64/libnss_dns-2.5.so
mpirun 7308 root 0u CHR 136,0 2 /dev/pts/0
mpirun 7308 root 1w FIFO 0,6 232900 pipe
mpirun 7308 root 2w FIFO 0,6 232900 pipe
mpirun 7308 root 3u unix 0xffff8102fee4fa00 232901 socket
mpirun 7308 root 4u unix 0xffff8102fee50180 232902 socket
mpirun 7308 root 5u IPv4 232904 TCP *:54075 (LISTEN)
mpirun 7308 root 6u IPv6 232905 TCP *:59821 (LISTEN)
mpirun 7308 root 7r FIFO 0,6 232912 pipe
mpirun 7308 root 8w FIFO 0,6 232912 pipe
mpirun 7308 root 9r FIFO 0,6 232913 pipe
mpirun 7308 root 10w FIFO 0,6 232913 pipe
mpirun 7308 root 11u IPv4 235043 TCP n001:54075->n040.cluster1.e1350:55283 (ESTABLISHED)
mpirun 7308 root 12u IPv4 235045 TCP n001:54075->n039.cluster1.e1350:47537 (ESTABLISHED)
mpirun 7308 root 13u IPv4 235046 TCP n001:54075->n041.cluster1.e1350:34625 (ESTABLISHED)
mpirun 7308 root 14u IPv4 235048 TCP n001:54075->n042.cluster1.e1350:57469 (ESTABLISHED)
mpirun 7308 root 15u IPv4 235050 TCP n001:54075->n036.cluster1.e1350:48121 (ESTABLISHED)
mpirun 7308 root 16u IPv4 235052 TCP n001:54075->n037.cluster1.e1350:43462 (ESTABLISHED)
mpirun 7308 root 17u IPv4 235054 TCP n001:54075->n038.cluster1.e1350:44078 (ESTABLISHED)
mpirun 7308 root 18u IPv4 235056 TCP n001:54075->n035.cluster1.e1350:34846 (ESTABLISHED)
mpirun 7308 root 19u IPv4 235066 TCP n001:54075->n004.cluster1.e1350:35097 (ESTABLISHED)
mpirun 7308 root 20u IPv4 235068 TCP n001:54075->n003.cluster1.e1350:56015 (ESTABLISHED)
mpirun 7308 root 21u IPv4 235070 TCP n001:54075->n005.cluster1.e1350:39706 (ESTABLISHED)
mpirun 7308 root 22u IPv4 235072 TCP n001:54075->n006.cluster1.e1350:50717 (ESTABLISHED)
mpirun 7308 root 23u IPv4 235074 TCP n001:54075->n007.cluster1.e1350:59961 (ESTABLISHED)
mpirun 7308 root 24u IPv4 235076 TCP n001:54075->n009.cluster1.e1350:57906 (ESTABLISHED)
mpirun 7308 root 25u IPv4 235078 TCP n001:54075->n008.cluster1.e1350:38919 (ESTABLISHED)
mpirun 7308 root 26u IPv4 235080 TCP n001:54075->n012.cluster1.e1350:59023 (ESTABLISHED)
mpirun 7308 root 27u IPv4 235082 TCP n001:54075->n011.cluster1.e1350:56019 (ESTABLISHED)
mpirun 7308 root 28u IPv4 235084 TCP n001:54075->n022.cluster1.e1350:54015 (ESTABLISHED)
mpirun 7308 root 29u IPv4 235086 TCP n001:54075->n018.cluster1.e1350:33217 (ESTABLISHED)
mpirun 7308 root 30u IPv4 235088 TCP n001:54075->n020.cluster1.e1350:41843 (ESTABLISHED)
mpirun 7308 root 31u IPv4 235090 TCP n001:54075->n028.cluster1.e1350:49379 (ESTABLISHED)
mpirun 7308 root 32u IPv4 235092 TCP n001:54075->n002.cluster1.e1350:41220 (ESTABLISHED)
mpirun 7308 root 33u IPv4 235094 TCP n001:54075->n025.cluster1.e1350:49443 (ESTABLISHED)
mpirun 7308 root 34u IPv4 235096 TCP n001:54075->n027.cluster1.e1350:56766 (ESTABLISHED)
mpirun 7308 root 35u IPv4 235098 TCP n001:54075->n015.cluster1.e1350:36566 (ESTABLISHED)
mpirun 7308 root 36u IPv4 235100 TCP n001:54075->n021.cluster1.e1350:59628 (ESTABLISHED)
mpirun 7308 root 37u IPv4 235102 TCP n001:54075->n033.cluster1.e1350:45705 (ESTABLISHED)
mpirun 7308 root 38u IPv4 235104 TCP n001:54075->n023.cluster1.e1350:37982 (ESTABLISHED)
mpirun 7308 root 39u IPv4 235106 TCP n001:54075->n014.cluster1.e1350:35627 (ESTABLISHED)
mpirun 7308 root 40u IPv4 235108 TCP n001:54075->n017.cluster1.e1350:39171 (ESTABLISHED)
mpirun 7308 root 41u IPv4 235110 TCP n001:54075->n029.cluster1.e1350:46895 (ESTABLISHED)
mpirun 7308 root 42u IPv4 235112 TCP n001:54075->n031.cluster1.e1350:38771 (ESTABLISHED)
mpirun 7308 root 43u IPv4 235114 TCP n001:54075->n030.cluster1.e1350:35790 (ESTABLISHED)
mpirun 7308 root 44u IPv4 235116 TCP n001:54075->n016.cluster1.e1350:40315 (ESTABLISHED)
mpirun 7308 root 45u IPv4 235118 TCP n001:54075->n026.cluster1.e1350:39845 (ESTABLISHED)
mpirun 7308 root 46u IPv4 235120 TCP n001:54075->n013.cluster1.e1350:41310 (ESTABLISHED)
mpirun 7308 root 47u IPv4 235122 TCP n001:54075->n019.cluster1.e1350:53389 (ESTABLISHED)
mpirun 7308 root 48u IPv4 235124 TCP n001:54075->n024.cluster1.e1350:41917 (ESTABLISHED)
mpirun 7308 root 49u IPv4 235126 TCP n001:54075->n057.cluster1.e1350:44308 (ESTABLISHED)
mpirun 7308 root 50u IPv4 235128 TCP n001:54075->n056.cluster1.e1350:44379 (ESTABLISHED)
mpirun 7308 root 51u IPv4 235130 TCP n001:54075->n032.cluster1.e1350:52503 (ESTABLISHED)
mpirun 7308 root 52u IPv4 235132 TCP n001:54075->n054.cluster1.e1350:58904 (ESTABLISHED)
mpirun 7308 root 53u IPv4 235134 TCP n001:54075->n059.cluster1.e1350:57729 (ESTABLISHED)
mpirun 7308 root 54u IPv4 235136 TCP n001:54075->n043.cluster1.e1350:56650 (ESTABLISHED)
mpirun 7308 root 55u IPv4 235138 TCP n001:54075->n045.cluster1.e1350:37641 (ESTABLISHED)
mpirun 7308 root 56u IPv4 235140 TCP n001:54075->n062.cluster1.e1350:46471 (ESTABLISHED)
mpirun 7308 root 57u IPv4 235142 TCP n001:54075->n061.cluster1.e1350:58095 (ESTABLISHED)
mpirun 7308 root 58u IPv4 235144 TCP n001:54075->n044.cluster1.e1350:51446 (ESTABLISHED)
mpirun 7308 root 59u IPv4 235146 TCP n001:54075->n053.cluster1.e1350:49553 (ESTABLISHED)
mpirun 7308 root 60u IPv4 235148 TCP n001:54075->n048.cluster1.e1350:35878 (ESTABLISHED)
mpirun 7308 root 61u IPv4 235150 TCP n001:54075->n010.cluster1.e1350:42228 (ESTABLISHED)
mpirun 7308 root 62u IPv4 235152 TCP n001:54075->n046.cluster1.e1350:58293 (ESTABLISHED)
mpirun 7308 root 63u IPv4 235154 TCP n001:54075->n034.cluster1.e1350:55908 (ESTABLISHED)
mpirun 7308 root 64u IPv4 235156 TCP n001:54075->n064.cluster1.e1350:44344 (ESTABLISHED)
mpirun 7308 root 65u IPv4 235158 TCP n001:54075->n047.cluster1.e1350:41773 (ESTABLISHED)
mpirun 7308 root 66u IPv4 235160 TCP n001:54075->n071.cluster1.e1350:52199 (ESTABLISHED)
mpirun 7308 root 67u IPv4 235162 TCP n001:54075->n065.cluster1.e1350:34375 (ESTABLISHED)
mpirun 7308 root 68u IPv4 235164 TCP n001:54075->n069.cluster1.e1350:38277 (ESTABLISHED)
mpirun 7308 root 69u IPv4 235166 TCP n001:54075->n067.cluster1.e1350:50562 (ESTABLISHED)
mpirun 7308 root 70u IPv4 235168 TCP n001:54075->n049.cluster1.e1350:50486 (ESTABLISHED)
mpirun 7308 root 71u IPv4 235170 TCP n001:54075->n055.cluster1.e1350:35862 (ESTABLISHED)
mpirun 7308 root 72u IPv4 235172 TCP n001:54075->n070.cluster1.e1350:34786 (ESTABLISHED)
mpirun 7308 root 73u IPv4 235174 TCP n001:54075->n052.cluster1.e1350:33843 (ESTABLISHED)
mpirun 7308 root 74u IPv4 235176 TCP n001:54075->n068.cluster1.e1350:45205 (ESTABLISHED)
mpirun 7308 root 75u IPv4 235178 TCP n001:54075->n060.cluster1.e1350:50364 (ESTABLISHED)
mpirun 7308 root 76u IPv4 235180 TCP n001:54075->n080.cluster1.e1350:42459 (ESTABLISHED)
mpirun 7308 root 77u IPv4 235182 TCP n001:54075->n072.cluster1.e1350:51882 (ESTABLISHED)
mpirun 7308 root 78u IPv4 235184 TCP n001:54075->n050.cluster1.e1350:45385 (ESTABLISHED)
mpirun 7308 root 79u IPv4 235186 TCP n001:54075->n051.cluster1.e1350:41801 (ESTABLISHED)
mpirun 7308 root 80u IPv4 235188 TCP n001:54075->n073.cluster1.e1350:58911 (ESTABLISHED)
mpirun 7308 root 81u IPv4 235190 TCP n001:54075->n077.cluster1.e1350:48042 (ESTABLISHED)
mpirun 7308 root 82u IPv4 235192 TCP n001:54075->n079.cluster1.e1350:41265 (ESTABLISHED)
mpirun 7308 root 83u IPv4 235194 TCP n001:54075->n074.cluster1.e1350:45921 (ESTABLISHED)
mpirun 7308 root 84u IPv4 235196 TCP n001:54075->n075.cluster1.e1350:53530 (ESTABLISHED)
mpirun 7308 root 85u IPv4 235198 TCP n001:54075->n063.cluster1.e1350:53179 (ESTABLISHED)
mpirun 7308 root 86u IPv4 235200 TCP n001:54075->n076.cluster1.e1350:54171 (ESTABLISHED)
mpirun 7308 root 87u IPv4 235202 TCP n001:54075->n058.cluster1.e1350:34660 (ESTABLISHED)
mpirun 7308 root 88u IPv4 235204 TCP n001:54075->n066.cluster1.e1350:41855 (ESTABLISHED)
mpirun 7308 root 89u IPv4 235206 TCP n001:54075->n078.cluster1.e1350:50203 (ESTABLISHED)
mpirun 7308 root 90u CHR 5,2 1016 /dev/ptmx
mpirun 7308 root 91u IPv4 235223 TCP 169.254.95.120:54075->169.254.95.120:52607 (ESTABLISHED)
mpirun 7308 root 93w FIFO 0,6 235209 pipe
mpirun 7308 root 94r FIFO 0,6 235210 pipe
mpirun 7308 root 96r FIFO 0,6 235211 pipe
grep 18423 root 1w REG 8,1 0 45449399 /tmp/lsof7308
lsof output from IMB-MPI1 process on n001:
IMB-MPI1 7389 root cwd DIR 0,24 4096 616382599 /cluster/pallas/x86_64-eth1 (10.16.0.1:/storage/cluster)
IMB-MPI1 7389 root rtd DIR 8,1 4096 2 /
IMB-MPI1 7389 root txt REG 8,1 78018 45449334 /tmp/IMB-MPI1
IMB-MPI1 7389 root mem REG 8,1 139416 48628016 /lib64/ld-2.5.so
IMB-MPI1 7389 root mem REG 8,1 1717800 48628017 /lib64/libc-2.5.so
IMB-MPI1 7389 root mem REG 8,1 23360 48628018 /lib64/libdl-2.5.so
IMB-MPI1 7389 root mem REG 8,1 615136 48628023 /lib64/libm-2.5.so
IMB-MPI1 7389 root mem REG 8,1 145824 48628022 /lib64/libpthread-2.5.so
IMB-MPI1 7389 root mem REG 8,1 114352 48627895 /lib64/libnsl-2.5.so
IMB-MPI1 7389 root mem REG 8,1 18152 48628034 /lib64/libutil-2.5.so
IMB-MPI1 7389 root mem REG 8,1 7064011 /usr/lib64/libmpi.so.0.0.2 (path inode=7063888)
IMB-MPI1 7389 root DEL REG 8,1 7064031 /usr/lib64/libopen-rte.so.0.0.0.#prelink#.GqONGz
IMB-MPI1 7389 root DEL REG 8,1 7064027 /usr/lib64/libopen-pal.so.0.0.0.#prelink#.TE3OgZ
IMB-MPI1 7389 root mem REG 8,1 64448 7538208 /usr/lib64/openmpi/mca_paffinity_linux.so
IMB-MPI1 7389 root mem REG 8,1 18544 7538140 /usr/lib64/openmpi/mca_carto_auto_detect.so
IMB-MPI1 7389 root mem REG 8,1 34144 7538164 /usr/lib64/openmpi/mca_ess_env.so
IMB-MPI1 7389 root mem REG 8,1 114184 7538236 /usr/lib64/openmpi/mca_rml_oob.so
IMB-MPI1 7389 root mem REG 8,1 221031 7538202 /usr/lib64/openmpi/mca_oob_tcp.so
IMB-MPI1 7389 root mem REG 8,1 62331 7538238 /usr/lib64/openmpi/mca_routed_binomial.so
IMB-MPI1 7389 root mem REG 8,1 43713 7538176 /usr/lib64/openmpi/mca_grpcomm_bad.so
IMB-MPI1 7389 root mem REG 8,1 53880 48627739 /lib64/libnss_files-2.5.so
IMB-MPI1 7389 root mem REG 8,1 33854 7538124 /usr/lib64/openmpi/mca_allocator_basic.so
IMB-MPI1 7389 root mem REG 8,1 34642 7538126 /usr/lib64/openmpi/mca_allocator_bucket.so
IMB-MPI1 7389 root mem REG 8,1 68562 7538226 /usr/lib64/openmpi/mca_rcache_vma.so
IMB-MPI1 7389 root mem REG 8,1 19414 7538190 /usr/lib64/openmpi/mca_mpool_fake.so
IMB-MPI1 7389 root mem REG 8,1 55970 7538192 /usr/lib64/openmpi/mca_mpool_rdma.so
IMB-MPI1 7389 root mem REG 8,1 33243 7538194 /usr/lib64/openmpi/mca_mpool_sm.so
IMB-MPI1 7389 root mem REG 8,1 43223 7064007 /usr/lib64/libmca_common_sm.so.1.0.0
IMB-MPI1 7389 root mem REG 8,1 634374 7538218 /usr/lib64/openmpi/mca_pml_ob1.so
IMB-MPI1 7389 root mem REG 8,1 73429 7538128 /usr/lib64/openmpi/mca_bml_r2.so
IMB-MPI1 7389 root mem REG 8,1 64556 7538134 /usr/lib64/openmpi/mca_btl_self.so
IMB-MPI1 7389 root mem REG 8,1 202493 7538138 /usr/lib64/openmpi/mca_btl_tcp.so
IMB-MPI1 7389 root mem REG 8,1 320592 7538144 /usr/lib64/openmpi/mca_coll_basic.so
IMB-MPI1 7389 root mem REG 8,1 142427 7538146 /usr/lib64/openmpi/mca_coll_hierarch.so
IMB-MPI1 7389 root mem REG 8,1 163269 7538148 /usr/lib64/openmpi/mca_coll_inter.so
IMB-MPI1 7389 root mem REG 8,1 123498 7538150 /usr/lib64/openmpi/mca_coll_self.so
IMB-MPI1 7389 root mem REG 8,1 127173 7538154 /usr/lib64/openmpi/mca_coll_sync.so
IMB-MPI1 7389 root mem REG 8,1 618120 7538156 /usr/lib64/openmpi/mca_coll_tuned.so
IMB-MPI1 7389 root mem REG 8,1 283187 7538204 /usr/lib64/openmpi/mca_osc_pt2pt.so
IMB-MPI1 7389 root mem REG 8,1 346264 7538206 /usr/lib64/openmpi/mca_osc_rdma.so
IMB-MPI1 7389 root 0r FIFO 0,6 235209 pipe
IMB-MPI1 7389 root 1u CHR 136,1 3 /dev/pts/1
IMB-MPI1 7389 root 2w FIFO 0,6 235210 pipe
IMB-MPI1 7389 root 3u unix 0xffff8101be73c400 235213 socket
IMB-MPI1 7389 root 4u unix 0xffff8101be73dcc0 235214 socket
IMB-MPI1 7389 root 5u IPv4 235216 TCP *:34867 (LISTEN)
IMB-MPI1 7389 root 6u IPv6 235217 TCP *:37714 (LISTEN)
IMB-MPI1 7389 root 7u IPv4 235222 TCP 169.254.95.120:52607->169.254.95.120:54075 (ESTABLISHED)
IMB-MPI1 7389 root 8u IPv4 235225 TCP *:cap (LISTEN)
IMB-MPI1 7389 root 9u IPv6 235226 TCP *:1024 (LISTEN)
IMB-MPI1 7389 root 97w FIFO 0,6 235211 pipe
grep 18438 root 1w REG 8,1 0 45449400 /tmp/lsof7389
strace output from n016:
[root@n016 ~]# strace -p 29430
Process 29430 attached - interrupt to quit
sched_yield() = 0
poll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=7, events=POLLIN}], 4, 1000) = 0 (Timeout)
sched_yield() = 0
poll([{fd=4, events=POLLIN}, {fd=5, events=POLLIN}, {fd=6, events=POLLIN}, {fd=7, events=POLLIN}], 4, 1000 <unfinished ...>
Process 29430 detached
lsof output from n016:
[root@n016 ~]# lsof | grep 29430
IMB-MPI1 29430 root cwd DIR 0,24 4096 616382599 /cluster/pallas/x86_64-eth1 (10.16.0.1:/storage/cluster)
IMB-MPI1 29430 root rtd DIR 8,1 4096 2 /
IMB-MPI1 29430 root txt REG 8,1 78018 23068790 /tmp/IMB-MPI1
IMB-MPI1 29430 root mem REG 8,1 139416 8487216 /lib64/ld-2.5.so
IMB-MPI1 29430 root mem REG 8,1 1717800 8487217 /lib64/libc-2.5.so
IMB-MPI1 29430 root mem REG 8,1 23360 8487218 /lib64/libdl-2.5.so
IMB-MPI1 29430 root mem REG 8,1 615136 8487223 /lib64/libm-2.5.so
IMB-MPI1 29430 root mem REG 8,1 145824 8487222 /lib64/libpthread-2.5.so
IMB-MPI1 29430 root mem REG 8,1 114352 8487095 /lib64/libnsl-2.5.so
IMB-MPI1 29430 root mem REG 8,1 18152 8487234 /lib64/libutil-2.5.so
IMB-MPI1 29430 root mem REG 8,1 25968851 /usr/lib64/libmpi.so.0.0.2 (path inode=25968728)
IMB-MPI1 29430 root DEL REG 8,1 25968871 /usr/lib64/libopen-rte.so.0.0.0.#prelink#.d0FGSq
IMB-MPI1 29430 root DEL REG 8,1 25968867 /usr/lib64/libopen-pal.so.0.0.0.#prelink#.24mfzT
IMB-MPI1 29430 root mem REG 8,1 64448 26581710 /usr/lib64/openmpi/mca_paffinity_linux.so
IMB-MPI1 29430 root mem REG 8,1 18544 26581642 /usr/lib64/openmpi/mca_carto_auto_detect.so
IMB-MPI1 29430 root mem REG 8,1 34144 26581666 /usr/lib64/openmpi/mca_ess_env.so
IMB-MPI1 29430 root mem REG 8,1 114184 26581738 /usr/lib64/openmpi/mca_rml_oob.so
IMB-MPI1 29430 root mem REG 8,1 221031 26581704 /usr/lib64/openmpi/mca_oob_tcp.so
IMB-MPI1 29430 root mem REG 8,1 62331 26581740 /usr/lib64/openmpi/mca_routed_binomial.so
IMB-MPI1 29430 root mem REG 8,1 43713 26581678 /usr/lib64/openmpi/mca_grpcomm_bad.so
IMB-MPI1 29430 root mem REG 8,1 53880 8486939 /lib64/libnss_files-2.5.so
IMB-MPI1 29430 root 0r CHR 1,3 2944 /dev/null
IMB-MPI1 29430 root 1u CHR 136,0 2 /dev/pts/0
IMB-MPI1 29430 root 2w FIFO 0,6 104967 pipe
IMB-MPI1 29430 root 3u unix 0xffff81013e802900 104973 socket
IMB-MPI1 29430 root 4u unix 0xffff81013e1fae40 104974 socket
IMB-MPI1 29430 root 5u IPv4 104976 TCP *:46222 (LISTEN)
IMB-MPI1 29430 root 6u IPv6 104977 TCP *:40269 (LISTEN)
IMB-MPI1 29430 root 7u IPv4 104982 TCP 169.254.95.120:46723->169.254.95.120:45676 (ESTABLISHED)
IMB-MPI1 29430 root 18w FIFO 0,6 104968 pipe
orte-ps output:
Information from mpirun [7600,0]
----------------------------------
JobID | State | Slots | Num Procs |
---------------------------------------
[7600,1] | Undef | 80 | 80 |
Process Name | ORTE Name | Local Rank | PID | Node | State |
--------------------------------------------------------------------------
/tmp/IMB-MPI1 | [[7600,1],0] | 0 | 7389 | n001 | Running |
/tmp/IMB-MPI1 | [[7600,1],1] | 0 | 30479 | n002 | Running |
/tmp/IMB-MPI1 | [[7600,1],2] | 0 | 28918 | n003 | Running |
/tmp/IMB-MPI1 | [[7600,1],3] | 0 | 29033 | n004 | Running |
/tmp/IMB-MPI1 | [[7600,1],4] | 0 | 30351 | n005 | Running |
/tmp/IMB-MPI1 | [[7600,1],5] | 0 | 28751 | n006 | Running |
/tmp/IMB-MPI1 | [[7600,1],6] | 0 | 28707 | n007 | Running |
/tmp/IMB-MPI1 | [[7600,1],7] | 0 | 29060 | n008 | Running |
/tmp/IMB-MPI1 | [[7600,1],8] | 0 | 28953 | n009 | Running |
/tmp/IMB-MPI1 | [[7600,1],9] | 0 | 32434 | n010 | Running |
/tmp/IMB-MPI1 | [[7600,1],10] | 0 | 28858 | n011 | Running |
/tmp/IMB-MPI1 | [[7600,1],11] | 0 | 29156 | n012 | Running |
/tmp/IMB-MPI1 | [[7600,1],12] | 0 | 29776 | n013 | Running |
/tmp/IMB-MPI1 | [[7600,1],13] | 0 | 29540 | n014 | Running |
/tmp/IMB-MPI1 | [[7600,1],14] | 0 | 29129 | n015 | Running |
/tmp/IMB-MPI1 | [[7600,1],15] | 0 | 29430 | n016 | Launched |
/tmp/IMB-MPI1 | [[7600,1],16] | 0 | 29431 | n017 | Running |
/tmp/IMB-MPI1 | [[7600,1],17] | 0 | 28771 | n018 | Running |
/tmp/IMB-MPI1 | [[7600,1],18] | 0 | 29986 | n019 | Running |
/tmp/IMB-MPI1 | [[7600,1],19] | 0 | 27431 | n020 | Running |
/tmp/IMB-MPI1 | [[7600,1],20] | 0 | 8288 | n021 | Running |
/tmp/IMB-MPI1 | [[7600,1],21] | 0 | 28700 | n022 | Running |
/tmp/IMB-MPI1 | [[7600,1],22] | 0 | 28197 | n023 | Running |
/tmp/IMB-MPI1 | [[7600,1],23] | 0 | 27226 | n024 | Running |
/tmp/IMB-MPI1 | [[7600,1],24] | 0 | 28706 | n025 | Running |
/tmp/IMB-MPI1 | [[7600,1],25] | 0 | 27374 | n026 | Running |
/tmp/IMB-MPI1 | [[7600,1],26] | 0 | 29004 | n027 | Running |
/tmp/IMB-MPI1 | [[7600,1],27] | 0 | 26775 | n028 | Running |
/tmp/IMB-MPI1 | [[7600,1],28] | 0 | 27204 | n029 | Running |
/tmp/IMB-MPI1 | [[7600,1],29] | 0 | 27077 | n030 | Running |
/tmp/IMB-MPI1 | [[7600,1],30] | 0 | 28924 | n031 | Running |
/tmp/IMB-MPI1 | [[7600,1],31] | 0 | 0 | n032 | Undef |
/tmp/IMB-MPI1 | [[7600,1],32] | 0 | 15522 | n033 | Running |
/tmp/IMB-MPI1 | [[7600,1],33] | 0 | 27431 | n034 | Running |
/tmp/IMB-MPI1 | [[7600,1],34] | 0 | 5341 | n035 | Running |
/tmp/IMB-MPI1 | [[7600,1],35] | 0 | 14809 | n036 | Running |
/tmp/IMB-MPI1 | [[7600,1],36] | 0 | 17284 | n037 | Running |
/tmp/IMB-MPI1 | [[7600,1],37] | 0 | 17543 | n038 | Running |
/tmp/IMB-MPI1 | [[7600,1],38] | 0 | 16534 | n039 | Running |
/tmp/IMB-MPI1 | [[7600,1],39] | 0 | 16443 | n040 | Running |
/tmp/IMB-MPI1 | [[7600,1],40] | 0 | 16630 | n041 | Running |
/tmp/IMB-MPI1 | [[7600,1],41] | 0 | 18190 | n042 | Running |
/tmp/IMB-MPI1 | [[7600,1],42] | 0 | 28173 | n043 | Running |
/tmp/IMB-MPI1 | [[7600,1],43] | 0 | 28539 | n044 | Running |
/tmp/IMB-MPI1 | [[7600,1],44] | 0 | 21718 | n045 | Running |
/tmp/IMB-MPI1 | [[7600,1],45] | 0 | 27610 | n046 | Running |
/tmp/IMB-MPI1 | [[7600,1],46] | 0 | 28542 | n047 | Running |
/tmp/IMB-MPI1 | [[7600,1],47] | 0 | 0 | n048 | Undef |
/tmp/IMB-MPI1 | [[7600,1],48] | 0 | 2000 | n049 | Running |
/tmp/IMB-MPI1 | [[7600,1],49] | 0 | 32344 | n050 | Running |
/tmp/IMB-MPI1 | [[7600,1],50] | 0 | 26061 | n051 | Running |
/tmp/IMB-MPI1 | [[7600,1],51] | 0 | 26635 | n052 | Running |
/tmp/IMB-MPI1 | [[7600,1],52] | 0 | 26296 | n053 | Running |
/tmp/IMB-MPI1 | [[7600,1],53] | 0 | 26731 | n054 | Running |
/tmp/IMB-MPI1 | [[7600,1],54] | 0 | 27317 | n055 | Running |
/tmp/IMB-MPI1 | [[7600,1],55] | 0 | 26238 | n056 | Running |
/tmp/IMB-MPI1 | [[7600,1],56] | 0 | 26652 | n057 | Running |
/tmp/IMB-MPI1 | [[7600,1],57] | 0 | 21492 | n058 | Running |
/tmp/IMB-MPI1 | [[7600,1],58] | 0 | 25562 | n059 | Running |
/tmp/IMB-MPI1 | [[7600,1],59] | 0 | 26412 | n060 | Running |
/tmp/IMB-MPI1 | [[7600,1],60] | 0 | 16763 | n061 | Running |
/tmp/IMB-MPI1 | [[7600,1],61] | 0 | 26736 | n062 | Running |
/tmp/IMB-MPI1 | [[7600,1],62] | 0 | 26868 | n063 | Running |
/tmp/IMB-MPI1 | [[7600,1],63] | 0 | 0 | n064 | Undef |
/tmp/IMB-MPI1 | [[7600,1],64] | 0 | 26735 | n065 | Running |
/tmp/IMB-MPI1 | [[7600,1],65] | 0 | 25913 | n066 | Running |
/tmp/IMB-MPI1 | [[7600,1],66] | 0 | 26135 | n067 | Running |
/tmp/IMB-MPI1 | [[7600,1],67] | 0 | 17790 | n068 | Running |
/tmp/IMB-MPI1 | [[7600,1],68] | 0 | 25963 | n069 | Running |
/tmp/IMB-MPI1 | [[7600,1],69] | 0 | 26713 | n070 | Running |
/tmp/IMB-MPI1 | [[7600,1],70] | 0 | 16266 | n071 | Running |
/tmp/IMB-MPI1 | [[7600,1],71] | 0 | 12630 | n072 | Running |
/tmp/IMB-MPI1 | [[7600,1],72] | 0 | 27463 | n073 | Running |
/tmp/IMB-MPI1 | [[7600,1],73] | 0 | 27534 | n074 | Running |
/tmp/IMB-MPI1 | [[7600,1],74] | 0 | 18902 | n075 | Running |
/tmp/IMB-MPI1 | [[7600,1],75] | 0 | 27471 | n076 | Running |
/tmp/IMB-MPI1 | [[7600,1],76] | 0 | 13115 | n077 | Running |
/tmp/IMB-MPI1 | [[7600,1],77] | 0 | 14448 | n078 | Running |
/tmp/IMB-MPI1 | [[7600,1],78] | 0 | 27148 | n079 | Running |
/tmp/IMB-MPI1 | [[7600,1],79] | 0 | 0 | n080 | Undef |
[n001:18160] [[18380,0],0]-[[30538,0],0] oob-tcp: Communication retries exceeded. Can not communicate with peer
[n001:18160] [[18380,0],0] ORTE_ERROR_LOG: Unreachable in file util/comm/comm.c at line 62
[n001:18160] [[18380,0],0] ORTE_ERROR_LOG: Unreachable in file orte-ps.c at line 799
[n001:18160] [[18380,0],0]-[[30538,0],0] oob-tcp: Communication retries exceeded. Can not communicate with peer
ompi_info output:
Package: Open MPI root@n041 Distribution
Open MPI: 1.4.2
Open MPI SVN revision: r23093
Open MPI release date: May 04, 2010
Open RTE: 1.4.2
Open RTE SVN revision: r23093
Open RTE release date: May 04, 2010
OPAL: 1.4.2
OPAL SVN revision: r23093
OPAL release date: May 04, 2010
Ident string: 1.4.2
Prefix: /usr
Configured architecture: x86_64-redhat-linux-gnu
Configure host: n041
Configured by: root
Configured on: Sat Oct 16 18:47:20 EDT 2010
Configure host: n041
Built by: root
Built on: Sat Oct 16 18:59:48 EDT 2010
Built host: n041
C bindings: yes
C++ bindings: yes
Fortran77 bindings: yes (all)
Fortran90 bindings: yes
Fortran90 bindings size: small
C compiler: gcc
C compiler absolute: /usr/bin/gcc
C++ compiler: g++
C++ compiler absolute: /usr/bin/g++
Fortran77 compiler: gfortran
Fortran77 compiler abs: /usr/bin/gfortran
Fortran90 compiler: gfortran
Fortran90 compiler abs: /usr/bin/gfortran
C profiling: yes
C++ profiling: yes
Fortran77 profiling: yes
Fortran90 profiling: yes
C++ exceptions: no
Thread support: posix (mpi: no, progress: no)
Sparse Groups: no
Internal debug support: no
MPI parameter check: runtime
Memory profiling support: no
Memory debugging support: no
libltdl support: yes
Heterogeneous support: no
mpirun default --prefix: no
MPI I/O support: yes
MPI_WTIME support: gettimeofday
Symbol visibility support: yes
FT Checkpoint support: no (checkpoint thread: no)
MCA backtrace: execinfo (MCA v2.0, API v2.0, Component v1.4.2)
MCA memory: ptmalloc2 (MCA v2.0, API v2.0, Component v1.4.2)
MCA paffinity: linux (MCA v2.0, API v2.0, Component v1.4.2)
MCA carto: auto_detect (MCA v2.0, API v2.0, Component v1.4.2)
MCA carto: file (MCA v2.0, API v2.0, Component v1.4.2)
MCA maffinity: first_use (MCA v2.0, API v2.0, Component v1.4.2)
MCA timer: linux (MCA v2.0, API v2.0, Component v1.4.2)
MCA installdirs: env (MCA v2.0, API v2.0, Component v1.4.2)
MCA installdirs: config (MCA v2.0, API v2.0, Component v1.4.2)
MCA dpm: orte (MCA v2.0, API v2.0, Component v1.4.2)
MCA pubsub: orte (MCA v2.0, API v2.0, Component v1.4.2)
MCA allocator: basic (MCA v2.0, API v2.0, Component v1.4.2)
MCA allocator: bucket (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: basic (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: hierarch (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: inter (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: self (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: sm (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: sync (MCA v2.0, API v2.0, Component v1.4.2)
MCA coll: tuned (MCA v2.0, API v2.0, Component v1.4.2)
MCA io: romio (MCA v2.0, API v2.0, Component v1.4.2)
MCA mpool: fake (MCA v2.0, API v2.0, Component v1.4.2)
MCA mpool: rdma (MCA v2.0, API v2.0, Component v1.4.2)
MCA mpool: sm (MCA v2.0, API v2.0, Component v1.4.2)
MCA pml: cm (MCA v2.0, API v2.0, Component v1.4.2)
MCA pml: csum (MCA v2.0, API v2.0, Component v1.4.2)
MCA pml: ob1 (MCA v2.0, API v2.0, Component v1.4.2)
MCA pml: v (MCA v2.0, API v2.0, Component v1.4.2)
MCA bml: r2 (MCA v2.0, API v2.0, Component v1.4.2)
MCA rcache: vma (MCA v2.0, API v2.0, Component v1.4.2)
MCA btl: self (MCA v2.0, API v2.0, Component v1.4.2)
MCA btl: sm (MCA v2.0, API v2.0, Component v1.4.2)
MCA btl: tcp (MCA v2.0, API v2.0, Component v1.4.2)
MCA topo: unity (MCA v2.0, API v2.0, Component v1.4.2)
MCA osc: pt2pt (MCA v2.0, API v2.0, Component v1.4.2)
MCA osc: rdma (MCA v2.0, API v2.0, Component v1.4.2)
MCA iof: hnp (MCA v2.0, API v2.0, Component v1.4.2)
MCA iof: orted (MCA v2.0, API v2.0, Component v1.4.2)
MCA iof: tool (MCA v2.0, API v2.0, Component v1.4.2)
MCA oob: tcp (MCA v2.0, API v2.0, Component v1.4.2)
MCA odls: default (MCA v2.0, API v2.0, Component v1.4.2)
MCA ras: slurm (MCA v2.0, API v2.0, Component v1.4.2)
MCA rmaps: load_balance (MCA v2.0, API v2.0, Component v1.4.2)
MCA rmaps: rank_file (MCA v2.0, API v2.0, Component v1.4.2)
MCA rmaps: round_robin (MCA v2.0, API v2.0, Component v1.4.2)
MCA rmaps: seq (MCA v2.0, API v2.0, Component v1.4.2)
MCA rml: oob (MCA v2.0, API v2.0, Component v1.4.2)
MCA routed: binomial (MCA v2.0, API v2.0, Component v1.4.2)
MCA routed: direct (MCA v2.0, API v2.0, Component v1.4.2)
MCA routed: linear (MCA v2.0, API v2.0, Component v1.4.2)
MCA plm: rsh (MCA v2.0, API v2.0, Component v1.4.2)
MCA plm: slurm (MCA v2.0, API v2.0, Component v1.4.2)
MCA filem: rsh (MCA v2.0, API v2.0, Component v1.4.2)
MCA errmgr: default (MCA v2.0, API v2.0, Component v1.4.2)
MCA ess: env (MCA v2.0, API v2.0, Component v1.4.2)
MCA ess: hnp (MCA v2.0, API v2.0, Component v1.4.2)
MCA ess: singleton (MCA v2.0, API v2.0, Component v1.4.2)
MCA ess: slurm (MCA v2.0, API v2.0, Component v1.4.2)
MCA ess: tool (MCA v2.0, API v2.0, Component v1.4.2)
MCA grpcomm: bad (MCA v2.0, API v2.0, Component v1.4.2)
MCA grpcomm: basic (MCA v2.0, API v2.0, Component v1.4.2)
Packet errors recorded on the interfaces:
[root@mn2 x86_64]# psh compute "ifconfig eth1 | grep dropped | grep -v dropped:0" 2> /dev/null #dropped packets
n036: RX packets:208526209 errors:0 dropped:276 overruns:0 frame:0
n041: RX packets:208124469 errors:0 dropped:490 overruns:0 frame:0
n049: RX packets:205210459 errors:0 dropped:3 overruns:0 frame:0
[root@mn2 x86_64]# psh compute "ifconfig eth1 | grep errors | grep -v errors:0" 2> /dev/null #errors
n015: RX packets:187001146 errors:2 dropped:0 overruns:1 frame:1
[root@mn2 x86_64]# psh compute "ifconfig eth1 | grep overruns | grep -v overruns:0" 2> /dev/null #overruns
n005: RX packets:187277767 errors:0 dropped:0 overruns:1 frame:0
n006: RX packets:194125904 errors:0 dropped:0 overruns:1 frame:0
n007: RX packets:186417641 errors:0 dropped:0 overruns:1 frame:0
n008: RX packets:194454153 errors:0 dropped:0 overruns:1 frame:0
n010: RX packets:195436029 errors:0 dropped:0 overruns:1 frame:0
n011: RX packets:188181596 errors:0 dropped:0 overruns:1 frame:0
n012: RX packets:195258014 errors:0 dropped:0 overruns:1 frame:0
n013: RX packets:220247195 errors:0 dropped:0 overruns:1 frame:0
n014: RX packets:194602129 errors:0 dropped:0 overruns:1 frame:0
n015: RX packets:187001146 errors:2 dropped:0 overruns:1 frame:1
n017: RX packets:186450932 errors:0 dropped:0 overruns:1 frame:0
n019: RX packets:188611197 errors:0 dropped:0 overruns:1 frame:0
n020: RX packets:203277086 errors:0 dropped:0 overruns:1 frame:0
[root@mn2 x86_64]# psh compute "ifconfig eth1 | grep frame | grep -v frame:0" 2> /dev/null #frame errors
n015: RX packets:187001146 errors:2 dropped:0 overruns:1 frame:1
[root@mn2 x86_64]# psh compute "ifconfig eth0 | grep dropped | grep -v dropped:0" 2> /dev/null #eth0 drops
n041: RX packets:363602943 errors:0 dropped:4 overruns:0 frame:0
n049: RX packets:355928515 errors:0 dropped:1946 overruns:0 frame:0
n065: RX packets:363838354 errors:0 dropped:149 overruns:0 frame:0
gdb output:
[root@n001 ~]# gdb program 7308 #node with completed launch, gdb on mpirun
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-23.el5)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
program: No such file or directory.
Attaching to process 7308
Reading symbols from /usr/bin/orterun...(no debugging symbols found)...done.
warning: .dynamic section for "/usr/lib64/libopen-rte.so.0" is not at the expected address
warning: difference appears to be caused by prelink, adjusting expectations
warning: .dynamic section for "/usr/lib64/libopen-pal.so.0" is not at the expected address
warning: difference appears to be caused by prelink, adjusting expectations
Reading symbols from /usr/lib64/libopen-rte.so.0...done.
Loaded symbols for /usr/lib64/libopen-rte.so.0
Reading symbols from /usr/lib64/libopen-pal.so.0...done.
Loaded symbols for /usr/lib64/libopen-pal.so.0
Reading symbols from /lib64/libdl.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /lib64/libnsl.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnsl.so.1
Reading symbols from /lib64/libutil.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libutil.so.1
Reading symbols from /lib64/libm.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libm.so.6
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
[Thread debugging using libthread_db enabled]
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /usr/lib64/openmpi/mca_paffinity_linux.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_paffinity_linux.so
Reading symbols from /usr/lib64/openmpi/mca_carto_auto_detect.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_carto_auto_detect.so
Reading symbols from /usr/lib64/openmpi/mca_ess_hnp.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_ess_hnp.so
Reading symbols from /usr/lib64/openmpi/mca_plm_rsh.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_plm_rsh.so
Reading symbols from /usr/lib64/openmpi/mca_rml_oob.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_rml_oob.so
Reading symbols from /usr/lib64/openmpi/mca_oob_tcp.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_oob_tcp.so
Reading symbols from /usr/lib64/openmpi/mca_routed_binomial.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_routed_binomial.so
Reading symbols from /usr/lib64/openmpi/mca_grpcomm_bad.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_grpcomm_bad.so
Reading symbols from /usr/lib64/openmpi/mca_rmaps_round_robin.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_rmaps_round_robin.so
Reading symbols from /usr/lib64/openmpi/mca_errmgr_default.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_errmgr_default.so
Reading symbols from /usr/lib64/openmpi/mca_odls_default.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_odls_default.so
Reading symbols from /lib64/libnss_files.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnss_files.so.2
Reading symbols from /usr/lib64/openmpi/mca_iof_hnp.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_iof_hnp.so
Reading symbols from /usr/lib64/openmpi/mca_filem_rsh.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_filem_rsh.so
Reading symbols from /lib64/libnss_dns.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnss_dns.so.2
Reading symbols from /lib64/libresolv.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libresolv.so.2
0x00000037fb6caeff in poll () from /lib64/libc.so.6
(gdb) bt
#0 0x00000037fb6caeff in poll () from /lib64/libc.so.6
#1 0x00002ab5d8a97dd6 in poll_dispatch (base=0x1f6654b0, arg=0x1f66a8a0, tv=0xffffffffffffffff) at poll.c:168
#2 0x00002ab5d8a96d87 in opal_event_base_loop (base=0x1f6654b0, flags=1) at event.c:807
#3 0x00002ab5d8a8bebe in opal_progress () at runtime/opal_progress.c:189
#4 0x00002ab5d8862e05 in orte_plm_base_report_launched (job=498073601) at base/plm_base_launch_support.c:754
#5 orte_plm_base_launch_apps (job=498073601) at base/plm_base_launch_support.c:204
#6 0x00002ab5d94eb924 in orte_plm_rsh_launch (jdata=<value optimized out>) at plm_rsh_module.c:1234
#7 0x000000000040333a in orterun ()
#8 0x00000000004029d3 in main ()
[root@n016 ~]# gdb #Node with stalled launch?
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-23.el5)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
(gdb) exit
Undefined command: "exit". Try "help".
(gdb) exit program 29429
Undefined command: "exit". Try "help".
(gdb) quit
[root@n016 ~]# gdb program 29472
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-23.el5)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
program: No such file or directory.
Attaching to process 29472
ptrace: No such process.
/root/29472: No such file or directory.
(gdb) quit
[root@n016 ~]# gdb program 29429
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-23.el5)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
program: No such file or directory.
Attaching to process 29429
Reading symbols from /usr/bin/orted...(no debugging symbols found)...done.
warning: .dynamic section for "/usr/lib64/libopen-rte.so.0" is not at the expected address
warning: difference appears to be caused by prelink, adjusting expectations
warning: .dynamic section for "/usr/lib64/libopen-pal.so.0" is not at the expected address
warning: difference appears to be caused by prelink, adjusting expectations
Reading symbols from /usr/lib64/libopen-rte.so.0...done.
Loaded symbols for /usr/lib64/libopen-rte.so.0
Reading symbols from /usr/lib64/libopen-pal.so.0...done.
Loaded symbols for /usr/lib64/libopen-pal.so.0
Reading symbols from /lib64/libdl.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /lib64/libnsl.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnsl.so.1
Reading symbols from /lib64/libutil.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libutil.so.1
Reading symbols from /lib64/libm.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libm.so.6
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
[Thread debugging using libthread_db enabled]
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /usr/lib64/openmpi/mca_paffinity_linux.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_paffinity_linux.so
Reading symbols from /usr/lib64/openmpi/mca_carto_auto_detect.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_carto_auto_detect.so
Reading symbols from /usr/lib64/openmpi/mca_ess_env.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_ess_env.so
Reading symbols from /usr/lib64/openmpi/mca_rml_oob.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_rml_oob.so
Reading symbols from /usr/lib64/openmpi/mca_oob_tcp.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_oob_tcp.so
Reading symbols from /usr/lib64/openmpi/mca_routed_binomial.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_routed_binomial.so
Reading symbols from /usr/lib64/openmpi/mca_grpcomm_bad.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_grpcomm_bad.so
Reading symbols from /usr/lib64/openmpi/mca_odls_default.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_odls_default.so
Reading symbols from /lib64/libnss_files.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnss_files.so.2
Reading symbols from /usr/lib64/openmpi/mca_iof_orted.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_iof_orted.so
Reading symbols from /usr/lib64/openmpi/mca_filem_rsh.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_filem_rsh.so
0x0000003c79acaeff in poll () from /lib64/libc.so.6
(gdb) bt
#0 0x0000003c79acaeff in poll () from /lib64/libc.so.6
#1 0x00002b2254814dd6 in poll_dispatch (base=0x1ce73410, arg=0x1ce8b350, tv=0xffffffffffffffff) at poll.c:168
#2 0x00002b2254813d87 in opal_event_base_loop (base=0x1ce73410, flags=1) at event.c:807
#3 0x00002b2254808ebe in opal_progress () at runtime/opal_progress.c:189
#4 0x00002b2255063285 in opal_condition_wait (peer=0x7fffca151d60, iov=<value optimized out>, count=<value optimized out>, tag=1, flags=0) at ../../../../opal/threads/condition.h:99
#5 orte_rml_oob_send (peer=0x7fffca151d60, iov=<value optimized out>, count=<value optimized out>, tag=1, flags=0) at rml_oob_send.c:153
#6 0x00002b2255063402 in orte_rml_oob_send_buffer (peer=0x7fffca151d60, buffer=0x1ceb6530, tag=1, flags=0) at rml_oob_send.c:269
#7 0x00002b22545d12f3 in send_relay (fd=<value optimized out>, opal_event=<value optimized out>, data=<value optimized out>) at orted/orted_comm.c:129
#8 orte_daemon_cmd_processor (fd=<value optimized out>, opal_event=<value optimized out>, data=<value optimized out>) at orted/orted_comm.c:291
#9 0x00002b2254813e5c in event_process_active (base=0x1ce73410, flags=0) at event.c:651
#10 opal_event_base_loop (base=0x1ce73410, flags=0) at event.c:823
#11 0x00002b22545ced6a in orte_daemon (argc=<value optimized out>, argv=<value optimized out>) at orted/orted_main.c:635
#12 0x0000000000400889 in main ()
(gdb) c
Continuing.
[root@n032 ~]# gdb program 21842 #node without a completed launch of IMB-MPI1
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-23.el5)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
program: No such file or directory.
Attaching to process 21842
Reading symbols from /usr/bin/orted...(no debugging symbols found)...done.
warning: .dynamic section for "/usr/lib64/libopen-rte.so.0" is not at the expected address
warning: difference appears to be caused by prelink, adjusting expectations
warning: .dynamic section for "/usr/lib64/libopen-pal.so.0" is not at the expected address
warning: difference appears to be caused by prelink, adjusting expectations
Reading symbols from /usr/lib64/libopen-rte.so.0...done.
Loaded symbols for /usr/lib64/libopen-rte.so.0
Reading symbols from /usr/lib64/libopen-pal.so.0...done.
Loaded symbols for /usr/lib64/libopen-pal.so.0
Reading symbols from /lib64/libdl.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /lib64/libnsl.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnsl.so.1
Reading symbols from /lib64/libutil.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libutil.so.1
Reading symbols from /lib64/libm.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libm.so.6
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
[Thread debugging using libthread_db enabled]
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /usr/lib64/openmpi/mca_paffinity_linux.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_paffinity_linux.so
Reading symbols from /usr/lib64/openmpi/mca_carto_auto_detect.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_carto_auto_detect.so
Reading symbols from /usr/lib64/openmpi/mca_ess_env.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_ess_env.so
Reading symbols from /usr/lib64/openmpi/mca_rml_oob.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_rml_oob.so
Reading symbols from /usr/lib64/openmpi/mca_oob_tcp.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_oob_tcp.so
Reading symbols from /usr/lib64/openmpi/mca_routed_binomial.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_routed_binomial.so
Reading symbols from /usr/lib64/openmpi/mca_grpcomm_bad.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_grpcomm_bad.so
Reading symbols from /usr/lib64/openmpi/mca_odls_default.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_odls_default.so
Reading symbols from /lib64/libnss_files.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnss_files.so.2
Reading symbols from /usr/lib64/openmpi/mca_iof_orted.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_iof_orted.so
Reading symbols from /usr/lib64/openmpi/mca_filem_rsh.so...done.
Loaded symbols for /usr/lib64/openmpi/mca_filem_rsh.so
0x00000034670caeff in poll () from /lib64/libc.so.6
(gdb) bt
#0 0x00000034670caeff in poll () from /lib64/libc.so.6
#1 0x00002abb07442dd6 in poll_dispatch (base=0x32cb480, arg=0x32cb330, tv=0xffffffffffffffff) at poll.c:168
#2 0x00002abb07441d87 in opal_event_base_loop (base=0x32cb480, flags=0) at event.c:807
#3 0x00002abb071fcd6a in orte_daemon (argc=<value optimized out>, argv=<value optimized out>) at orted/orted_main.c:635
#4 0x0000000000400889 in main ()
(gdb) c
Continuing.
Bryan Reese
e1350/xCAT Linux cluster test
e1350/xCAT Linux cluster test