Hi,

It looks like the connection to the nodes is set to `ssh`. Does the output of:

$ qconf -sconf
#global:
qlogin_command
qlogin_daemon
rlogin_command
rlogin_daemon
rsh_command
rsh_daemon

reflect this? Do you need SGE to access the nodes via `ssh` for X11 forwarding?

-- Reuti


> Am 26.02.2018 um 15:29 schrieb Nils Giordano <[email protected]>:
> 
> Dear all,
> 
> I am trying to run a simple Makefile with qmake (SGE 8.1.9), but it fails
> every time after the first round of commands with the following error:
> ------------------------------------------
> $ qmake -cwd -v PATH -pe make 1 -verbose --
> [...]
> reading exit code from shepherd ... timeout (60 s) expired while waiting
> on socket fd 4
> error: error reading returncode of remote command
> cleaning up after abnormal exit of /usr/bin/ssh -o LogLevel=ERROR
> ------------------------------------------
> ------------------------------------------
> $ cat Makefile
> all: a.y b.y c.y d.y e.y f.y
> 
> %.y: %.x
>     touch $@; sleep 3
> ------------------------------------------
> 
> Overall, only a.y is created. If I use N slots (with -pe make 1-N), only
> N files are created. It seems to me that qmake gets stuck because it
> fails to close opened connections. Note that I have the same problem
> when I do not use the -pe option, or when I try to run qmake -inherit in
> a qsub script. Apart from that, qsub and qlogin work fine.
> 
> I think I have narrowed the problem down to qrsh, as I get a
> similar error with this command:
> ------------------------------------------
> $ qrsh -cwd -v PATH -verbose hostname
> Your job 121477 ("hostname") has been submitted
> waiting for interactive job to be scheduled ...
> Your interactive job 121477 has been successfully scheduled.
> Establishing /usr/bin/ssh -o LogLevel=ERROR session to host XXX.prive ...
> XXX.prive
> /usr/bin/ssh -o LogLevel=ERROR exited with exit code 0
> reading exit code from shepherd ... timeout (60 s) expired while waiting
> on socket fd 4
> error: error reading returncode of remote command
> cleaning up after abnormal exit of /usr/bin/ssh -o LogLevel=ERROR
> ------------------------------------------
> 
> Any idea what might cause this problem? You can find the complete
> output below.
> 
> Sincerely,
> −Nils
> 
> ------------------------------------------
> Complete output:
> $ qstat -help
> SGE 8.1.9
> $ qmake -cwd -verbose -v PATH -pe make 1 --
> dynamic task allocation mode
> sge_argv[0] = qmake
> sge_argv[1] = -cwd
> sge_argv[2] = -verbose
> sge_argv[3] = -v
> sge_argv[4] = PATH
> sge_argv[5] = -pe
> sge_argv[6] = make
> sge_argv[7] = 1
> gmake_argv[0]  = qmake
> determine qmake startmode
> setting default options: -l arch=lx-amd64
> creating scheduled qmake
> argv[  0] = qrsh
> argv[  1] = -noshell
> argv[  2] = -cwd
> argv[  3] = -verbose
> argv[  4] = -v
> argv[  5] = PATH
> argv[  6] = -pe
> argv[  7] = make
> argv[  8] = 1
> argv[  9] = -l
> argv[ 10] = arch=lx-amd64
> argv[ 11] = qmake
> argv[ 12] = -inherit
> argv[ 13] = -verbose
> argv[ 14] = -cwd
> argv[ 15] = -v
> argv[ 16] = PATH
> argv[ 17] = -l
> argv[ 18] = arch=lx-amd64
> argv[ 19] = --
> Your job 121548 ("qmake") has been submitted
> waiting for interactive job to be scheduled ...
> Your interactive job 121548 has been successfully scheduled.
> Establishing /usr/bin/ssh -o LogLevel=ERROR session to host
> gknzwd2.XXX.prive ...
> sge_argv[0] = qmake
> sge_argv[1] = -inherit
> sge_argv[2] = -verbose
> sge_argv[3] = -cwd
> sge_argv[4] = -v
> sge_argv[5] = PATH
> sge_argv[6] = -l
> sge_argv[7] = arch=lx-amd64
> gmake_argv[0]  = qmake
> determine qmake startmode
> inserting -j option from NSLOTS environment: -j 1
> sge hostfile =
> /opt/sge/BiRD_v2/spool/gknzwd2/active_jobs/121548.1/pe_hostfile
> qmake  hostfile = /tmp/121548.1.max-24h.q/qmake_hostfile
> qmake  lockfile = /tmp/121548.1.max-24h.q/qmake_lockfile
> creating qmake hostfile
> number of slots for qmake execution is 1
> enabling next task to be executed as Grid Engine parallel task
> touch a.y; sleep 3
> export the following environment variables:
> SGE_RSH_COMMAND,BASH_FUNC_module(),MAKEFLAGS,MFLAGS,MAKELEVEL
> obtained lock to qmake lockfile
> clearing lock to hostfile
> next host for qmake job is gknzwd2.XXX.prive
> gknzwd2.XXX.prive
> gmake requesting status of dead child processes
> gmake requesting status of dead child processes
> waiting for child failed: timeout
> starting job:
> args[  0] = qrsh
> args[  1] = -noshell
> args[  2] = -verbose
> args[  3] = -inherit
> args[  4] = -cwd
> args[  5] = -v
> args[  6] = SGE_RSH_COMMAND,BASH_FUNC_module(),MAKEFLAGS,MFLAGS,MAKELEVEL
> args[  7] = -v
> args[  8] = PATH
> args[  9] = gknzwd2.XXX.prive
> args[ 10] = /bin/sh
> args[ 11] = -c
> args[ 12] = touch a.y; sleep 3
> Starting server daemon at host "gknzwd2.XXX.prive"
> Server daemon successfully started with task id "1.gknzwd2"
> Establishing /usr/bin/ssh -o LogLevel=ERROR session to host
> gknzwd2.XXX.prive ...
> /usr/bin/ssh -o LogLevel=ERROR exited with exit code 0
> reading exit code from shepherd ... timeout (60 s) expired while waiting
> on socket fd 4
> error: error reading returncode of remote command
> obtained lock to qmake lockfile
> unlock_hostentry 0
> clearing lock to hostfile
> qmake: *** [a.y] Error 255
> cleanup of remote mechanism
> /usr/bin/ssh -o LogLevel=ERROR exited with exit code 0
> reading exit code from shepherd ... timeout (60 s) expired while waiting
> on socket fd 4
> error: error reading returncode of remote command
> cleaning up after abnormal exit of /usr/bin/ssh -o LogLevel=ERROR
> 
> 
> 
> _______________________________________________
> users mailing list
> [email protected]
> https://gridengine.org/mailman/listinfo/users
> 


_______________________________________________
users mailing list
[email protected]
https://gridengine.org/mailman/listinfo/users

Reply via email to