> Begin forwarded message: > > From: "Gary E. Gorbet" <gegor...@gmail.com> > Subject: cluster status function in Airavata > Date: October 7, 2015 at 5:30:22 PM CDT > To: SciGaP Dev <d...@scigap.org> > > The two text files attached illustrate a Gateway function that is currently > performed by direct ssh commands. I would like to see that functionality > moved to Airavata for two main reasons: > > (1) Any refinements for changes on clusters or for new clusters would be > centralized; and > > (2) Gateway servers on multiple hosts would reference common APIs. > > The first attached file - cluster_status.php-local_status.txt - is the code > portion from a local_status() function within the UltraScan gateway script > cluster_status.php. This code portion shows the ssh commands issued for each > of a list of clusters. The parsed command output reveals the count of running > jobs and queued jobs for each cluster. That information is used on the > gateway submit page to hint at likely wait-in-queue-status time. My proposal > is that a Thrift client API would return this information, using code on the > Thrift server similar to that in the sample PHP script. > > The second attached file shows a bash shell script executed remotely on the > Jureca cluster at the PRACE Juelich center in Germany. This special script > was found to be necessary because none of the normal queue status commands ( > sinfo, qstat, ... ) were found to return reliable information. The script > shown basically just issues squeue commands and counts the lines returned as > a way of counting PENDING and RUNNING jobs in the “batch” queue on Jureca. > > The attached image file shows how the job count information is used at the > submit stage of the UltraScan gateway. > > - Gary > > > > > >
... // Get local cluster status
// Return field $idx of $parts when it is a string of digits, otherwise "0".
// Guards against short or non-numeric command output (for example a remote
// error message captured through "2>&1") being reported as a job count.
function local_status_count( $parts, $idx )
{
   if ( isset( $parts[ $idx ] ) && preg_match( '/^\d+$/', $parts[ $idx ] ) )
      return $parts[ $idx ];

   return '0';
}

// Run $command on $host through ssh and return its output split on runs of
// whitespace.  Empty fields are deliberately kept (no PREG_SPLIT_NO_EMPTY)
// so that field positions match the indexes used by local_status().
// A missing result from shell_exec() is treated as an empty string.
function local_status_query( $host, $command )
{
   $output = shell_exec( "ssh $host '$command'" );

   return preg_split( '/\s+/', (string)$output );
}

// Query every known cluster over ssh and append one entry per cluster to the
// global $data array.  Each entry is an array with the keys
//   'cluster' - cluster name
//   'queued'  - queued count reported by the cluster
//   'running' - running count reported by the cluster
//   'status'  - "up", "down", or (comet only) the availability string
//               printed by sinfo
// A second, otherwise identical entry named "<cluster>-local" is appended
// for alamo and jacinto.
//
// A cluster whose command output is missing, too short, or not in the
// expected form is reported as "down" with both counts "0".
//
// NOTE(review): the user part of every host string below looks truncated
// ("u...", "sw...") in this copy of the script - restore before use.
function local_status()
{
   global $self;   // not referenced below; binding kept unchanged
   global $data;

   // Per-cluster settings, in reporting order.
   //   kind 'qstat'  : last line of "qstat -B"; 'sidx' is the index of the
   //                   server-state field, which differs between hosts
   //   kind 'showq'  : last line of "showq" (total / running / queued)
   //   kind 'sinfo'  : "<availability> <n>/<n>/..." from sinfo
   //   kind 'script' : "<status> <running> <queued> ..." from a helper script
   $clusters = array(
      'alamo'    => array( 'host' => "u...@alamo.uthscsa.edu",
                           'cmd'  => '/usr/bin/qstat -B 2>&1|tail -1',
                           'kind' => 'qstat', 'sidx' => 10 ),
      'lonestar' => array( 'host' => "u...@lonestar.tacc.utexas.edu",
                           'cmd'  => 'showq 2>&1|tail -1',
                           'kind' => 'showq' ),
      'stampede' => array( 'host' => "u...@stampede.tacc.utexas.edu",
                           'cmd'  => '/usr/local/bin/showq 2>&1|tail -1',
                           'kind' => 'showq' ),
      'comet'    => array( 'host' => "u...@comet.sdsc.edu",
                           'cmd'  => '/usr/bin/sinfo -s -p compute -o "%a %F" |tail -1',
                           'kind' => 'sinfo' ),
      'gordon'   => array( 'host' => "u...@gordon.sdsc.edu",
                           'cmd'  => '/opt/torque/bin/qstat -B 2>&1|tail -1',
                           'kind' => 'qstat', 'sidx' => 10 ),
      'jureca'   => array( 'host' => "sw...@jureca.fz-juelich.de",
                           'cmd'  => '~swus1/scripts/qstat-jureca 2>&1',
                           'kind' => 'script' ),
      'jacinto'  => array( 'host' => "u...@jacinto.uthscsa.edu",
                           'cmd'  => '/opt/torque/bin/qstat -B 2>&1|tail -1',
                           'kind' => 'qstat', 'sidx' => 9 ),
   );

   foreach ( $clusters as $clname => $cl )
   {
      $parts = local_status_query( $cl[ 'host' ], $cl[ 'cmd' ] );

      // Start every cluster from a known state so that nothing leaks over
      // from the previous loop iteration.
      $que = '0';
      $run = '0';
      $sta = 'down';

      switch ( $cl[ 'kind' ] )
      {
         case 'qstat':
            $que  = local_status_count( $parts, 3 );
            $run  = local_status_count( $parts, 4 );
            $sidx = $cl[ 'sidx' ];

            if ( isset( $parts[ $sidx ] ) && $parts[ $sidx ] == "Active" )
               $sta = "up";
            break;

         case 'showq':
            // Field 2 is the total; the cluster counts as up only when it
            // is a positive number.
            $tot = local_status_count( $parts, 2 );

            if ( (int)$tot > 0 )
            {
               $sta = "up";
               $run = local_status_count( $parts, 5 );
               $que = local_status_count( $parts, 8 );
            }
            break;

         case 'sinfo':
            // The availability string is passed through as the status;
            // only an empty one is mapped to "down".
            // NOTE(review): per the sinfo documentation %F prints node
            // counts as allocated/idle/other/total, so 'running' and
            // 'queued' look like node counts here, not job counts - confirm.
            if ( isset( $parts[ 0 ] ) && $parts[ 0 ] != "" )
               $sta = $parts[ 0 ];

            $counts = isset( $parts[ 1 ] ) ? explode( '/', $parts[ 1 ] ) : array();
            $run    = local_status_count( $counts, 0 );
            $que    = local_status_count( $counts, 1 );
            break;

         case 'script':
            // Anything other than a leading "up" (empty output, or error
            // text captured by "2>&1") is reported as down.
            if ( isset( $parts[ 0 ] ) && $parts[ 0 ] == "up" )
               $sta = "up";

            $run = local_status_count( $parts, 1 );
            $que = local_status_count( $parts, 2 );
            break;
      }

      // A cluster that is down never reports job counts.
      if ( $sta == "down" )
      {
         $que = "0";
         $run = "0";
      }

      $a = array();
      $a[ 'cluster' ] = $clname;
      $a[ 'queued'  ] = $que;
      $a[ 'running' ] = $run;
      $a[ 'status'  ] = $sta;
      $data[]         = $a;

      if ( $clname == 'alamo' || $clname == 'jacinto' )
      {
         $a[ 'cluster' ] = $clname . "-local";
         $data[]         = $a;
      }
   }
}
// ...
#!/bin/bash
# qstat-jureca - count queued/running jobs on Jureca
#
# Prints a single line "<status> <running> <queued> <total>" for the
# "batch" partition.  <status> is "up" when at least one job is running
# or pending, and "down" otherwise.

# count_jobs STATE
#   Echo the number of "batch" jobs currently in STATE.
#   The job count is the number of lines squeue prints minus one (the
#   header line).  The result is clamped at zero, so a failed or empty
#   squeue call yields 0 instead of -1.
count_jobs() {
    local lines
    lines=$(squeue -t "$1" -p batch | wc -l)

    if (( lines > 0 )); then
        echo $(( lines - 1 ))
    else
        echo 0
    fi
}

JRUN=$(count_jobs RUNNING)
JQUE=$(count_jobs PENDING)
JOBS=$(( JQUE + JRUN ))

QSTA="up"
if [ "$JOBS" -lt 1 ]; then
    QSTA="down"
fi

echo "$QSTA $JRUN $JQUE $JOBS"
> >
>