1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
#!/usr/bin/perl -w
use strict;
# Usage: waitForSGEQJobs.pl <verbose [1 or 0]> <delay in seconds in range 10-3600> [job IDs]
#
#
# Takes as args a string of qsub job IDs and periodically monitors them. Once they all finish, it returns 0
#
# If any of the jobs go into error state, an error is printed to stderr and the program waits for the non-error
# jobs to finish, then returns 1
#
# Usual qstat format - check this at run time
# job-ID prior name user state submit/start at queue slots ja-task-ID
# First thing to do is parse our input
my ( $verbose, $delay, @jobIDs ) = @ARGV;

# Reduce the delay to a whole number of seconds. A missing or non-numeric
# argument becomes 0 so that it is clamped to the minimum below instead of
# producing "uninitialized" / "isn't numeric" warnings under -w, and so that
# nothing other than digits can ever reach the sleep call.
$delay = ( defined $delay && $delay =~ /^\s*(\d+)/ ) ? $1 + 0 : 0;

# Check for user stupidity
if ( $delay < 10 ) {
    print STDERR "Sleep period is too short, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ( $delay > 3600 ) {
    print STDERR "Sleep period is too long, will poll queue once every 60 minutes\n";
    $delay = 3600;
}

print " Waiting for " . scalar(@jobIDs) . " jobs: @jobIDs\n";

# Nothing to wait for: leave before shelling out to whoami / qstat.
exit 0 if !@jobIDs;

# whoami output ends in a newline, which must not end up inside the qstat
# command line.
my $user = `whoami`;
chomp $user;

my $qstatOutput = run_qstat($user);
if ( !$qstatOutput ) {
    # Nothing to do - qstat lists no jobs for this user
    exit 0;
}
my @qstatLines = split( "\n", $qstatOutput );

# Position in qstat output of tokens we want.
# These are hardcoded to the values that work at UVa rather than being
# discovered from the qstat header line.
my $jobID_Pos = 0;
my $statePos  = 9;

# One precompiled pattern per requested job ID, built once up front.
my %patternFor = map { $_ => job_pattern($_) } @jobIDs;

# Number of requested jobs still showing up in qstat; primed to enter the loop
my $jobsIncomplete = 1;

# Set to 1 for any job in an error state. There is no error state in PBS, so
# nothing currently sets this and the script always exits 0 once the jobs
# have left the queue.
my $haveErrors = 0;

while ($jobsIncomplete) {
    $jobsIncomplete = 0;

    # Tokenise every qstat line once per poll instead of once per job.
    # Lines are trimmed first so that the tokens are in the correct sequence.
    my @rows;
    foreach my $line (@qstatLines) {
        my @tokens = split( /\s+/, trim($line) );
        push @rows, \@tokens if @tokens;
    }

    JOB: foreach my $job (@jobIDs) {
        foreach my $row (@rows) {
            next if $row->[$jobID_Pos] !~ $patternFor{$job};

            $jobsIncomplete++;
            if ($verbose) {
                # Short lines (e.g. headers) may not have a state column
                my $state = defined $row->[$statePos] ? $row->[$statePos] : 'unknown';
                print " Job $job is in state $state\n";
            }

            # First matching line settles this job; move on to the next one
            next JOB;
        }
    }

    if ($jobsIncomplete) {
        if ($verbose) {
            my $timestamp = `date`;
            chomp $timestamp;
            print " ($timestamp) Still waiting for $jobsIncomplete jobs\n\n";
        }

        # The builtin sleep keeps ctrl+c working (unlike system("sleep ..."),
        # where SIGINT is ignored by the parent) and never passes $delay
        # through a shell.
        sleep $delay;

        @qstatLines = split( "\n", run_qstat($user) );
    }
}

if ($haveErrors) {
    print " No more jobs to run - some jobs had errors\n\n";
    exit 1;
}
else {
    print " No more jobs in queue\n\n";
    exit 0;
}

# Build the pattern that recognises one requested job ID in qstat's job-ID
# column.
#
# Only the first 15 characters of the ID are compared, because qstat may
# truncate the column. The ID is quoted so that characters such as "." or
# "[]" (array jobs) are matched literally, and anchored at the start of the
# token so that "234" cannot match "1234.server". When the ID was not
# truncated and ends in a digit, the next character must not be a digit
# either, so "123" cannot match "1234.server".
sub job_pattern {
    my ($job) = @_;
    my $job_short = substr( $job, 0, 15 );
    my $boundary  = ( length($job) <= 15 && $job_short =~ /\d$/ ) ? '(?!\d)' : '';
    return qr/^\Q$job_short\E$boundary/;
}

# Run "qstat -u <user>" and return its raw output ('' if there was none).
# A non-zero exit status is reported on STDERR, because empty output from a
# failed qstat would otherwise look exactly like "all jobs have finished".
sub run_qstat {
    my ($user) = @_;
    my $output = `qstat -u $user`;
    if ( $? != 0 ) {
        print STDERR "Warning: qstat did not exit cleanly (status $?); job status may be incomplete\n";
    }
    return defined $output ? $output : '';
}
# Return a copy of the given string with leading and trailing whitespace
# removed. The caller's variable is not modified.
sub trim {
    my $text = shift;

    # Alias $_ to the local copy so both substitutions act on it in place
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
|