1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
|
#!/usr/bin/perl -w
use strict;
# Usage: waitForSlurmJobs.pl <verbose [1 or 0]> <delay in seconds in range 10-3600> [job IDs]
#
#
# Takes a list of sbatch job IDs as arguments and polls them periodically. Once they have all finished, it exits with status 0
#
# If any of the jobs go into error state, an error is printed to stderr and the program waits for the non-error
# jobs to finish, then returns 1
# Command-line arguments: verbosity flag, polling delay in seconds, and the
# IDs of the jobs to wait for.
my ( $verbose, $delay, @jobIDs ) = @ARGV;

# squeue job states that mean a job finished successfully.
my %COMPLETION_STATES = map {
    $_ => 1
} qw( COMPLETED );

# squeue job states that mean a job ended without completing successfully.
# BOOT_FAIL, DEADLINE and OUT_OF_MEMORY are terminal failure states as well;
# a job ending in one of them must not be counted as a success.
my %FAILURE_STATES = map {
    $_ => 1
} qw(
    BOOT_FAIL CANCELLED DEADLINE FAILED NODE_FAIL OUT_OF_MEMORY PREEMPTED TIMEOUT
);

# Validate that the delay is a plain number within the acceptable range.
# Anything that is not purely numeric (missing argument, "20abc", "20;cmd")
# is replaced by the minimum delay instead of being used as-is later on.
if (!defined($delay) || $delay !~ /\A\d+(?:\.\d+)?\z/) {
    print STDERR "Sleep period is missing or not a number, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ($delay < 10) {
    print STDERR "Sleep period is too short, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ($delay > 3600) {
    print STDERR "Sleep period is too long, will poll queue once every 60 minutes\n";
    $delay = 3600;
}

print " Waiting for " . scalar( @jobIDs ) . " jobs: @jobIDs\n";

# Set to 1 by update_pending_jobs as soon as any job is seen in a failure state.
my $errorsEncountered = 0;

wait_for_all_jobs_to_complete(@jobIDs);

# Exit status: 1 if any job failed, 0 otherwise.
if ($errorsEncountered) {
    print " No more jobs to run - some jobs had errors\n\n";
    exit 1;
}
else {
    print " No more jobs in queue\n\n";
    exit 0;
}
# Blocks until none of the given jobs is pending any more.
#
# Arguments: the list of job IDs to wait for.
# Returns:   nothing useful; the outcome is communicated through the
#            file-level $errorsEncountered flag set by update_pending_jobs.
#
# Reads the file-level $verbose (progress messages) and $delay (seconds
# between two polls).
sub wait_for_all_jobs_to_complete {
    my @pendingJobs = update_pending_jobs(@_);

    while (@pendingJobs) {
        if ($verbose) {
            my $timestamp = `date`;
            $timestamp = '' if !defined($timestamp);
            chomp $timestamp;
            # The timestamp is passed as an argument, not interpolated into
            # the format, so a '%' in it cannot be read as a conversion.
            printf " (%s) Still waiting for %d jobs\n\n", $timestamp, scalar(@pendingJobs);
        }

        # Perl's built-in sleep keeps the delay away from the shell, so a
        # malformed delay value can neither be executed as a command nor make
        # an external "sleep" fail immediately and turn this into a busy loop.
        # Unlike system("sleep ..."), the built-in does not ignore SIGINT, so
        # ctrl+c still works (no SIGINT handler is installed in the visible code).
        sleep($delay);

        @pendingJobs = update_pending_jobs(@pendingJobs);
    }
}
# Polls the queue once and works out which of the given jobs still have to be
# waited for.
#
# Arguments: the job IDs that were pending before this poll.
# Returns:   the job IDs that are still pending, in their original order, or
#            an empty list when nothing is left to wait for.
#
# A job stops being pending when the status query no longer lists it, when
# its state is in %COMPLETION_STATES, or when its state is in %FAILURE_STATES.
# A failed job is reported on STDERR and sets the file-level
# $errorsEncountered flag.
sub update_pending_jobs {
    my (@jobsToQuery) = @_;

    my %jobStatuses = query_job_statuses(@jobsToQuery);
    if (!@jobsToQuery || !%jobStatuses) {
        # No more jobs remain (an empty query result is treated the same way)
        return ();
    }

    if ($verbose) {
        foreach my $job (sort keys %jobStatuses) {
            my $status = defined($jobStatuses{$job}) ? $jobStatuses{$job} : '';
            print(" Job $job is in state $status\n");
        }
    }

    # Filter by job ID in a single pass.  Removing entries by index from an
    # array that is shrinking at the same time only works when the indices
    # are processed in strictly descending order, which is easy to get wrong
    # and then drops jobs that are still running.
    my @stillPending;
    foreach my $job (@jobsToQuery) {
        # Not listed any more: the job has left the queue.
        next if !exists($jobStatuses{$job});

        my $status = defined($jobStatuses{$job}) ? $jobStatuses{$job} : '';
        next if exists($COMPLETION_STATES{$status});

        if (exists($FAILURE_STATES{$status})) {
            print STDERR " Job $job ended in error state $status\n";
            $errorsEncountered = 1;
            next;
        }

        push @stillPending, $job;
    }

    return @stillPending;
}
# Asks squeue for the current state of the given jobs.
#
# Arguments: the job IDs to query.
# Returns:   a hash (as a flat list) mapping job ID => state name as printed
#            by squeue's "%i,%T" format.  The hash is empty when no job IDs
#            were given, when squeue could not be run, when it did not exit
#            cleanly, or when it printed nothing.
sub query_job_statuses {
    my (@jobsToQuery) = @_;
    my %jobStatuses = ();

    # Nothing to ask about; do not run squeue with an empty --jobs filter.
    return %jobStatuses if !@jobsToQuery;

    my $whoami = `whoami`;
    my $user = defined($whoami) ? trim($whoami) : '';

    # List-form pipe open: the arguments go to squeue directly without a
    # shell, so characters in the job IDs are never interpreted as shell syntax.
    my @command = (
        'squeue',
        '--noheader',
        "--user=$user",
        '--format=%i,%T',
        '--jobs=' . join(',', @jobsToQuery),
    );

    my $squeue;
    if (!open($squeue, '-|', @command)) {
        return %jobStatuses;
    }
    my @outputLines = <$squeue>;
    close($squeue);

    # Check all of $?: the exit code lives in the high byte, so "$? >> 8" is 0
    # for a squeue that was killed by a signal and left truncated output.
    return %jobStatuses if $? != 0;

    foreach my $line (@outputLines) {
        my ($job, $status) = split(",", trim($line));
        # Skip blank lines, which carry no job ID.
        next if !defined($job) || $job eq '';
        $jobStatuses{$job} = defined($status) ? $status : '';
    }

    return %jobStatuses;
}
# Returns a copy of the given string with leading and trailing whitespace
# removed.  The caller's variable is not modified.
sub trim {
    my ($text) = @_;
    # One global substitution strips both ends: the first alternative can only
    # match at the start of the string, the second only at its end.
    $text =~ s/^\s+|\s+$//g;
    return $text;
}
|