File: waitForSlurmJobs.pl

package info (click to toggle)
ants 2.5.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 11,672 kB
  • sloc: cpp: 85,685; sh: 15,850; perl: 863; xml: 115; python: 111; makefile: 68
file content (129 lines) | stat: -rwxr-xr-x 3,191 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/perl -w

use strict;

# Usage: waitForSlurmJobs.pl <verbose [1 or 0]> <delay in seconds in range 10-3600> [job IDs]
#
#
# Takes as args a string of sbatch job IDs and periodically monitors them. Once they all finish, it returns 0
#
# If any of the jobs go into error state, an error is printed to stderr and the program waits for the non-error
# jobs to finish, then returns 1

# Command-line arguments: verbosity flag, polling interval in seconds, and
# then the IDs of the jobs to monitor.
my ( $verbose, $delay, @jobIDs ) = @ARGV;

# squeue state names (as printed by the %T format) that mean a job finished
# successfully.
my %COMPLETION_STATES = map { $_ => 1 } qw( COMPLETED );

# squeue state names that mean a job ended without completing successfully.
# BOOT_FAIL, DEADLINE and OUT_OF_MEMORY are terminal states too; without them
# a job ending that way would never be counted as an error.
my %FAILURE_STATES = map { $_ => 1 } qw(
    BOOT_FAIL CANCELLED DEADLINE FAILED NODE_FAIL OUT_OF_MEMORY PREEMPTED TIMEOUT
);

# Validate the polling interval before it is interpolated into the shell
# "sleep" command. A missing or non-numeric value would make that command
# fail immediately and turn the wait loop into a tight polling loop, so such
# values fall back to the minimum interval.
if (!defined($delay) || $delay !~ /\A[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?\z/) {
    print STDERR "Sleep period is missing or not a number, will poll queue once every 10 seconds\n";
    $delay = 10;
}

# Normalise to a plain number (e.g. "1e2" becomes 100, "+15" becomes 15).
$delay += 0;

# Clamp the interval to the accepted range of 10 seconds to 60 minutes.
if ($delay < 10) {
    print STDERR "Sleep period is too short, will poll queue once every 10 seconds\n";
    $delay = 10;
} elsif ($delay > 3600) {
    print STDERR "Sleep period is too long, will poll queue once every 60 minutes\n";
    $delay = 3600;
}


# Announce the jobs being monitored, block until none of them is pending any
# more, then report the outcome through the exit status (0 = all fine,
# 1 = at least one job was seen in a failure state).
printf "  Waiting for %d jobs: %s\n", scalar(@jobIDs), "@jobIDs";

# Flag raised by update_pending_jobs when a job is found in a failure state.
my $errorsEncountered = 0;

wait_for_all_jobs_to_complete(@jobIDs);

unless ($errorsEncountered) {
    print "  No more jobs in queue\n\n";
    exit 0;
}

print "  No more jobs to run - some jobs had errors\n\n";
exit 1;



# Blocks until none of the given job IDs is reported as pending any more.
#
# Arguments: the list of job IDs to monitor.
#
# The pending list is refreshed through update_pending_jobs once up front and
# then once after every sleep of $delay seconds. In verbose mode a timestamped
# progress line is printed before each sleep.
sub wait_for_all_jobs_to_complete {
    my @remainingJobs = update_pending_jobs(@_);

    while (scalar(@remainingJobs) > 0) {
        if ($verbose) {
            chomp(my $now = `date`);
            printf "  ($now) Still waiting for %d jobs\n\n", scalar(@remainingJobs);
        }

        # Sleep through a backtick command rather than system() so that a
        # ctrl+c still interrupts the script.
        qx/sleep $delay/;

        @remainingJobs = update_pending_jobs(@remainingJobs);
    }

    return;
}

# Queries the current state of the given jobs and returns the ones that are
# still pending, in their original order.
#
# Arguments: the list of job IDs that were pending at the previous poll.
# Returns:   the job IDs that squeue still reports in a state that is neither
#            a completion state nor a failure state. An empty list is returned
#            when no jobs were given or when the query yields no statuses.
#
# Side effects: sets $errorsEncountered and prints a message to STDERR for
# every job found in one of the %FAILURE_STATES; in verbose mode prints the
# state of every job reported by the query.
sub update_pending_jobs {
    my (@jobsToQuery) = @_;

    # Nothing to monitor, so there is no need to run squeue at all.
    return () unless @jobsToQuery;

    my %jobStatuses = query_job_statuses(@jobsToQuery);

    # No statuses at all: no more jobs remain.
    return () unless %jobStatuses;

    if ($verbose) {
        foreach my $job (sort keys %jobStatuses) {
            print("    Job $job is in state $jobStatuses{$job}\n");
        }
    }

    # Build the list of survivors directly instead of deleting by index:
    # removing positions from the array in unsorted order shifts the later
    # positions and drops the wrong jobs.
    my @stillPending;
    foreach my $job (@jobsToQuery) {
        # A job that is no longer listed has left the queue.
        next unless exists $jobStatuses{$job};

        my $status = defined($jobStatuses{$job}) ? $jobStatuses{$job} : '';

        next if exists $COMPLETION_STATES{$status};

        if (exists $FAILURE_STATES{$status}) {
            $errorsEncountered = 1;
            print STDERR "  Job $job ended in error state $status\n";
            next;
        }

        push @stillPending, $job;
    }

    return @stillPending;
}

# Runs squeue for the given job IDs and returns a hash mapping each job ID it
# lists to its state name.
#
# Arguments: the list of job IDs to look up.
# Returns:   a (job ID => state) hash built from the "%i,%T" output lines, or
#            an empty hash when squeue exits with a non-zero status.
sub query_job_statuses {
    my (@jobsToQuery) = @_;

    my $user = trim(`whoami`);
    my $jobList = join(',', @jobsToQuery);
    my $squeueOutput = `squeue --noheader --user="$user" --format="%i,%T" --jobs=$jobList`;
    my $exitcode = $? >> 8;

    my %jobStatuses = ();
    return %jobStatuses if $exitcode != 0;

    # Each output line has the form "<job id>,<state>".
    foreach my $line (split("\n", trim($squeueOutput))) {
        my @fields = split(",", $line);
        $jobStatuses{$fields[0]} = $fields[1];
    }

    return %jobStatuses;
}

# Returns a copy of the given string with leading and trailing whitespace
# removed; whitespace inside the string is left alone.
sub trim {
    my ($string) = @_;

    # Strip a whitespace run at the start and a whitespace run at the end.
    $string =~ s/^\s+|\s+$//g;

    return $string;
}