File: waitForSGEQJobs.pl

package info (click to toggle)
ants 2.1.0-5
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 10,656 kB
  • sloc: cpp: 84,137; sh: 11,419; perl: 694; xml: 115; makefile: 74; python: 48
file content (142 lines) | stat: -rwxr-xr-x 3,228 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/perl -w

use strict;

# Usage: waitForSGEQJobs.pl <verbose [1 or 0]> <delay in seconds in range 10-3600> [job IDs]
#
#
# Takes as args a string of qsub job IDs and periodically monitors them. Once they all finish, it returns 0
#
# If any of the jobs go into error state, an error is printed to stderr and the program waits for the non-error
# jobs to finish, then returns 1
#

# Usual qstat format - check this at run time
# job-ID  prior   name       user         state submit/start at     queue                          slots ja-task-ID


# First thing to do is parse our input

my ($verbose, $delay, @jobIDs) = @ARGV;

# Validate the polling interval before it is used anywhere. A missing or
# non-numeric value falls back to the minimum period instead of producing
# "uninitialized" / "isn't numeric" warnings and an unchecked shell argument.
if (!defined($delay) || $delay !~ m/\A\d+(?:\.\d+)?\z/) {
    print STDERR "Sleep period is missing or not a number, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ($delay < 10) {
    print STDERR "Sleep period is too short, will poll queue once every 10 seconds\n";
    $delay = 10;
}
elsif ($delay > 3600) {
    print STDERR "Sleep period is too long, will poll queue once every 60 minutes\n";
    $delay = 3600;
}

# The built-in sleep() works in whole seconds
$delay = int($delay);

print "  Waiting for " . scalar(@jobIDs) . " jobs: @jobIDs\n";

if (!scalar(@jobIDs)) {
    # Nothing to do
    exit 0;
}

# whoami output ends with a newline; strip it so that it does not end up
# inside the qstat command line
my $user = `whoami`;
$user = "" unless defined($user);
chomp $user;

# Without a user name qstat cannot be queried, and an empty qstat result
# would be mistaken for "all jobs finished"
if ($user eq "") {
    die "Cannot determine user name from whoami, cannot monitor jobs\n";
}

my $qstatOutput = `qstat -u $user`;

if (!$qstatOutput) {
    # Nothing in the queue for this user, so nothing to wait for
    exit 0;
}

my @qstatLines = split("\n", $qstatOutput);

# The first line of qstat output is expected to be the column header
my $headerLine = defined($qstatLines[0]) ? $qstatLines[0] : "";

my @header = split('\s+', trim($headerLine));

# Position in qstat output of tokens we want
my $jobID_Pos = -1;
my $statePos = -1;

foreach my $i (0..$#header) {
    if ($header[$i] eq "job-ID") {
        $jobID_Pos = $i;
    }
    elsif ($header[$i] eq "state") {
        $statePos = $i;
    }
}


# If we can't parse the job IDs, something is very wrong
if ($jobID_Pos < 0 || $statePos < 0) {
    die "Cannot find job-ID and state field in qstat output, cannot monitor jobs\n";
}


# Number of monitored jobs still listed by qstat in a non-error state.
# Starts at 1 so that the polling loop runs at least once.
my $jobsIncomplete = 1;

# Set to 1 for any job in an error state
my $haveErrors = 0;

# Jobs whose error state has already been reported on stderr, so that each
# one is reported once rather than on every poll
my %errorReported;

while ($jobsIncomplete) {

    # Jobs that are still showing up in qstat
    $jobsIncomplete = 0;

    foreach my $job (@jobIDs) {
        # iterate over all user jobs in the queue
      qstatLine: foreach my $line (@qstatLines) {

            # trim string for trailing white space so that the tokens are in the correct sequence
            # We are being paranoid by matching tokens to job-IDs this way. Less elegant than a
            # match but also less chance of a false-positive match
            my @tokens = split('\s+', trim($line));

            # Blank or short lines have no job-ID token; skip them rather than
            # comparing an undefined value
            next qstatLine unless defined($tokens[$jobID_Pos]);
            next qstatLine unless $tokens[$jobID_Pos] eq $job;

            my $state = defined($tokens[$statePos]) ? $tokens[$statePos] : "";

            # Check status. Jobs in an error state are not waited for.
            if ($state =~ m/E/) {
                $haveErrors = 1;

                if (!$errorReported{$job}) {
                    print STDERR "  Job $job is in error state $state\n";
                    $errorReported{$job} = 1;
                }
            }
            else {
                $jobsIncomplete = $jobsIncomplete + 1;
            }

            if ($verbose) {
                print "    Job $job is in state $state\n";
            }

            # Only the first line matching this job ID is considered
            last qstatLine;
        }

    }


    if ($jobsIncomplete) {
        if ($verbose) {
            my $timestamp = `date`;
            chomp $timestamp;
            print "  ($timestamp) Still waiting for $jobsIncomplete jobs\n\n";
        }

        # The built-in sleep avoids passing a command-line argument through the
        # shell; ctrl+c still works because the signal is delivered to this
        # process directly.
        sleep($delay);

        # NOTE: empty or missing qstat output yields no lines, so the next pass
        # finds no jobs and the loop ends.
        $qstatOutput = `qstat -u $user`;
        $qstatOutput = "" unless defined($qstatOutput);
        @qstatLines = split("\n", $qstatOutput);
    }

}

if ($haveErrors) {
    print "  No more jobs to run - some jobs had errors\n\n";
    exit 1;
}
else {
    print "  No more jobs in queue\n\n";
    exit 0;
}




sub trim {

    # Work on a private copy so the caller's value is never modified
    my $text = shift;

    # Topicalize the copy and strip whitespace from both ends
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}