1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454
|
#!/usr/bin/env expect
############################################################################
# Purpose: Test for accounting records of specific job names with their ID
############################################################################
# Copyright (C) 2015 SchedMD LLC.
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
#
# Supplemental function to test21.21 that tests a job with
# resources within the allowed limit in the association
#
# test_type - name of the limit being tested (e.g. "maxcpus", "maxcpumins",
#             "maxnode")
# limit     - list whose element 0 is the srun option text and whose
#             element 1 is the value appended directly to that option
#
# Sets the global is_skip and returns early when the partition or select
# plugin configuration cannot run the test.
#
proc inc21_21_good { test_type limit } {
    global number bin_id ta srun test_node is_skip re_word_str

    set job_id 0
    set add ""

    # Wait for old jobs to clean up
    sleep 2

    log_info "====== Test $test_type ======"

    if {($test_type eq "maxcpus" || $test_type eq "maxcpumins") && [default_part_exclusive] != 0} {
        log_warn "Unable to perform test with exclusive node allocations"
        set is_skip 1
        return
    }

    set select_type_param [get_select_type_params]
    if { [string first "CR_SOCKET" $select_type_param] != -1} {
        log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
        set is_skip 1
        return
    }

    # The maxnode run asks for exclusive node access; every other run is
    # directed at the designated test node.
    if {$test_type eq "maxnode"} {
        set add "--exclusive"
    } else {
        set add "-w$test_node"
    }

    set matches 0
    spawn $srun -v -t1 $add [lindex $limit 0][lindex $limit 1] \
        --account=$ta $bin_id
    expect {
        -re "launching StepId=($number)\\.$re_word_str" {
            # Remember the job id reported by srun's verbose output
            set job_id $expect_out(1,string)
            incr matches
            exp_continue
        }
        timeout {
            fail "srun not responding"
        }
        eof {
            wait
        }
    }

    # Only wait when srun actually reported a job. job_id stays 0 when no
    # step was launched, and there is no job 0 to wait for; that case is
    # reported by the subtest below.
    if {$job_id != 0} {
        wait_for_job -fail $job_id "DONE"
    }

    subtest -fail { $matches == 1 } "Job launches with correct limit"
}
#
# Supplemental function to test21.21 that tests a job with
# resources larger than the allowed limit in the association
#
# test_type - name of the limit being tested; "maxnode" requests an
#             exclusive allocation, anything else targets test_node
# limit     - list whose element 0 is the srun option text and whose
#             element 1 is the configured limit (the job requests one
#             more than this value)
#
proc inc21_21_bad { test_type limit } {
    global number bin_id ta srun test_node re_word_str

    # Build the over-limit request: option text followed by limit + 1
    lassign $limit limit_opt limit_val
    set request ${limit_opt}[expr {$limit_val + 1}]

    log_info "====== Test $test_type ======"

    set placement "-w$test_node"
    if {$test_type eq "maxnode"} {
        set placement "--exclusive"
    }

    # Stays 0 unless srun unexpectedly launches a step
    set launched_id 0

    spawn $srun -v $placement -t1 $request --account=$ta -I $bin_id
    expect {
        -re "Job violates accounting/QOS policy" {
            log_info "This error is expected, not a problem"
            exp_continue
        }
        -re "launching StepId=($number)\\.$re_word_str" {
            set launched_id $expect_out(1,string)
            fail "Job ($launched_id) should not have run"
        }
        timeout {
            fail "srun not responding"
        }
        eof {
            wait
        }
    }

    # Nothing to wait for when the job was rejected as expected
    if {$launched_id == 0} {
        return
    }
    wait_for_job -fail $launched_id "DONE"
}
#
# Supplemental function to test21.21 that tests a group (grp*) limit:
# jobs up to the limit are expected to run and one extra job to pend
#
# test_type - name of the limit being tested; "grpcpus", "grpcpumins" and
#             "grpcpurunmins" are treated as CPU tests, any other name
#             makes the batch jobs exclusive
# limit     - list whose element 0 is the option text placed in the
#             submit_job arguments and whose element 1 is the limit value
#
# Sets the global is_skip and returns early when the configuration cannot
# run the test.
#
proc inc21_21_grp_test { test_type limit } {
    global ta selectparam nthreads is_skip
    global file_in squeue scancel job_list

    log_info "===== Test $test_type ====="

    if {$test_type eq "grpcpumins" &&
        ![param_contains [get_config_param "AccountingStorageEnforce"] "safe"]} {
        log_warn "This test can't be run without AccountingStorageEnforce having \"safe\" in it"
        set is_skip 1
        return
    }

    if {[default_part_exclusive] != 0} {
        log_warn "This test can't be run Exclusive node allocations"
        set is_skip 1
        return
    }

    if {[string first "CR_SOCKET" [get_select_type_params]] != -1} {
        log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
        set is_skip 1
        return
    }

    set limit_opt [lindex $limit 0]
    set limit_val [lindex $limit 1]

    # val is the number of jobs expected to fit under the limit
    if {$test_type in {grpcpus grpcpumins grpcpurunmins}} {
        # CPU test: no exclusive directive; scale by threads when
        # selectparam is set
        set excl_directive ""
        if {$selectparam} {
            set val [expr {$limit_val / $nthreads}]
        } else {
            set val $limit_val
        }
    } else {
        set excl_directive "#SBATCH --exclusive"
        set val $limit_val
    }

    make_bash_script $file_in "\n$excl_directive\nsleep 10"

    # Everything after the time limit is shared by all submissions
    set common_args "${limit_opt}1 --account=$ta --output=/dev/null --error=/dev/null $file_in"

    #
    # Submit the jobs that fit below the association limit first and wait
    # for all of them to start before submitting the over-limit job.
    # Submitting everything at once can let a later job start ahead of an
    # earlier one, e.g. while an epilog is still in progress on the
    # assigned nodes.
    #
    set fitting_ids [list]
    for {set count 0} {$count < $val} {incr count} {
        set new_id [submit_job -fail "-t1 $common_args"]
        lappend fitting_ids $new_id
        lappend job_list $new_id
    }

    # Wait for the expected jobs to start running
    foreach fitting_id $fitting_ids {
        wait_for -fail -timeout 30 -pollinterval .2 {$state eq "RUNNING"} {
            set state [get_job_param $fitting_id "JobState"]
        }
    }

    #
    # One more job, which should pend because it is past the association
    # limit. It gets a longer time limit so that it does not start
    # prematurely after _decay_thread() runs and decays the values of the
    # other running jobs.
    #
    lappend job_list [submit_job -fail "-t$val $common_args"]

    # Count pending and running jobs of the test account
    set pending 0
    set running 0
    spawn $squeue -A $ta -h -o "%t %r"
    expect {
        -re "PD." {
            incr pending
            exp_continue
        }
        -re "R.None" {
            incr running
            exp_continue
        }
        timeout {
            fail "squeue not responding"
        }
        eof {
            wait
        }
    }

    subtest -fail { $pending == 1 && $running == $val } "$test_type limit" "Found $pending jobs pending and $running jobs running while expecting 1 and $val"

    #
    # Cancel test jobs
    #
    spawn $scancel --quiet --account=$ta
    expect {
        eof {
            wait
        }
    }
}
#
# Supplemental function to test21.21 that tests the max/grp
# submit and jobs limits, first with plain jobs and then with job arrays
#
# limit - "maxjobsub" selects the maxjob/maxsubmit pair; any other value
#         selects grpjob/grpsubmit. acct_mod_assoc_vals($limit) is read
#         as a list: element 0 is the running-job limit, element 1 the
#         submit limit.
#
# NOTE: the squeue patterns below reference job_id(0) through job_id(3)
# and expect exactly 4 matches, i.e. 2 running and 2 pending jobs.
#
# Sets the global is_skip and returns early for "grpjobsub" when the
# default partition is exclusive.
#
proc inc21_21_submit_test { limit } {
    global file_in srun sbatch squeue scancel bin_id number bin_sleep is_skip
    global bin_rm ta maxjob_lim maxsub_lim
    global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals
    global acct_mod_assoc_test_vals job_list

    set limit_job ""
    set limit_sub ""

    if {$limit eq "grpjobsub" && [default_part_exclusive] != 0} {
        log_warn "Unable to perform test with exclusive node allocations"
        set is_skip 1
        return
    }

    if {$limit eq "maxjobsub"} {
        set limit_job "maxjob"
        set limit_sub "maxsubmit"
    } else {
        set limit_job "grpjob"
        set limit_sub "grpsubmit"
    }

    # Apply the job and submit limits to the test account
    set acct_mod_assoc_test_vals($limit_job) \
        [lindex $acct_mod_assoc_vals($limit) 0]
    set acct_mod_assoc_test_vals($limit_sub) \
        [lindex $acct_mod_assoc_vals($limit) 1]
    if {[mod_acct $ta [array get acct_mod_desc] \
             [array get acct_mod_assoc_test_vals] \
             [array get acct_mod_acct_vals]]} {
        fail "Unable to modify account ($ta)"
    }

    make_bash_script $file_in "
$bin_sleep 10
"

    # Test to make sure that the grpsubmit and maxsubmit
    # are enforced with jobs
    log_info "==== Test $limit ===="

    # Submit jobs to test the limit set in the association
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
        set job_id($inx) [submit_job -fail "-N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
        if { !$job_id($inx) } {
            fail "sbatch didn't return jobid"
        }
        lappend job_list $job_id($inx)

        # We need to sleep between submissions because of the way the
        # scheduler works
        sleep 1
    }

    # Wait for the allowed jobs to start running
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_job)} {incr inx} {
        wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
    }

    # Then submit one more over the limit and it should fail
    set result [run_command "$sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
    subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
    subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

    # Count jobs in the expected state: the first two running, the last
    # two pending on the job limit
    set matches 0
    spawn $squeue -A$ta -h -o "\%i \%t \%r"
    expect {
        -re "($job_id(2)|$job_id(3)).PD.AssocMaxJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(2)|$job_id(3)).PD.AssocGrpJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(0)|$job_id(1)).R.None" {
            incr matches
            exp_continue
        }
        timeout {
            fail "squeue not responding"
        }
        eof {
            wait
        }
    }

    spawn $scancel --quiet --account=$ta
    expect {
        eof {
            wait
        }
    }

    # With an exclusive default partition a partial result only warns
    # for the maxjobsub case
    if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
        log_warn "Only started $matches of 4 possible jobs"
    } elseif { $matches != 4 } {
        fail "Jobs are not in the expected state (found $matches matching jobs, expected 4)"
    }

    # Test to make sure that the grpsubmit and maxsubmit
    # are enforced with job arrays
    log_info "==== Test $limit with job arrays ===="

    # Submit jobs to test the limit set in the association
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
        set job_id($inx) [submit_job -fail "-N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
        if { !$job_id($inx) } {
            fail "sbatch didn't return jobid"
        }
        lappend job_list $job_id($inx)

        # We need to sleep between submissions because of the way the
        # scheduler works
        sleep 1
    }

    # Wait for the allowed job arrays to start running
    for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_job)} {incr inx} {
        wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
    }

    # Then submit one more over the limit and it should fail
    set result [run_command "$sbatch -N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
    subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
    subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

    # Pending array jobs are matched as <id>_[0], running ones as <id>_0
    set matches 0
    spawn $squeue -A$ta -h -o "\%i \%t \%r"
    expect {
        -re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocMaxJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocGrpJobsLimit" {
            incr matches
            exp_continue
        }
        -re "($job_id(0)|$job_id(1))_0.R.None" {
            incr matches
            exp_continue
        }
        timeout {
            fail "squeue not responding"
        }
        eof {
            wait
        }
    }

    spawn $scancel --quiet --account=$ta
    expect {
        eof {
            wait
        }
    }

    if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
        log_warn "Only started $matches of 4 possible jobs"
    } elseif { $matches != 4 } {
        fail "Jobs are not in the expected state (found $matches matching jobs, expected 4)"
    }

    # Clear the limits in the shared array; this proc does not call
    # mod_acct again, so the account itself is not modified here
    set acct_mod_assoc_test_vals($limit_job) "-1"
    set acct_mod_assoc_test_vals($limit_sub) "-1"
}
#
# Function that tests an association's grpwall limit
#
# test_type - name of the limit being tested (used for logging only)
# limit     - list whose element 1 is the grpwall limit; the log message
#             below reports it in minutes
#
# Runs a fixed number of jobs sized so that together they use about 110%
# of the limit, then checks that one further job pends with reason
# AssocGrpWallLimit.
#
proc inc21_21_grpwall { test_type limit } {
    global bin_sleep ta test_qos job_list

    set timeout 120
    set submitted [list]

    # Each job takes an equal share of 1.1 times the limit
    set job_count 5.0
    set wall_limit [lindex $limit 1]
    set share_mins [expr {$wall_limit * 1.1 / $job_count}]
    set run_secs [expr {int(ceil($share_mins * 60))}]
    set time_limit [expr {int(ceil($share_mins))}]

    log_info "====== Test $test_type ======"

    # Wait for old jobs to clean up
    sleep 2

    # Since wall is a decayed variable lets reset it to make sure the test
    # gets exactly what we would expect.
    reset_qos_usage "" $test_qos

    # Every submission, including the final one, uses the same arguments
    set submit_args "--account=$ta -N1 -t$time_limit --wrap '$bin_sleep $run_secs' -o /dev/null -e /dev/null"

    log_debug "Running $job_count jobs of $run_secs seconds of duration to ensure that we reach the Grpwall limit of $wall_limit minutes"
    while {[llength $submitted] < $job_count} {
        set new_id [submit_job -fail $submit_args]
        lappend submitted $new_id
        lappend job_list $new_id
    }

    foreach done_id $submitted {
        if {[wait_job_reason $done_id COMPLETED] != $::RETURN_SUCCESS} {
            fail "Job ($done_id) did not complete"
        }
    }

    log_debug "Submitting the final job and check that it is set Pending with Reason AssocGrpWallLimit"
    set final_id [submit_job -fail $submit_args]
    lappend submitted $final_id
    lappend job_list $final_id

    # Subtest of the limit
    set reason_rc [wait_job_reason $final_id PENDING AssocGrpWallLimit]
    if {$reason_rc != $::RETURN_SUCCESS} {
        cancel_job $submitted
        fail "Job should not have run"
    }

    # Cancel jobs
    cancel_job $submitted
}
|