#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Validate that preemption by qos is enforced
############################################################################
# Copyright (C) 2011-2014 SchedMD LLC
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
# Names of the accounting entities this test creates. "user" starts out empty
# and is assigned further down in the script.
set user ""
set acct_1 "${test_name}_acct1"
set acct_2 "${test_name}_acct2"
set qos_1 "${test_name}_qos1"
set qos_2 "${test_name}_qos2"

# Batch script submitted by sub_job
set file_in "$test_dir/job_script"

# Number of nodes reported by get_nodes_by_state; every job requests them all
set nodes [llength [get_nodes_by_state]]

set job_id 0
set qos_1_id 0
set qos_2_id 0

# job states
set done_state "DONE"
set pending_state "PENDING"
set preempted_state "PREEMPTED"
set running_state "RUNNING"

# scontrol update directive used to make a job eligible immediately
set eligible_now_job_directive "EligibleTime=now"

#
# Prerequisites: skip the test when the configuration cannot support it
#
if {[get_config_param "PreemptType"] ne "preempt/qos"} {
	skip "This test requires that PreemptType=preempt/qos"
}
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test requires use of Slurmdbd"
}
if {$nodes < 2} {
	skip "Not enough available nodes ($nodes < 2)"
}

# Pull the numeric part out of the MinJobAge setting. If nothing matches,
# min_job_age would be left undefined and the comparison below would abort
# with a raw Tcl error, so report the problem explicitly instead.
if {![regexp "($number)" [get_config_param "MinJobAge"] {} min_job_age]} {
	fail "Unable to determine MinJobAge from the configuration"
}
# NOTE(review): presumably job records must outlive the state checks below
# for at least 10 seconds -- confirm the rationale for this threshold.
if {$min_job_age < 10} {
	skip "MinJobAge configured too low for this test ($min_job_age < 10)"
}
#
# Create a QOS, an account bound to that QOS, and an association adding the
# global $user to that account.
#
# acct_name - account to create
# qos_name  - QOS to create and attach to the account
# pre_qos   - value passed as preempt= for the new QOS
# pre_mode  - value passed as preemptmode= for the new QOS
#
# Calls fail if sacctmgr stops responding or if the three expected
# confirmation messages are not each seen exactly once in total.
#
proc acct_setup { acct_name qos_name pre_qos pre_mode } {
	global user nodes sacctmgr

	set confirmations 0

	foreach step {qos account user} {
		# Launch the sacctmgr command for this step and remember
		# which confirmation text it is expected to print.
		switch -- $step {
			qos {
				spawn $sacctmgr -i create qos $qos_name preempt=$pre_qos preemptmode=$pre_mode maxnodes=$nodes
				set banner "Adding QOS"
			}
			account {
				spawn $sacctmgr -i create account $acct_name qos=$qos_name
				set banner "Adding Account"
			}
			user {
				spawn $sacctmgr -i add user $user account=$acct_name
				set banner "Associations"
			}
		}

		expect {
			-re $banner {
				incr confirmations 1
				exp_continue
			}
			timeout {
				fail "sacctmgr is not responding"
			}
			eof {
				wait
			}
		}
	}

	if {$confirmations != 3} {
		fail "Account was not created properly"
	}
}
#
# Submit the test batch script under the given account, requesting every
# node counted in the global $nodes exclusively.
#
# acct1   - account to charge the job to (-A)
# het_job - when true, submit a heterogeneous job: one component with a
#           single node and a second component with the remaining nodes;
#           otherwise submit a single job spanning all nodes
#
# Returns the job id reported by sbatch. Calls fail if sbatch does not
# respond or does not report a submitted job.
#
proc sub_job { acct1 {het_job false} } {
	global nodes file_in sbatch number

	set job_id 0

	if { $het_job } {
		# Braced so the expression is substituted only once and can be
		# byte-compiled.
		set nodesless1 [expr {$nodes - 1}]
		spawn $sbatch -o/dev/null --exclusive -N1 -A$acct1 : -N$nodesless1 -A$acct1 $file_in
	} else {
		spawn $sbatch -o/dev/null --exclusive -N$nodes -A$acct1 $file_in
	}

	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		timeout {
			fail "sbatch is not responding"
		}
		eof {
			wait
		}
	}

	if { $job_id == 0 } {
		fail "sbatch did not submit job"
	}

	return $job_id
}
#
# Remove the accounts and QOS used by this test.
#
# Waits for the jobs of both accounts via wait_for_account_done, then for
# each account/QOS pair removes the user from the account, deletes the
# account and deletes the QOS. A warning is logged unless exactly four
# delete confirmations ("Deleting ..." or "Nothing deleted") were seen.
#
proc cleanup {} {
	global user acct_1 acct_2 qos_1 qos_2 qos_1_id qos_2_id sacctmgr

	set confirmed 0

	wait_for_account_done $acct_1,$acct_2

	# Walk the account and QOS lists in lock step: (acct_1, qos_1) first,
	# then (acct_2, qos_2).
	foreach acct [list $acct_1 $acct_2] qos [list $qos_1 $qos_2] {
		run_command "$sacctmgr -i remove user $user where account=$acct"

		spawn $sacctmgr delete -i account $acct
		expect {
			-re "(Deleting accounts|Nothing deleted)" {
				incr confirmed 1
				exp_continue
			}
			timeout {
				fail "sacctmgr is not responding"
			}
			eof {
				wait
			}
		}

		spawn $sacctmgr delete -i qos $qos
		expect {
			-re "(Deleting QOS|Nothing deleted)" {
				incr confirmed 1
				exp_continue
			}
			timeout {
				fail "sacctmgr is not responding"
			}
			eof {
				wait
			}
		}
	}

	if { $confirmed != 4 } {
		log_warn "Unable to clean up accounts and qos"
	}
}
#
# Main test body.
#
# For every combination of preempt mode (cancel, requeue) and job type
# (batch, het): run a job under qos_1 on all nodes, submit a job under qos_2
# (which is created with preempt=qos_1) and verify what happens to the first
# job.
#

make_bash_script $file_in "sleep 30"

# The user name must be known before cleanup runs, because cleanup builds a
# "sacctmgr remove user $user ..." command from it.
set user [get_my_user_name]

# Remove anything left behind by an earlier run
log_info "**** Cleanup from previous run ****"
cleanup

# Preempt modes to test
set preempt_mode_cancel "cancel"
set preempt_mode_requeue "requeue"
set preempt_mode_list "$preempt_mode_cancel $preempt_mode_requeue"

# Job types to test for each mode
set job_type_batch "batch"
set job_type_het "het"
set job_type_list "$job_type_batch $job_type_het"

# qos_1 can be preempted; qos_2 is allowed to preempt qos_1
acct_setup $acct_1 $qos_1 "" $preempt_mode_cancel
acct_setup $acct_2 $qos_2 $qos_1 cluster

foreach preempt_mode $preempt_mode_list {
	foreach job_type $job_type_list {
		set mode_type_string [string toupper "$preempt_mode $job_type"]
		log_info "*** TEST PREEMPT $mode_type_string JOB ***"

		# Switch qos_1 to the preempt mode under test
		set mod_qos_vals(preemptmode) $preempt_mode
		mod_qos $qos_1 [array get mod_qos_vals]

		# 0 for a plain batch job, 1 for anything else (the het job)
		set have_het_job [expr {$job_type ne $job_type_batch}]

		# Start the preemptable job and wait until it is running
		set qos_1_id [sub_job $acct_1 $have_het_job]
		wait_for_job -fail $qos_1_id $running_state $have_het_job

		# Start the preempting job; it needs the same nodes
		set qos_2_id [sub_job $acct_2]
		wait_for_job -fail $qos_2_id $running_state

		if { $preempt_mode eq $preempt_mode_cancel } {
			set jobs_ok false
			wait_for {$jobs_ok} {
				set jobs_ok [check_job_state $qos_1_id $preempted_state $have_het_job]
			}
			subtest {$jobs_ok} "Job ($qos_1_id) should be fully in the $preempted_state state"
		} else {
			# Requeue state
			set jobs_ok false
			wait_for {$jobs_ok} {
				set jobs_ok [check_job_state $qos_1_id $pending_state $have_het_job]
			}
			subtest {$jobs_ok} "Job ($qos_1_id) should be fully in the $pending_state state"

			# Make job eligible to run now to avoid delay
			run_command "$scontrol update job $qos_2_id $eligible_now_job_directive"
			wait_for_job -fail $qos_2_id $done_state

			# Wait for requeued job to restart
			# Make job eligible to run now to avoid delay
			run_command "$scontrol update job $qos_1_id $eligible_now_job_directive"
			wait_for_job -fail $qos_1_id $running_state $have_het_job

			set jobs_ok false
			wait_for {$jobs_ok} {
				set jobs_ok [check_job_state $qos_1_id $running_state $have_het_job]
			}
			subtest {$jobs_ok} "Job ($qos_1_id) should be fully in the $running_state state"
		}

		# Let every job of both accounts finish before the next combination
		wait_for_account_done $acct_1,$acct_2
	}
}
|