#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Increase size of job with allocated MPS
############################################################################
# Copyright (C) 2019 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
set file_in1 "$test_dir/input1"
set file_in2 "$test_dir/input2"
set file_in3 "$test_dir/input3"
set file_out1 "$test_dir/output1"
set file_out2 "$test_dir/output2"
set job_id1 0
set job_id2 0
if {![param_contains [get_config_param "SchedulerParameters"] "permit_job_expansion"]} {
skip "This test is only compatible with SchedulerParameters=permit_job_expansion"
}
if {![check_config_select "cons_tres"]} {
skip "This test is only compatible with select/cons_tres"
}
if {[get_config_param "FrontendName"] ne "MISSING"} {
skip "This test is incompatible with front-end systems"
}
set constrain_devices [expr {[get_config_param "ConstrainDevices"] eq "yes"}]
if {$constrain_devices} {
log_debug "Devices files are constrained by cgroups"
} else {
log_debug "Devices files are NOT constrained by cgroups"
}
if {[llength [get_nodes_by_request "--gres=mps:100 -N2 -t2"]] == 0} {
skip "This test requires being able to submit job with --gres=mps:100 -N2"
}
proc cleanup {} {
global job_id1 job_id2 bin_rm
cancel_job [list $job_id1 $job_id2]
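# The scontrol resize operation creates slurm_job_<jobid>_resize.sh and .csh
# helper scripts in the working directory; remove any left behind by job 2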
if {$job_id2 > 0} {
exec $bin_rm -f slurm_job_${job_id2}_resize.csh
exec $bin_rm -f slurm_job_${job_id2}_resize.sh
}
}
#
# Build input scripts
# file_in1: Determine MPS allocated, wait for dependent job to exit,
# expand allocation and run another job
# file_in2: Determine MPS allocated, shrink the allocation to zero nodes and exit
# file_in3: Print the hostname and MPS info
#
exec $bin_rm -f $file_out1 $file_out2
make_bash_script $file_in1 "
$scontrol -dd show job \${SLURM_JOBID}
$srun $file_in3
sleep 20 # Wait for job 2 submission
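# Poll until the dependent child job has finished and left the queue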
while true; do
$squeue -h -n test_child_$test_id | wc | grep -e ' *0 *0 *0'
if \[ \${?} -eq 0 \]; then
break
fi
sleep 5
done
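# Grow this job to absorb the resources released by the child job, then source
# the generated resize script so the SLURM_* environment reflects the new size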
$scontrol update JobId=\${SLURM_JOBID} NumNodes=ALL
. slurm_job_\${SLURM_JOBID}_resize.sh
$scontrol -dd show job \${SLURM_JOBID}
$srun $file_in3
$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.csh
$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.sh
exit 0"
make_bash_script $file_in2 "
$scontrol -dd show job \${SLURM_JOBID}
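# Shrink to zero nodes, handing every resource back so that job 1 can expand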
$scontrol update JobId=\${SLURM_JOBID} NumNodes=0
. slurm_job_\${SLURM_JOBID}_resize.sh
# JOB GETS CANCELLED HERE AS BATCH HOST GETS REMOVED FROM JOB ALLOCATION
$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.csh
$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.sh
exit 0"
# NOTE: We pipe stderr from slurmd and discard stdout
make_bash_script $file_in3 "
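# slurmd -G logs the configured GRES to stderr; keep only the mps line whose
# Index matches the device exposed through CUDA_VISIBLE_DEVICES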
$slurmd -N \$SLURMD_NODENAME -G 2>&1 >/dev/null | grep 'Gres Name=mps' | grep 'Index='\$CUDA_VISIBLE_DEVICES
echo 'HOST:'\$SLURMD_NODENAME 'CUDA_VISIBLE_DEVICES:'\$CUDA_VISIBLE_DEVICES 'CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:'\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
#
# Submit job to expand: uses 10 gres/mps on one node
#
spawn $sbatch -N1 --exclusive -J $test_name -t2 --gres=mps:10 --output=$file_out1 $file_in1
expect {
-re "Submitted batch job ($number)" {
set job_id1 $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch not responding"
}
eof {
wait
}
}
if {$job_id1 == 0} {
fail "Job 1 not submitted"
}
wait_for_job -fail $job_id1 "RUNNING"
#
# Submit job to give up resources: uses 10 gres/mps on one node
#
spawn $sbatch -N1 --exclusive -J "test_child_$test_id" --dependency=expand:$job_id1 -t1 --gres=mps:10 --output=$file_out2 $file_in2
expect {
-re "Submitted batch job ($number)" {
set job_id2 $expect_out(1,string)
exp_continue
}
timeout {
fail "sbatch not responding"
}
eof {
wait
}
}
if {$job_id2 == 0} {
fail "Job 2 not submitted"
}
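# Wait for both jobs to finish: job 2 ends once its batch host is removed from
# the shrunken allocation, job 1 ends after it expands and re-runs the probe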
wait_for_job -fail $job_id1 "DONE"
wait_for_job -fail $job_id2 "DONE"
#
# Parse the output file from job 1
#
log_info "Parse job 1 output"
wait_for_file -fail $file_out1
set match 0
set percentage -1
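# Expect 3 matching lines: one from the first srun (single node) plus one per
# node (two) from the second srun after the allocation is expanded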
spawn $bin_cat $file_out1
expect {
-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
incr match
set percentage $expect_out(2,string)
exp_continue
}
eof {
wait
}
}
if {$match != 3} {
fail "Bad CUDA information about job 1 ($match != 3)"
}
#
# If devices are not constrained and the Count on all allocated devices is the
# same, then confirm that the CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value above is correct
#
if {$constrain_devices == 0} {
set count -1
log_user 0
spawn $bin_cat $file_out1
expect {
-re "Count=($number)" {
if {$count == -1} {
set count $expect_out(1,string)
} elseif {$count != $expect_out(1,string)} {
set count -1
}
exp_continue
}
eof {
wait
}
}
log_user 1
if {$count > 0} {
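# The job requested gres/mps:10, so with Count shares per device the expected
# thread percentage is 10 * 100 / Count (e.g. a Count of 100 yields 10 percent)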
set count [expr {10 * 100 / $count}]
if {$percentage != $count} {
fail "Bad CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value ($percentage != $count)"
} else {
log_debug "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value is good"
}
}
}
#
# Parse the output file from job 2
# Not currently looking for anything, but log its contents before they are purged
#
log_info "Parse job 2 output"
wait_for_file -fail $file_out2
spawn $bin_cat $file_out2
expect {
eof {
wait
}
}