1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
|
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test sacct functionality and accuracy.
############################################################################
# Copyright (C) 2005 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# Files created or consumed by this test
set file_in   "$test_dir/input"
set file_out  "$test_dir/output"
set file_err  "$test_dir/error"
set file_prog "$test_name.prog"
set file_stem $test_name

# Identifiers and counters filled in while the test runs
set job_id  0
set step_id 0
set matches 0

# Job parameters (mem_size is handed to the test program as its <mem_kb> argument)
set mem_size   102400; # 100 MiB
set file_size  10485760
set sleep_time 32
set ret_code   42
set num_tasks  3

# Accepted deviations when checking the accounting values
set max_rss_tolerance 4300; # 4.2 MiB
set ave_rss_tolerance 10240; # 10 MiB
set max_time_error    10

# Per-CPU memory request for the job, derived from the program's memory size
# plus the MaxRSS tolerance and a fixed margin of 10
set job_mem_limit [expr {($mem_size + $max_rss_tolerance) / 1024 + 10}]

# Expected values in sacct; they differ when KillOnBadExit is set to 1
if {[get_config_param "KillOnBadExit"] == 1} {
	set expected_state    "CANCELLED"
	set expected_ret_code "0:15"
} else {
	set expected_state    "FAILED"
	set expected_ret_code $ret_code
}
#
# Check requirements: skip the test on configurations it cannot run on
#

# Front-end systems are not supported
if {[get_config_param "FrontendName"] ne "MISSING"} {
	skip "This test is incompatible with front-end systems"
}

# Both job and step records are needed
set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
if {
	[param_contains $accounting_storage_enforce "nosteps"] ||
	[param_contains $accounting_storage_enforce "nojobs"]
} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
}

# Only the linux and cgroup gather plugins are accepted
if {!([get_config_param JobAcctGatherType] == "jobacct_gather/linux" ||
      [get_config_param JobAcctGatherType] == "jobacct_gather/cgroup")} {
	skip "Job accounting information not gathered on this system"
}

# Accounting records must be stored somewhere
if {[get_config_param AccountingStorageType] eq "accounting_storage/none"} {
	skip "Job accounting information not stored on this system"
}

# The cluster must be able to satisfy the job's task, memory and time request
set nodes [get_nodes_by_request "-n$num_tasks --mem-per-cpu=$job_mem_limit -t2"]
if {[llength $nodes] == 0} {
	skip "Unable to test with current node configuration"
}
# Delete the compiled test program and the <file_stem>.read / <file_stem>.write
# data files. Takes no arguments; file names come from the global variables
# file_prog and file_stem.
proc cleanup {} {
	global file_prog file_stem

	foreach leftover [list $file_prog ${file_stem}.read ${file_stem}.write] {
		file delete $leftover
	}
}
# Query MaxRSS, MaxRSSTask and AveRSS of the job step and evaluate them.
#
# Arguments:
#	prog	Accounting command to run (the script passes $sstat or $sacct)
#
# Returns:
#	A list of subtest dicts, each with the keys cond (result of the check),
#	desc (description) and diag (diagnostic text). If the command output
#	cannot be parsed, only the output-format subtest is returned.
proc _get_mem {prog} {
	global number mem_size job_id step_id max_rss_tolerance ave_rss_tolerance num_tasks

	set result [list]
	set prog_base [file rootname [file tail $prog]]

	set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format maxrss,maxrsstask,averss --noconvert"]

	# Subtest 1: the three numeric fields must be present
	dict set subtest cond [regexp "($number)\\D($number)\\D($number)" $output - max_rss mem_task ave_rss]
	dict set subtest desc "$prog_base should provide the right output format"
	dict set subtest diag "$output"
	lappend result $subtest
	if {![dict get $subtest cond]} {
		# Nothing was parsed, so the value checks below cannot be performed
		return $result
	}

	set max_rss [scale_to_ks $max_rss ""]
	# Since we will be multiplying ave_rss by num_tasks below and expecting it to be greater than
	# max_rss, we must add 1 to ave_rss to compensate for the small potential truncation loss that
	# can occur in the slurm code when calculating AveRSS (effectively int(TotRSS / tasks)).
	set ave_rss [scale_to_ks [expr {$ave_rss + 1}] ""]

	# Subtest 2: MaxRSS must be close to the memory size requested from the program
	dict set subtest cond [tolerance $mem_size $max_rss $max_rss_tolerance]
	dict set subtest desc "MaxRSS is within expected tolerance for $prog_base"
	dict set subtest diag "Observed MaxRSS ($max_rss) is not within tolerance $max_rss_tolerance of expected value ($mem_size)"
	lappend result $subtest

	# Subtest 3: we expect ave_rss to be greater than or equal to max_rss / num_tasks.
	# We are assuming that the non-memory tasks will not climb above a total of 10 MiB.
	dict set subtest cond [tolerance $max_rss [expr {$ave_rss * $num_tasks}] "+$ave_rss_tolerance"]
	dict set subtest desc "AveRSS is within expected tolerance for $prog_base"
	# Report the expected value using num_tasks, matching the check above,
	# rather than a hard-coded task count
	dict set subtest diag "Observed AveRSS ($ave_rss) is not within tolerance +$ave_rss_tolerance of expected value ([expr {$max_rss / $num_tasks}])"
	lappend result $subtest

	return $result
}
# Evaluate the disk read/write accounting of the job step.
#
# Arguments:
#	prog	Accounting command to run (the script passes $sstat or $sacct)
#
# Returns:
#	A list of subtest dicts, each with the keys cond, desc and diag.
#	If the command output cannot be parsed, only the output-format
#	subtest is returned.
proc _get_file_size {prog} {
	global number float job_id step_id file_size

	set prog_base [file rootname [file tail $prog]]
	set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format MaxDiskWrite,AveDiskWrite,MaxDiskRead,AveDiskRead,MaxDiskWriteTask,MaxDiskReadTask --noconvert"]

	# Four sizes, each with an optional M/G/T suffix, followed by two task ids
	set parsed [regexp "($float)(\[MGT\]*)\\D($float)(\[MGT\]*)\\D($float)(\[MGT\]*)\\D($float)(\[MGT\]*)\\D($number)\\D($number)" $output \
		- \
		wr_max wr_max_unit \
		wr_ave wr_ave_unit \
		rd_max rd_max_unit \
		rd_ave rd_ave_unit \
		w_task r_task]

	set checks [list]
	lappend checks [dict create \
		cond $parsed \
		desc "$prog_base should provide the right output format" \
		diag "$output"]
	if {!$parsed} {
		# No values available for the remaining checks
		return $checks
	}

	# Bring all four sizes to a common unit before comparing them
	set wr_max [scale_to_megs $wr_max $wr_max_unit]
	set wr_ave [scale_to_megs $wr_ave $wr_ave_unit]
	set rd_max [scale_to_megs $rd_max $rd_max_unit]
	set rd_ave [scale_to_megs $rd_ave $rd_ave_unit]

	# Task 1 is expected to have written the most
	lappend checks [dict create \
		cond [expr {$w_task == 1}] \
		desc "$prog_base should provide MaxDiskWriteTask equal to 1" \
		diag "$w_task != 1"]

	# Task 2 is expected to have read the most
	lappend checks [dict create \
		cond [expr {$r_task == 2}] \
		desc "$prog_base should provide MaxDiskReadTask equal to 2" \
		diag "$r_task != 2"]

	# Written and read amounts should agree, for the maximum and the average
	lappend checks [dict create \
		cond [tolerance $wr_max $rd_max "0.3"] \
		desc "$prog_base should provide MaxDiskWrite close to MaxDiskRead with 0.3 tolerance" \
		diag "$wr_max is too different from $rd_max"]

	lappend checks [dict create \
		cond [tolerance $wr_ave $rd_ave "0.3"] \
		desc "$prog_base should provide AveDiskWrite close to AveDiskRead with 0.3 tolerance" \
		diag "$wr_ave is too different from $rd_ave"]

	return $checks
}
# Call a check procedure repeatedly until all of its subtests pass or the
# wait_for timeout of 15 is reached, then report the last set of results.
#
# Arguments:
#	test	Name of a procedure that takes prog and returns a list of
#		dicts with the keys cond, desc and diag
#	prog	Argument handed through to the check procedure
proc wait_and_subtest {test prog} {
	set everything_ok false

	wait_for -timeout 15 {$everything_ok} {
		set outcomes [$test $prog]

		# Assume success, then clear the flag for any failing entry
		set everything_ok true
		foreach entry $outcomes {
			if {[dict get $entry cond]} {
				continue
			}
			set everything_ok false
		}
	}

	# Report every subtest of the last attempt, passed or not
	foreach entry $outcomes {
		set cond [dict get $entry cond]
		set desc [dict get $entry desc]
		set diag [dict get $entry diag]
		subtest $cond $desc $diag
	}
}
#
# Build the test program from its C source and make it executable.
# The compilation is deliberately not optimized so that the program's
# memset is not skipped.
#
run_command -fail "$bin_cc -o $file_prog ${file_prog}.c"
run_command -fail "$bin_chmod 700 $file_prog"

# Batch script that launches the program through srun
make_bash_script $file_in "
$srun ./$file_prog $ret_code $sleep_time $mem_size $file_size $file_stem
"

# Create a file to read, filled with consecutive integers in binary form.
# NOTE(review): "binary format n" emits a 32-bit value per iteration, so the
# file appears to end up 4 * file_size bytes long - confirm this is intended.
set fd [open ${file_stem}.read "wb"]
set i 0
while {$i < $file_size} {
	puts -nonewline $fd [binary format n $i]
	incr i
}
close $fd
#
# Run a simple job
# Usage: test12.2.prog <exit_code> <sleep_secs> <mem_kb>
#                      <file_size> <file_stem>
#
set config_prob 0
set timeout [expr {$max_job_delay + $sleep_time}]
set job_id [submit_job -fail "-n$num_tasks --mem-per-cpu=$job_mem_limit --output=$file_out --error=$file_err -t2 $file_in"]

#
# Wait for job to run, then check memory and file size values through sstat
#
wait_for_job -fail $job_id "RUNNING"
foreach check {_get_mem _get_file_size} {
	wait_and_subtest $check $sstat
}

#
# Wait for job to complete, then repeat the same checks through sacct
#
wait_for_job -fail $job_id "DONE"
foreach check {_get_mem _get_file_size} {
	wait_and_subtest $check $sacct
}
#
# Report basic sacct info
#

# Poll sacct until the step is no longer reported as COMPLETING
set output "COMPLETING"
wait_for -fail {![regexp "COMPLETING" $output]} {
	set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format jobid,jobname,state,exitcode --starttime=00:00"]
}

# Expected record: job step id, program name, state and exit code, with any
# single character accepted between the fields
set step_pattern "$job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code"
subtest {[regexp $step_pattern $output]} "sacct should report $step_pattern" "$output"
#
# Report the sacct accounting info: Elapsed
#
# hours/mins/secs default to 0 so the arithmetic below still works when the
# output does not match; the mismatch itself is reported by the format subtest.
set hours 0
set mins 0
set secs 0
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format elapsed --starttime=00:00"]
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $output - hours mins secs msecs]} "sacct should report the elapsed time in the format \[HH:\]MM:SS\[.sss\]" "$output"
# The hours group is optional; regexp stores an empty string when it is absent
if {$hours eq ""} {
	set hours 0
}
# Use scan to avoid TCL octals. Hours are included so that a value such as
# 01:00:35 is not mistaken for 35 seconds.
set value [expr {([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]}]
subtest {[tolerance $sleep_time $value +$max_time_error]} "Elapsed time reported by sacct should be close to $sleep_time" "$value too different from $sleep_time"
#
# Report the sacct accounting info: TotalCPU
#
# hours/mins/secs default to 0 so the arithmetic below still works when the
# output does not match; the mismatch itself is reported by the format subtest.
set hours 0
set mins 0
set secs 0
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format totalcpu --starttime=00:00"]
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $output - hours mins secs msecs]} "sacct should report the TotalCPU time in the format \[HH:\]MM:SS\[.sss\]" "$output"
# The hours group is optional; regexp stores an empty string when it is absent
if {$hours eq ""} {
	set hours 0
}
# Use scan to avoid TCL octals. Hours are included so that a CPU time of an
# hour or more cannot slip under the sleep_time limit.
set value [expr {([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]}]
subtest {$value <= $sleep_time} "TotalCPU reported by sacct should be less than $sleep_time secs" "$value > $sleep_time"
#
# Report the sacct accounting info: MinCPU
#
# hours/mins/secs default to 0 so the arithmetic below still works when the
# output does not match; the mismatch itself is reported by the format subtest.
set hours 0
set mins 0
set secs 0
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format mincpu --starttime=00:00"]
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $output - hours mins secs msecs]} "sacct should report the MinCPU time in the format \[HH:\]MM:SS\[.sss\]" "$output"
# The hours group is optional; regexp stores an empty string when it is absent
if {$hours eq ""} {
	set hours 0
}
# Use scan to avoid TCL octals. Hours are included so that a CPU time of an
# hour or more cannot slip under the sleep_time limit.
set value [expr {([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]}]
subtest {$value <= $sleep_time} "MinCPU reported by sacct should be less than $sleep_time secs" "$value > $sleep_time"
|