1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
|
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test hdf5 acct_gather_profile (--profile=task)
############################################################################
# Copyright (C) 2013 Bull S. A. S.
# Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois.
#
# Written by Rod Schultz <rod.schultz@bull.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# Job state and file names used by the rest of this test
set job_id    0
set file_out  "${test_dir}/output"
set file_prog "${test_name}.prog"
set hdf5_file "${test_name}.h5"

# Skip on systems configured with a front-end node
if {![string equal [get_config_param "FrontendName"] "MISSING"]} {
	skip "This test is incompatible with front-end systems"
}
# Remove the files named by the global variables file_prog and hdf5_file.
# Takes no arguments; files that do not exist are silently ignored by
# "file delete".
proc cleanup {} {
	foreach leftover [list $::file_prog $::hdf5_file] {
		file delete $leftover
	}
}
#
# Check if acct_gather_profile/hdf5 is installed by scanning the output of
# "scontrol show config". The command output is hidden (log_user 0) while
# it is being scanned.
#
set profile 0
log_user 0
spawn $scontrol show config
expect {
	-re "acct_gather_profile/hdf5" {
		set profile 1
		# Keep consuming output so that eof is reached and the
		# spawned process is reaped by "wait" below
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$profile == 0} {
	skip "acct_gather_profile/hdf5 not installed on this system"
}
log_info "Acct_gather_profile/hdf5 plugin installed"
log_debug "Note: this test takes 3 minutes to run"

# Sampling interval; short intervals only produce a warning
set task_freq [get_job_acct_freq]
if {$task_freq < 30} {
	log_warn "jobacct_gather_freq < 30 ($task_freq), results are unreliable"
}
#
# Build a test program to put a known load on the system
#
exec $bin_rm -f $file_prog
exec $bin_cc -I$build_dir $file_prog.c -lm -o $file_prog
exec $bin_chmod 700 $file_prog

#
# Run the program under srun with task profiling enabled and pick the
# job ID out of the srun output
#
set timeout [expr {$max_job_delay + 200}]
spawn $srun --acctg-freq=$task_freq --profile=task -t5 ./$file_prog
expect {
	-re "SLURM_JobId=($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "error: PROFILE: Impossible to create the table" {
		exec $bin_rm -f $file_prog
		skip "This error is caused by a bug in HDF5 1.10.0, and is fixed in 1.10.1. Please upgrade to run this test"
	}
	-re "error:" {
		fail "Something happened on start of job"
	}
	timeout {
		fail "srun not responding"
	}
	eof {
		wait
	}
}

# Without a job ID the sh5util steps that follow would operate on job 0
if {$job_id == 0} {
	fail "Job ID was not found in the srun output"
}
#
# Merge the profile data of the job. The merged file is written to
# $hdf5_file (instead of the sh5util default name) so that cleanup,
# which deletes $hdf5_file, actually removes it.
#
set timeout 10
spawn $sh5util -j $job_id -o $hdf5_file
expect {
	"Output file generated is empty" {
		fail "sh5util merge didn't make anything"
	}
	timeout {
		fail "sh5util merge not responding"
	}
	eof {
		wait
	}
}

#
# Extract the Tasks time series from the merged file into $file_out
#
spawn $sh5util -j $job_id -E -i $hdf5_file -l Node:TimeSeries -s Tasks -o $file_out
expect {
	"Output file generated is empty" {
		fail "sh5util extract didn't make anything"
	}
	timeout {
		fail "sh5util extract not responding"
	}
	eof {
		wait
	}
}
#
# Validate the extracted CSV file.
# The first line is the header and supplies the column positions. The first
# data line only establishes the starting ElapsedTime. Every later line is
# checked for poll interval, CPU utilization and megabytes read; each value
# out of range is counted in nerr.
#
set line ""
set nerr 0
set lno 0
set last_et 0
set et_col -1
set cpu_util_col -1
set read_disk_col -1

# "open" raises an error on failure, so it has to be caught explicitly
if {[catch {open $file_out "r"} fd]} {
	fail "Unable to open output file ($file_out): $fd"
}
while {[gets $fd line] != -1} {
	# Blank lines carry no data and would break the arithmetic below
	if {[string trim $line] eq ""} {
		continue
	}
	incr lno
	set tokens [split $line ","]
	if {$lno == 1} {
		set et_col [lsearch $tokens "ElapsedTime"]
		set cpu_util_col [lsearch $tokens "CPUUtilization"]
		set read_disk_col [lsearch $tokens "ReadMB"]
		if {$et_col == -1} {
			fail "No ElapsedTime column found"
		}
		if {$cpu_util_col == -1} {
			fail "No CPUUtilization column found"
		}
		if {$read_disk_col == -1} {
			fail "No ReadMB column found"
		}
		continue
	}

	# Time elapsed since the previous sample
	set et [lindex $tokens $et_col]
	set cur_et [expr {$et - $last_et}]
	set last_et $et
	if {$lno == 2} {
		continue
	}

	if {$cur_et < $task_freq} {
		log_warn "Poll interval was only $cur_et instead of expected $task_freq on line $lno"
		incr nerr
	}

	set cputil [lindex $tokens $cpu_util_col]
	# The range on cpu utilization is pretty wide.
	# Linux accounting resolution is only to one second, so in a
	# typical 30 second interval an extra second is 3%. The burn loop
	# consumes a bit more than asked for. There is additional time
	# managing the I/O portion. Slurm and Linux also consume some
	# cpu.
	if {$cputil < 38.0 || $cputil > 47.0} {
		log_warn "CPU Busy $cputil not near 40% on line $lno"
		incr nerr
	}

	# Expect 10 MB read per unit of elapsed time, within +/- 2.5%
	set rdmb [lindex $tokens $read_disk_col]
	set low_rd [expr {0.975 * 10 * $cur_et}]
	set hi_rd [expr {1.025 * 10 * $cur_et}]
	if {$rdmb < $low_rd || $rdmb > $hi_rd} {
		log_warn "Read Megabytes $rdmb not within expected range $low_rd to $hi_rd on line $lno"
		incr nerr
	}
}
close $fd

if {$lno == 0} {
	fail "Output file ($file_out) is empty"
}
# A few samples out of range are tolerated
if {$nerr > 3} {
	fail "Too many values out of range ($nerr)"
}
|