File: test12.2

package info (click to toggle)
slurm-wlm 22.05.8-4+deb12u3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 48,492 kB
  • sloc: ansic: 475,246; exp: 69,020; sh: 8,862; javascript: 6,528; python: 6,444; makefile: 4,185; perl: 4,069; pascal: 131
file content (283 lines) | stat: -rwxr-xr-x 10,837 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test sacct functionality and accuracy.
############################################################################
# Copyright (C) 2005 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Names derived from the test name / test directory provided by ./globals
set file_stem   $test_name
set file_prog   "${test_name}.prog"
set file_in     "${test_dir}/input"
set file_out    "${test_dir}/output"
set file_err    "${test_dir}/error"

# Bookkeeping
set job_id      0
set step_id     0
set matches     0

# Job parameters (memory figures are expressed in KiB)
set ret_code          42
set sleep_time        32
set num_tasks         3
set mem_size          102400;   # 100 MiB
set file_size         10485760; # 10 * 1024 * 1024
set max_rss_tolerance 4300;     # 4.2 MiB
set ave_rss_tolerance 10240;    # 10 MiB
set max_time_error    10
# KiB figures divided by 1024, plus 10 of headroom, used for --mem-per-cpu
set job_mem_limit [expr {($mem_size + $max_rss_tolerance) / 1024 + 10}]

# State and exit code sacct is expected to report for the step.
# With KillOnBadExit set to 1 the expectation switches to CANCELLED / 0:15.
if {[get_config_param "KillOnBadExit"] == 1} {
	set expected_state    "CANCELLED"
	set expected_ret_code "0:15"
} else {
	set expected_state    "FAILED"
	set expected_ret_code $ret_code
}

#
# Prerequisites: skip the test on configurations where it cannot be run
#

# Front-end systems are not supported
if {!([get_config_param "FrontendName"] eq "MISSING")} {
	skip "This test is incompatible with front-end systems"
}

# Both job and step records are needed
set accounting_storage_enforce [get_config_param "AccountingStorageEnforce"]
if {[param_contains $accounting_storage_enforce "nosteps"]} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
} elseif {[param_contains $accounting_storage_enforce "nojobs"]} {
	skip "This test can not be run with nosteps or nojobs (AccountingStorageEnforce)"
}

# Only the linux and cgroup gather plugins are accepted
if {!([get_config_param JobAcctGatherType] eq "jobacct_gather/linux" ||
      [get_config_param JobAcctGatherType] eq "jobacct_gather/cgroup")} {
	skip "Job accounting information not gathered on this system"
}

# The gathered data must also be stored somewhere
if {[get_config_param AccountingStorageType] eq "accounting_storage/none"} {
	skip "Job accounting information not stored on this system"
}

# The same task count, memory request and time limit as the real submission
set nodes [get_nodes_by_request "-n$num_tasks --mem-per-cpu=$job_mem_limit -t2"]
if {[llength $nodes] == 0} {
	skip "Unable to test with current node configuration"
}

# Remove the files this test leaves in the working directory: the program
# named by $file_prog plus the "<file_stem>.read" and "<file_stem>.write"
# data files. Files that do not exist are silently ignored by "file delete".
proc cleanup {} {
	foreach leftover [list $::file_prog "${::file_stem}.read" "${::file_stem}.write"] {
		file delete $leftover
	}
}

# Query MaxRSS, MaxRSSTask and AveRSS of step $job_id.$step_id and check them
# against the memory footprint the test program was asked to allocate.
#
# Arguments:
#   prog - accounting command to run (callers in this file pass $sstat or
#          $sacct); it is invoked with --noheader -p --noconvert
#
# Returns:
#   A list of subtest dicts, each with the keys
#     cond - boolean result of the check
#     desc - description of what is being checked
#     diag - diagnostic text to show if the check failed
#   If the output cannot be parsed, the list holds only the failed
#   "output format" subtest.
proc _get_mem {prog} {
	global number mem_size job_id step_id max_rss_tolerance ave_rss_tolerance num_tasks

	set result    [list]
	set prog_base [file rootname [file tail $prog]]

	set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format maxrss,maxrsstask,averss --noconvert"]

	# Three numbers separated by a single non-digit (the -p field separator)
	dict set subtest cond [regexp "($number)\\D($number)\\D($number)" $output - max_rss mem_task ave_rss]
	dict set subtest desc "$prog_base should provide the right output format"
	dict set subtest diag "$output"
	lappend result $subtest
	if {![dict get $subtest cond]} {
		# Nothing was captured, so the remaining checks cannot be evaluated
		return $result
	}

	set max_rss [scale_to_ks $max_rss ""]
	# Since we will be multiplying ave_rss by num_tasks below and expecting it to be greater than
	# max_rss, we must add 1 to ave_rss to compensate for the small potential truncation loss that
	# can occur in the slurm code when calculating AveRSS (effectively int(TotRSS / tasks)).
	set ave_rss [scale_to_ks [expr {$ave_rss + 1}] ""]

	dict set subtest cond [tolerance $mem_size $max_rss $max_rss_tolerance]
	dict set subtest desc "MaxRSS is within expected tolerance for $prog_base"
	dict set subtest diag "Observed MaxRSS ($max_rss) is not within tolerance $max_rss_tolerance of expected value ($mem_size)"
	lappend result $subtest

	# We expect ave_rss to be greater than or equal to max_rss / num_tasks.
	# We are assuming that the tasks that do not allocate the memory will not
	# climb above a total of ave_rss_tolerance (10 MiB).
	dict set subtest cond [tolerance $max_rss [expr {$ave_rss * $num_tasks}] "+$ave_rss_tolerance"]
	dict set subtest desc "AveRSS is within expected tolerance for $prog_base"
	# Report the expected value with the same task count used by the check above
	dict set subtest diag "Observed AveRSS ($ave_rss) is not within tolerance +$ave_rss_tolerance of expected value ([expr {$max_rss / $num_tasks}])"
	lappend result $subtest

	return $result
}

# Check the disk read/write counters reported for step $job_id.$step_id.
#
# Arguments:
#   prog - accounting command to run (callers in this file pass $sstat or
#          $sacct)
#
# Returns:
#   A list of subtest dicts (keys: cond, desc, diag). When the output cannot
#   be parsed, only the failed "output format" subtest is returned.
proc _get_file_size {prog} {
	global number float job_id step_id file_size

	set checks [list]
	set tool   [file rootname [file tail $prog]]

	set fields "MaxDiskWrite,AveDiskWrite,MaxDiskRead,AveDiskRead,MaxDiskWriteTask,MaxDiskReadTask"
	set output [run_command_output -fail "$prog --noheader -p --job=$job_id.$step_id --format $fields --noconvert"]

	# Four "<value><optional M/G/T suffix>" fields followed by two task ids,
	# each separated from the next by exactly one non-digit character.
	set value_re "($float)(\[MGT\]*)"
	set task_re  "($number)"
	set pattern  [join [list $value_re $value_re $value_re $value_re $task_re $task_re] "\\D"]

	set parsed [regexp $pattern $output - \
			max_wr max_wr_unit \
			ave_wr ave_wr_unit \
			max_rd max_rd_unit \
			ave_rd ave_rd_unit \
			w_task r_task]
	lappend checks [dict create \
			cond $parsed \
			desc "$tool should provide the right output format" \
			diag "$output"]
	if {!$parsed} {
		return $checks
	}

	# Bring every figure to the same unit before comparing
	set max_wr [scale_to_megs $max_wr $max_wr_unit]
	set ave_wr [scale_to_megs $ave_wr $ave_wr_unit]
	set max_rd [scale_to_megs $max_rd $max_rd_unit]
	set ave_rd [scale_to_megs $ave_rd $ave_rd_unit]

	# Task 1 is expected to be the top writer, task 2 the top reader
	lappend checks [dict create \
			cond [expr {$w_task == 1}] \
			desc "$tool should provide MaxDiskWriteTask equal to 1" \
			diag "$w_task != 1"]

	lappend checks [dict create \
			cond [expr {$r_task == 2}] \
			desc "$tool should provide MaxDiskReadTask equal to 2" \
			diag "$r_task != 2"]

	# Written and read volumes should agree
	lappend checks [dict create \
			cond [tolerance $max_wr $max_rd "0.3"] \
			desc "$tool should provide MaxDiskWrite close to MaxDiskRead with 0.3 tolerance" \
			diag "$max_wr is too different from $max_rd"]

	lappend checks [dict create \
			cond [tolerance $ave_wr $ave_rd "0.3"] \
			desc "$tool should provide AveDiskWrite close to AveDiskRead with 0.3 tolerance" \
			diag "$ave_wr is too different from $ave_rd"]

	return $checks
}

# Evaluate a check procedure through wait_for (15 second timeout) and then
# report every subtest it produced.
#
# Arguments:
#   test - name of a proc that takes $prog and returns a list of subtest
#          dicts with the keys cond, desc and diag
#   prog - argument handed to $test
#
# The wait_for condition becomes true once every subtest of the latest run
# has a true cond. Whatever the latest run returned is reported via subtest,
# whether or not everything passed.
# NOTE(review): relies on wait_for running the body at least once in this
# proc's scope, otherwise $checks would be unset below - confirm in globals.
proc wait_and_subtest {test prog} {
	set settled false
	wait_for -timeout 15 {$settled} {
		set checks [$test $prog]

		# Assume success, then look for any failing subtest
		set settled true
		foreach check $checks {
			if {[dict get $check cond]} {
				continue
			}
			set settled false
		}
	}
	foreach check $checks {
		subtest [dict get $check cond] [dict get $check desc] [dict get $check diag]
	}
}

#
# Build the helper program from ${file_prog}.c (any previous binary is
# overwritten by the compiler's -o output) and make it executable by the
# owner only.
# Compilation is not optimized so that the program's memset is not skipped.
#
run_command -fail "$bin_cc -o $file_prog ${file_prog}.c"
run_command -fail "$bin_chmod 700 $file_prog"

# Batch script: a single srun step that launches the helper program.
# Program usage:
#   test12.2.prog <exit_code> <sleep_secs> <mem_kb> <file_size> <file_stem>
make_bash_script $file_in "
    $srun ./$file_prog $ret_code $sleep_time $mem_size $file_size $file_stem
"

# Create a file to read
# Each iteration appends one 32-bit integer ("binary format n"), i.e. 4 bytes,
# so the resulting file is 4 * $file_size bytes long.
# NOTE(review): presumably the program only reads the first $file_size bytes -
# confirm against the .c source.
set fd [open ${file_stem}.read "wb"]
for {set i 0} {$i < $file_size} {incr i} {
    puts -nonewline $fd [binary format n $i]
}
close $fd

#
# Run a simple job
# The request (-n, --mem-per-cpu, -t2) matches the one checked with
# get_nodes_by_request in the requirements section.
#
set config_prob 0
# Expect's global timeout: allow for the job start delay plus the run time
set timeout [expr $max_job_delay + $sleep_time]
set job_id [submit_job -fail "-n$num_tasks --mem-per-cpu=$job_mem_limit --output=$file_out --error=$file_err -t2 $file_in"]

#
# Wait for job to run
#
wait_for_job -fail $job_id "RUNNING"

# While the step is running, check the live figures reported by sstat
wait_and_subtest _get_mem       $sstat
wait_and_subtest _get_file_size $sstat

#
# Wait for job to complete
#
wait_for_job -fail $job_id "DONE"

# Once the job is done, the same checks are repeated against sacct
wait_and_subtest _get_mem       $sacct
wait_and_subtest _get_file_size $sacct

#
# Report basic sacct info
# Poll sacct until the step record no longer shows COMPLETING, then check
# the job id, job name, state and exit code of the step.
#
set output "COMPLETING"
wait_for -fail {![regexp "COMPLETING" $output]} {
	set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format jobid,jobname,state,exitcode --starttime=00:00"]
}
subtest {[regexp "$job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code" $output]} "sacct should report $job_id\\.$step_id.$file_prog.$expected_state.$expected_ret_code" "$output"

#
# Report the sacct accounting info: Elapsed
#
# hours/mins/secs default to 0 so the arithmetic below still works if the
# regexp does not match (regexp leaves the variables untouched in that case).
set hours 0
set mins 0
set secs 0
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format elapsed --starttime=00:00"]
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $output - hours mins secs msecs]} "sacct should report the elapsed time in the format \[HH:\]MM:SS\[.sss\]" "$output"
# The hours group is optional: on a match without it, regexp sets it to ""
if {$hours eq ""} {
	set hours 0
}
# Use scan to avoid TCL octals
set value  [expr {([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]}]
subtest {[tolerance $sleep_time $value +$max_time_error]} "Elapsed time reported by sacct should be close to $sleep_time" "$value too different from $sleep_time"

#
# Report the sacct accounting info: TotalCPU
#
set hours 0
set mins 0
set secs 0
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format totalcpu --starttime=00:00"]
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $output - hours mins secs msecs]} "sacct should report the TotalCPU time in the format \[HH:\]MM:SS\[.sss\]" "$output"
# The hours group is optional: on a match without it, regexp sets it to ""
if {$hours eq ""} {
	set hours 0
}
# Use scan to avoid TCL octals
set value  [expr {([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]}]
subtest {$value <= $sleep_time} "TotalCPU reported by sacct should be less than $sleep_time secs" "$value > $sleep_time"

#
# Report the sacct accounting info: MinCPU
#
set hours 0
set mins 0
set secs 0
set output [run_command_output "$sacct --noheader -P --job=$job_id.$step_id --format mincpu --starttime=00:00"]
subtest {[regexp {(?:(\d+):)?(\d+):(\d+)(?:\.(\d+))?} $output - hours mins secs msecs]} "sacct should report the MinCPU time in the format \[HH:\]MM:SS\[.sss\]" "$output"
# The hours group is optional: on a match without it, regexp sets it to ""
if {$hours eq ""} {
	set hours 0
}
# Use scan to avoid TCL octals
set value  [expr {([scan $hours "%d"] * 3600) + ([scan $mins "%d"] * 60) + [scan $secs "%d"]}]
subtest {$value <= $sleep_time} "MinCPU reported by sacct should be less than $sleep_time secs" "$value > $sleep_time"