File: inc21.21_tests

package info (click to toggle)
slurm-wlm 22.05.8-4%2Bdeb12u3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 48,492 kB
  • sloc: ansic: 475,246; exp: 69,020; sh: 8,862; javascript: 6,528; python: 6,444; makefile: 4,185; perl: 4,069; pascal: 131
file content (454 lines) | stat: -rw-r--r-- 13,157 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
#!/usr/bin/env expect
############################################################################
# Purpose: Test for accounting records of specific job names with their ID
############################################################################
# Copyright (C) 2015 SchedMD LLC.
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################


#
# Supplemental function to test21.21 that tests a job with
# resources within the allowed limit in the association.
#
# Arguments:
#   test_type - name of the limit under test (the code special-cases
#               "maxcpus", "maxcpumins" and "maxnode")
#   limit     - two-element list; element 0 is the srun option prefix and
#               element 1 is the limit value. Both are concatenated into a
#               single srun argument.
#
# Sets the global is_skip and returns early when the default partition is
# exclusive (CPU-based tests only) or when SelectTypeParameters contains
# CR_SOCKET.
#
proc inc21_21_good { test_type limit } {
	global number bin_id ta srun test_node is_skip re_word_str

	set job_id 0
	set add ""

	# Wait for old jobs to clean up
	sleep 2

	log_info "====== Test $test_type ======"

	if {($test_type eq "maxcpus" || $test_type eq "maxcpumins") && [default_part_exclusive] != 0} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}
	set select_type_param [get_select_type_params]
	if { [string first "CR_SOCKET" $select_type_param] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}

	# The node-count test asks for whole nodes; every other test is pinned
	# to the designated test node.
	if {$test_type eq "maxnode"} {
		set add "--exclusive"
	} else {
		set add "-w$test_node"
	}

	# Run the job at exactly the limit and count step launch messages
	set matches 0
	spawn $srun -v -t1 $add [lindex $limit 0][lindex $limit 1] \
	    --account=$ta $bin_id
	expect {
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			incr matches
			exp_continue
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	# Only wait on a job that was actually launched. A job_id of 0 means
	# no launch message was seen, which the subtest below reports.
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}

	subtest -fail { $matches == 1 } "Job launches with correct limit"
}

#
# Supplemental function to test21.21 that tests a job requesting
# resources larger than the allowed limit in the association.
# The job is submitted with one more than the limit value and must be
# rejected; seeing a step launch is a test failure.
#
# Arguments:
#   test_type - name of the limit under test ("maxnode" is special-cased)
#   limit     - two-element list: srun option prefix and limit value
#
proc inc21_21_bad { test_type limit } {
	global number bin_id ta srun test_node nthreads selectparam re_word_str

	set job_id 0
	# One unit beyond what the association permits
	set over_lim [expr {[lindex $limit 1] + 1}]
	set opt_prefix [lindex $limit 0]

	log_info "====== Test $test_type ======"

	# Everything except the node-count test is pinned to the test node
	if {$test_type ne "maxnode"} {
		set add "-w$test_node"
	} else {
		set add "--exclusive"
	}

	set matches 0
	spawn $srun -v $add -t1 ${opt_prefix}$over_lim --account=$ta -I $bin_id
	expect {
		-re "Job violates accounting/QOS policy" {
			log_info "This error is expected, not a problem"
			exp_continue
		}
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			fail "Job ($job_id) should not have run"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}

	# If a job slipped through, let it finish before returning
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}
}

#
# Supplemental function to test21.21 that tests a group limit of the
# association: it submits as many jobs as the limit allows, waits for them
# to run, submits one more and checks that exactly that one is pending.
#
# Arguments:
#   test_type - name of the limit under test; "grpcpus", "grpcpumins" and
#               "grpcpurunmins" are treated as CPU tests
#   limit     - two-element list: sbatch option prefix and limit value
#
# Sets the global is_skip and returns early when the configuration does
# not allow the test to run. Every submitted job id is appended to the
# global job_list.
#
proc inc21_21_grp_test { test_type limit } {
	global number bin_id ta srun sbatch test_node selectparam nthreads is_skip
	global file_in squeue scancel bin_bash bin_chmod job_list

	set val 0
	set exclusive ""

	log_info "===== Test $test_type ====="

	# grpcpumins is only attempted when AccountingStorageEnforce has "safe"
	if  { $test_type eq "grpcpumins" &&
	      ![param_contains [get_config_param "AccountingStorageEnforce"] "safe"] } {
		log_warn "This test can't be run without AccountingStorageEnforce having \"safe\" in it"
		set is_skip 1
		return
	}

	if { [default_part_exclusive] != 0} {
		log_warn "This test can't be run Exclusive node allocations"
		set is_skip 1
		return
	}

	set select_type_param [get_select_type_params]
	if { [string first "CR_SOCKET" $select_type_param] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}

	# Check and see if it is a CPU test
	# val becomes the number of jobs expected to run concurrently.
	# NOTE(review): selectparam/nthreads are set by the caller; presumably
	# they scale a CPU limit down to whole cores - confirm in test21.21.
	if {$test_type eq "grpcpus" || $test_type eq "grpcpumins" || $test_type eq "grpcpurunmins"} {
		if {$selectparam} {
			set val [expr [lindex $limit 1] / $nthreads]
		} else {
			set val [lindex $limit 1]
		}
	} else {
		# Non-CPU tests run each job with an exclusive allocation
		set exclusive "#SBATCH --exclusive"
		set val [lindex $limit 1]
	}

	make_bash_script $file_in "
$exclusive
sleep 10"

	#
	# First we will submit n jobs that should be below the association limit
	# and should run. We wait for these to start before submitting the
	# over-limit job. If we were to submit them all at once, periodically the
	# earlier submitted jobs can take longer to start than later submitted jobs
	# such as when an epilog is still in progress on the assigned nodes.
	#
	for {set inx 0} {$inx < $val} {incr inx} {
		set job_id($inx) [submit_job -fail "-t1 [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
		lappend job_list $job_id($inx)
	}

	# Wait for the expected jobs to start running
	for {set inx 0} {$inx < $val} {incr inx} {
		wait_for -fail -timeout 30 -pollinterval .2 {$state eq "RUNNING"} {
			set state [get_job_param $job_id($inx) "JobState"]
		}
	}

	#
	# Submit an additional job. This job should pend since it will be past the
	# association limit. This job gets a longer time limit to avoid having it
	# prematurely start after _decay_thread() runs and decays the values of the
	# other running jobs.
	#
	# inx equals val here (left over from the loop above), so the extra job
	# is stored in the next free array slot.
	set job_id($inx) [submit_job -fail "-t$val [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
	lappend job_list $job_id($inx)

	# Count the account's jobs by state as reported by squeue: "PD" lines
	# are pending, "R" lines with reason "None" are running.
	set pending 0
	set running 0
	spawn $squeue -A $ta -h -o "\%t \%r"
	expect {
		-re "PD." {
			incr pending
			exp_continue
		}
		-re "R.None" {
			incr running
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	subtest -fail { $pending == 1 && $running == $val } "$test_type limit" "Found $pending jobs pending and $running jobs running while expecting 1 and $val"

	#
	# Cancel test jobs
	#
	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}
}

#
# Supplemental function to test21.21 that tests the max/grp
# submit and jobs limits, first with regular jobs and then with
# single-task job arrays.
#
# Arguments:
#   limit - "maxjobsub" selects the maxjob/maxsubmit limits; any other
#           value selects grpjob/grpsubmit. The limit values are read from
#           acct_mod_assoc_vals($limit): element 0 is the jobs limit and
#           element 1 is the submit limit.
#
# The squeue checks below reference job_id(0) through job_id(3) directly,
# so they assume a jobs limit of 2 and a submit limit of 4.
#
# Sets the global is_skip and returns early for "grpjobsub" when the
# default partition is exclusive. On return the two limits are reset to
# "-1" in acct_mod_assoc_test_vals (the account itself is not modified
# again here).
#
proc inc21_21_submit_test { limit } {
	global file_in srun sbatch squeue scancel bin_id number bin_sleep is_skip
	global bin_rm ta maxjob_lim maxsub_lim
	global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals
	global acct_mod_assoc_test_vals job_list

	set limit_job ""
	set limit_sub ""

	if {$limit eq "grpjobsub" && [default_part_exclusive] != 0} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}

	if {$limit eq "maxjobsub"} {
		set limit_job "maxjob"
		set limit_sub "maxsubmit"

	} else {
		set limit_job "grpjob"
		set limit_sub "grpsubmit"
	}

	# Apply the jobs/submit limits under test to the account
	set acct_mod_assoc_test_vals($limit_job) \
	    [lindex $acct_mod_assoc_vals($limit) 0]
	set acct_mod_assoc_test_vals($limit_sub) \
	    [lindex $acct_mod_assoc_vals($limit) 1]
	if {[mod_acct $ta [array get acct_mod_desc] \
				 [array get acct_mod_assoc_test_vals] \
				 [array get acct_mod_acct_vals]]} {
		fail "Unable to modify account ($ta)"
	}

	make_bash_script $file_in "
	$bin_sleep 10
	"

	# Test to make sure that the grpsubmit and maxsubmit
	# are enforced with jobs
	log_info "==== Test $limit ===="

	# Submit jobs to test the limit set in the association
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		set job_id($inx) [submit_job -fail "-N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if { !$job_id($inx) } {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $job_id($inx)
		# Pause between submissions because of the way the scheduler
		# works (presumably so the jobs are started in submission
		# order, which the squeue check below relies on).
		sleep 1
	}

	# Wait for the allowed jobs to start running
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_job)} {incr inx} {
		wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
	}

	# Then submit one more over the limit and it should fail
	set result [run_command "$sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

	# Expect 4 matching lines: the first two jobs running and the last
	# two pending on the jobs limit.
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($job_id(2)|$job_id(3)).PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(2)|$job_id(3)).PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(0)|$job_id(1)).R.None" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif { $matches != 4 } {
		fail "Jobs are not in the expected state (found $matches matching jobs, expected 4)"
	}

	# Test to make sure that the grpsubmit and maxsubmit
	# are enforced with job arrays

	log_info "==== Test $limit with job arrays ===="

	# Submit jobs to test the limit set in the association
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		set job_id($inx) [submit_job -fail "-N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if { !$job_id($inx) } {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $job_id($inx)

		# Pause between submissions because of the way the scheduler
		# works (presumably so the arrays are started in submission
		# order, which the squeue check below relies on).
		sleep 1
	}

	# Wait for the allowed job arrays to start running
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_job)} {incr inx} {
		wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
	}

	# Then submit one more over the limit and it should fail
	set result [run_command "$sbatch -N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

	# Same check as above; pending array jobs are listed as "<id>_[0]"
	# and running ones as "<id>_0".
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(0)|$job_id(1))_0.R.None" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif { $matches != 4 } {
		fail "Jobs are not in the expected state (found $matches matching jobs, expected 4)"
	}

	# Clear the limits
	set acct_mod_assoc_test_vals($limit_job) "-1"
	set acct_mod_assoc_test_vals($limit_sub) "-1"
}

#
# Function that tests an association's grpwall limit.
#
# Five jobs are run to completion, sized so that together they use 1.1
# times the GrpWall value; a sixth job must then be held pending with
# reason AssocGrpWallLimit.
#
# Arguments:
#   test_type - label printed in the test banner
#   limit     - list whose element 1 is the GrpWall limit in minutes
#
proc inc21_21_grpwall { test_type limit } {
	global bin_sleep ta test_qos job_list

	set local_job_list  {}
	set jobs            5.0
	set grpwall_num     [lindex $limit 1]
	# Per-job share of the wall time, with 10% headroom over the limit
	set grpwall_per_job [expr {$grpwall_num * 1.1 / $jobs}]
	set sleep_time      [expr {int(ceil($grpwall_per_job * 60))}]
	set job_time        [expr {int(ceil($grpwall_per_job))}]
	set timeout         120

	# Every job in this test is submitted with the same sbatch arguments
	set submit_args "--account=$ta -N1 -t$job_time --wrap '$bin_sleep $sleep_time' -o /dev/null -e /dev/null"

	log_info "====== Test $test_type ======"

	# Wait for old jobs to clean up
	sleep 2

	# Wall usage decays over time, so start from a clean slate to get
	# exactly the usage this test generates.
	reset_qos_usage "" $test_qos

	log_debug "Running $jobs jobs of $sleep_time seconds of duration to ensure that we reach the Grpwall limit of $grpwall_num minutes"
	set i 0
	while {$i < $jobs} {
		set job_id [submit_job -fail $submit_args]
		lappend local_job_list $job_id
		lappend job_list $job_id
		incr i
	}

	# All of the filler jobs must finish before the limit is checked
	foreach job_id $local_job_list {
		if {[wait_job_reason $job_id COMPLETED] != $::RETURN_SUCCESS} {
			fail "Job ($job_id) did not complete"
		}
	}

	log_debug "Submitting the final job and check that it is set Pending with Reason AssocGrpWallLimit"
	set job_id [submit_job -fail $submit_args]
	lappend local_job_list $job_id
	lappend job_list $job_id

	# Subtest of the limit
	if {[wait_job_reason $job_id PENDING AssocGrpWallLimit] != $::RETURN_SUCCESS} {
		cancel_job $local_job_list
		fail "Job should not have run"
	}

	# Cancel jobs
	cancel_job $local_job_list
}