File: test40.5

package info (click to toggle)
slurm-wlm 22.05.8-4+deb12u3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 48,492 kB
  • sloc: ansic: 475,246; exp: 69,020; sh: 8,862; javascript: 6,528; python: 6,444; makefile: 4,185; perl: 4,069; pascal: 131
file content (221 lines) | stat: -rwxr-xr-x 6,247 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Increase size of job with allocated MPS
############################################################################
# Copyright (C) 2019 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Job ids stay 0 until the matching sbatch reports a submitted job.
set job_id1 0
set job_id2 0

# Generated batch scripts and the two job output files, all under $test_dir.
set file_in1  "${test_dir}/input1"
set file_in2  "${test_dir}/input2"
set file_in3  "${test_dir}/input3"
set file_out1 "${test_dir}/output1"
set file_out2 "${test_dir}/output2"

#
# Prerequisites, evaluated in this order; each one that is not met calls skip
#
if {![param_contains [get_config_param "SchedulerParameters"] "permit_job_expansion"]} {
	skip "This test is only compatible with SchedulerParameters=permit_job_expansion"
}
if {![check_config_select "cons_tres"]} {
	skip "This test is only compatible with select/cons_tres"
}
if {![string equal [get_config_param "FrontendName"] "MISSING"]} {
	skip "This test is incompatible with front-end systems"
}

# 1 when ConstrainDevices is exactly "yes", 0 otherwise. The percentage
# verification later in the test only runs when this is 0.
set constrain_devices [string equal [get_config_param "ConstrainDevices"] "yes"]
if {!$constrain_devices} {
	log_debug "Devices files are NOT constrained by cgroups"
} else {
	log_debug "Devices files are constrained by cgroups"
}

# An empty node list for this request means the test cannot run here.
if {![llength [get_nodes_by_request "--gres=mps:100 -N2 -t2"]]} {
	skip "This test requires being able to submit job with --gres=mps:100 -N2"
}

#
# cleanup - cancel both test jobs and remove leftover job resize scripts.
#
# Each batch script runs "scontrol update JobId=<id> NumNodes=..." and then
# sources slurm_job_<id>_resize.sh. Job 2 loses its batch host before it can
# delete those files, and job 1 only deletes them at the very end of its
# script, so either job may leave them behind if it ends early. Both job ids
# are therefore handled here; a job id of 0 means the job was never submitted
# and is skipped.
#
proc cleanup {} {
	global job_id1 job_id2 bin_rm

	cancel_job [list $job_id1 $job_id2]

	foreach job_id [list $job_id1 $job_id2] {
		if {$job_id > 0} {
			# rm -f does not complain about files that are already gone
			exec $bin_rm -f slurm_job_${job_id}_resize.csh
			exec $bin_rm -f slurm_job_${job_id}_resize.sh
		}
	}
}

#
# Build input scripts
# file_in1: Show the job (including its allocated MPS), run file_in3, wait
#	for the dependent job to leave the queue, expand the allocation
#	and run file_in3 again
# file_in2: Show the job, shrink it to size 0 and exit
# file_in3: Print the hostname and MPS info
#
# In the script texts below, a backslash before a dollar sign or bracket
# keeps Tcl from substituting it, so bash sees it at run time. Unescaped
# variables such as $scontrol and $srun are substituted by Tcl when the
# script file is generated.
#

# Remove any existing job output files before the jobs are submitted
exec $bin_rm -f $file_out1 $file_out2

# Job 1 script. After a fixed 20 second sleep it polls squeue every 5 seconds
# until no job named test_child_$test_id is listed (wc prints all zeros),
# then requests NumNodes=ALL and sources the generated resize script before
# running file_in3 a second time.
make_bash_script $file_in1 "
	$scontrol -dd show job \${SLURM_JOBID}
	$srun $file_in3
	sleep 20	# Wait for job 2 submission
	while true; do
		$squeue -h -n test_child_$test_id | wc | grep -e ' *0 *0 *0'
		if \[ \${?} -eq 0 \]; then
			break
		fi
		sleep 5
	done
	$scontrol update JobId=\${SLURM_JOBID} NumNodes=ALL
	. slurm_job_\${SLURM_JOBID}_resize.sh
	$scontrol -dd show job \${SLURM_JOBID}
	$srun $file_in3
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.csh
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.sh
	exit 0"

# Job 2 script. It sets its own NumNodes to 0; per the note inside the
# script, the job is cancelled at that point, so the rm commands after it are
# not expected to run (the test's cleanup proc removes those files instead).
make_bash_script $file_in2 "
	$scontrol -dd show job \${SLURM_JOBID}
	$scontrol update JobId=\${SLURM_JOBID} NumNodes=0
	. slurm_job_\${SLURM_JOBID}_resize.sh
	# JOB GETS CANCELLED HERE AS BATCH HOST GETS REMOVED FROM JOB ALLOCATION
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.csh
	$bin_rm -f slurm_job_\${SLURM_JOBID}_resize.sh
	exit 0"

# Per-task script run through srun by file_in1.
# NOTE: We pipe stderr from slurmd and discard stdout
# ("2>&1 >/dev/null" points stderr at the pipe first, then sends stdout to
# /dev/null). The greps keep the 'Gres Name=mps' lines whose Index matches
# CUDA_VISIBLE_DEVICES; the echo prints the HOST / CUDA_VISIBLE_DEVICES /
# CUDA_MPS_ACTIVE_THREAD_PERCENTAGE line that the test parses later.
make_bash_script $file_in3 "
$slurmd -N \$SLURMD_NODENAME -G 2>&1 >/dev/null | grep 'Gres Name=mps' | grep 'Index='\$CUDA_VISIBLE_DEVICES
echo 'HOST:'\$SLURMD_NODENAME 'CUDA_VISIBLE_DEVICES:'\$CUDA_VISIBLE_DEVICES 'CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:'\$CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"

#
# First job, the one that gets expanded later: one exclusive node with
# 10 gres/mps, running file_in1 and writing to file_out1
#
spawn $sbatch -N1 --exclusive -J $test_name -t2 --gres=mps:10 \
	--output=$file_out1 $file_in1
expect {
	-re "Submitted batch job ($number)" {
		# Record the id right away, then keep reading until sbatch exits
		set job_id1 $expect_out(1,string)
		exp_continue
	}
	eof {
		wait
	}
	timeout {
		fail "sbatch not responding"
	}
}
# job_id1 is still 0 when sbatch never printed a submission line
if {$job_id1 == 0} {
	fail "Job 1 not submitted"
}
wait_for_job -fail $job_id1 "RUNNING"

#
# Second job, the one that hands its resources to job 1: one exclusive node
# with 10 gres/mps, submitted with --dependency=expand on job 1. Its name
# (test_child_$test_id) is what the job 1 script polls squeue for.
#
spawn $sbatch -N1 --exclusive -J "test_child_$test_id" \
	--dependency=expand:$job_id1 -t1 --gres=mps:10 \
	--output=$file_out2 $file_in2
expect {
	-re "Submitted batch job ($number)" {
		# Record the id right away, then keep reading until sbatch exits
		set job_id2 $expect_out(1,string)
		exp_continue
	}
	eof {
		wait
	}
	timeout {
		fail "sbatch not responding"
	}
}
# job_id2 is still 0 when sbatch never printed a submission line
if {$job_id2 == 0} {
	fail "Job 2 not submitted"
}

# Both jobs have to finish before their output files are examined
wait_for_job -fail $job_id1 "DONE"
wait_for_job -fail $job_id2 "DONE"

#
# Examine the output of job 1: count the lines that report both
# CUDA_VISIBLE_DEVICES and CUDA_MPS_ACTIVE_THREAD_PERCENTAGE, remembering
# the percentage from the last such line for the check that follows.
# Exactly three lines are required (presumably one task before the
# expansion and two after it - confirm against the job 1 script).
#
log_info "Parse job 1 output"
wait_for_file -fail $file_out1

set percentage -1
set cuda_lines 0
spawn $bin_cat $file_out1
expect {
	eof {
		wait
	}
	-re "CUDA_VISIBLE_DEVICES:($number) CUDA_MPS_ACTIVE_THREAD_PERCENTAGE:($number)" {
		set percentage $expect_out(2,string)
		incr cuda_lines
		exp_continue
	}
}
if {$cuda_lines != 3} {
	fail "Bad CUDA information about job 1 ($cuda_lines != 3)"
}

#
# If devices not constrained and Count on all allocated devices is the same
# then confirm the CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value above is correct
#
# The job output is re-read with logging off, and every "Count=<n>" value
# is compared with the first one seen. A separate flag records a mismatch:
# folding "mismatch" into the same -1 sentinel as "nothing seen yet" would
# let the next Count line re-seed the value, and the check would then run
# even though the counts differ.
#
if {$constrain_devices == 0} {
	set count -1
	set counts_uniform 1
	log_user 0
	spawn $bin_cat $file_out1
	expect {
		-re "Count=($number)" {
			if {$count == -1} {
				# First Count value becomes the reference
				set count $expect_out(1,string)
			} elseif {$count != $expect_out(1,string)} {
				set counts_uniform 0
			}
			exp_continue
		}
		eof {
			wait
		}
	}
	log_user 1
	if {$counts_uniform && $count > 0} {
		# Expected percentage for the 10 gres/mps requested per node;
		# the expression is braced so it is parsed once and $count is
		# not re-substituted as expression text.
		set count [expr {10 * 100 / $count}]
		if {$percentage != $count} {
			fail "Bad CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value ($percentage != $count)"
		} else {
			log_debug "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE value is good"
		}
	}
}

#
# Parse the output files from job 2
# Not currently looking for anything, but do log its contents before purge
#
# The file is only passed through cat so that its text is read by expect;
# no pattern is matched, so no match counter is kept.
#
log_info "Parse job 2 output"
wait_for_file -fail $file_out2
spawn $bin_cat $file_out2
expect {
	eof {
		wait
	}
}