File: test9.4

package info (click to toggle)
slurm-wlm 22.05.8-4%2Bdeb12u3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 48,492 kB
  • sloc: ansic: 475,246; exp: 69,020; sh: 8,862; javascript: 6,528; python: 6,444; makefile: 4,185; perl: 4,069; pascal: 131
file content (162 lines) | stat: -rwxr-xr-x 5,317 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#!/usr/bin/env expect
#############################################################################
# Purpose: Stress test of per-task output and input files
#
# Note:    To avoid one minute NFS propagation delays in the output files,
#          we create them ahead of time. Without explicit file creation,
#          this test requires about one minute per cycle to execute.
#
# Note:    This script generates and then deletes files in the working
#          directory named test9.4.input, test9.4.[0-9]+.input, and
#          test9.4.[0-9]+.output
############################################################################
# Copyright (C) 2002-2007 The Regents of the University of California.
# Copyright (C) 2008-2010 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

# Common input file used to seed the per-task input files
set file_in "${test_dir}/input"
# File name patterns passed to srun; the expanded per-task names have the
# form <test_name>.<task>.input and <test_name>.<task>.output
set file_in_task "${test_name}.%t.input"
set file_out_task "${test_name}.%t.output"
set job_name $test_name
# Number of copy cycles to run and number of tasks per job
set cycle_count [get_cycle_count]
set task_cnt $max_stress_tasks

# TODO: Temporary debug for bug 15302 (remove once fixed)
set config_dir [get_conf_path]
set config_file "${config_dir}/slurm.conf"


proc cleanup {} {
	global test_name task_cnt config_file

	# Remove the input and output file of every task
	set tsk 0
	while {$tsk < $task_cnt} {
		file delete "$test_name.$tsk.input" "$test_name.$tsk.output"
		incr tsk
	}

	# TODO: Temporary debug for bug 15302 (remove once fixed)
	# Put the saved slurm.conf back and have Slurm re-read it
	restore_conf $config_file
	reconfigure
}


# The test only applies to launch/slurm; otherwise it is skipped
if {[get_config_param "LaunchType"] eq "launch/slurm"} {
	set node_cnt 1-4
} else {
	skip "This test is only compatible with systems using launch/slurm"
}
set other_opts "-O"


# Execute an srun job that runs $bin_cat with task_cnt tasks on node_cnt
# nodes, reading each task's stdin from input_file and writing its stdout
# to output_file, then wait for srun to exit.
#
# input_file:  file name (pattern) given to srun -i
# output_file: file name (pattern) given to srun -o
#
# No status is returned to the caller. The proc calls fail if srun reports
# "Unable to contact" or if nothing happens within the expect timeout.
proc run_cat_job { input_file output_file } {
	global bin_cat job_name srun node_cnt other_opts task_cnt timeout

	# TODO: Temporarily add verbosity to troubleshoot bug 15302 (restore once fixed)
	spawn $srun -vvvvv --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat -
	#spawn $srun --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat -
	# On eof srun has exited, so reap the spawned process with wait
	expect {
		-re "Unable to contact" {
			fail "Slurm appears to be down"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}
}


# TODO: Temporary debug for bug 15302 (remove once fixed)
# Save slurm.conf (cleanup calls restore_conf on the same file), append
# SlurmdDebug=debug4 to it, then reconfigure so the change is picked up.
save_conf $config_file
run_command -none "$bin_echo SlurmdDebug=debug4 >> $config_file"
reconfigure -fail

#
# Build a sizable text file from /etc/hosts followed by /etc/passwd and
# record its line count as the expected size of every task's output
#
exec $bin_cat /etc/hosts /etc/passwd >$file_in
set stdin_lines [get_line_cnt $file_in]

# Make a text file for each task
# srun reads $file_in as stdin and writes the cat output to the per-task
# name pattern $file_in_task; stderr goes to /dev/null.
# The test fails if srun reports "Unable to contact" or if the expect
# timeout ($max_job_delay) expires; on eof the spawned srun is reaped.
set timeout $max_job_delay
spawn $srun -e /dev/null -i $file_in -o $file_in_task -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat
expect {
	-re "Unable to contact" {
		fail "Slurm appears to be down"
	}
	timeout {
		fail "srun not responding"
	}
	eof {
		wait
	}
}

#
# Run cycle_count jobs, each copying every task's input file to its output
# file, and check that each output has as many lines as the input
#
set success_cnt 0
set timeout $max_job_delay
for {set inx 0} {$inx < $cycle_count} {incr inx} {
	# Start each cycle without any output left over from the previous one
	for {set tsk 0} {$tsk < $task_cnt} {incr tsk} {
		exec $bin_rm -f "$test_name.$tsk.output"
	}

	run_cat_job $file_in_task $file_out_task

	# Compare the line count of every task's output with the input
	for {set tsk 0} {$tsk < $task_cnt} {incr tsk} {
		set file_out_glob "$test_name.$tsk.output"
		wait_for_file $file_out_glob
		set stdout_lines [get_line_cnt $file_out_glob]
		if {$stdout_lines != $stdin_lines} {
			# Allow one second for the file to fill in, then recount
			exec $bin_sleep 1
			set stdout_lines [get_line_cnt $file_out_glob]
		}
		if {$stdout_lines == $stdin_lines} {
			incr success_cnt
			continue
		}

		# TODO: Temporary debug for bug 15302 (remove once fixed)
		set hostname [string trimright [run_command_output "hostname -s"]]
		run_command "sudo dmesg -T | grep -i 'killed process'"
		run_command "free -h"
		run_command "top -o %MEM -n 1"

		if {$stdout_lines == 0} {
			fail "stdout is empty. Is current working directory writable from compute nodes?"
		} else {
			fail "stdout is incomplete"
		}
	}
}