1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
#!/usr/bin/env expect
#############################################################################
# Purpose: Stress test of per-task output and input files
#
# Note: To avoid one minute NFS propagation delays in the output files,
# we create them ahead of time. Without explicit file creation,
# this test requires about one minute per cycle to execute.
#
# Note: This script generates and then deletes files in the working
# directory named test9.4.input, test9.4.[0-9]+.input, and
# test9.4.[0-9]+.output
############################################################################
# Copyright (C) 2002-2007 The Regents of the University of California.
# Copyright (C) 2008-2010 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette <jette1@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

# File names. The two *_task names are srun filename patterns: "%t" is kept
# literally here and the resulting files are later addressed by task number
# as "$test_name.<task>.input" / "$test_name.<task>.output".
set file_in "${test_dir}/input"
set file_in_task "${test_name}.%t.input"
set file_out_task "${test_name}.%t.output"

# Job shape: name, number of test cycles and number of tasks per job
set job_name ${test_name}
set cycle_count [get_cycle_count]
set task_cnt ${max_stress_tasks}

# TODO: Temporary debug for bug 15302 (remove once fixed)
# Location of the slurm.conf that is modified below and restored in cleanup
set config_dir [get_conf_path]
set config_file "${config_dir}/slurm.conf"
# Test teardown: delete the per-task input/output files created in the
# working directory and put the saved slurm.conf back in place.
proc cleanup {} {
	global test_name task_cnt config_file

	# One input file and one output file exist per task number
	set rank 0
	while {$rank < $task_cnt} {
		file delete "$test_name.$rank.input" "$test_name.$rank.output"
		incr rank
	}

	# TODO: Temporary debug for bug 15302 (remove once fixed)
	restore_conf $config_file
	reconfigure
}
# Only the launch/slurm launch plugin is supported by this test
if {[get_config_param "LaunchType"] eq "launch/slurm"} {
	# Node count range handed to srun -N
	set node_cnt 1-4
} else {
	skip "This test is only compatible with systems using launch/slurm"
}

# Additional options placed on every srun command line of this test
set other_opts "-O"
# Run an srun job that executes $bin_cat with input_file as its standard
# input and output_file as its standard output, then wait for srun to exit.
#
# Arguments:
#   input_file  - value passed to srun -i (the caller passes a "%t" pattern)
#   output_file - value passed to srun -o (the caller passes a "%t" pattern)
#
# The job name, node count, task count and extra options come from the
# globals job_name, node_cnt, task_cnt and other_opts.
#
# The test is failed if srun prints "Unable to contact" or if neither that
# message nor EOF is seen within $timeout seconds.
#
# Returns: no explicit value. The exit status of srun is not inspected here;
# the caller validates the job by checking the per-task output files.
proc run_cat_job { input_file output_file } {
	global bin_cat job_name srun node_cnt other_opts task_cnt timeout

	# TODO: Temporarily add verbosity to troubleshoot bug 15302 (restore once fixed)
	spawn $srun -vvvvv --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat -
	#spawn $srun --job-name=$job_name -e - -i $input_file -o $output_file -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat -
	expect {
		-re "Unable to contact" {
			fail "Slurm appears to be down"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			# Reap the spawned srun process
			wait
		}
	}
}
# TODO: Temporary debug for bug 15302 (remove once fixed)
# Save slurm.conf (cleanup restores it), append SlurmdDebug=debug4 to it
# and reconfigure.
save_conf $config_file
run_command -none "$bin_echo SlurmdDebug=debug4 >> $config_file"
reconfigure -fail

#
# Build a sizable text file out of /etc/hosts followed by /etc/passwd and
# remember how many lines it has; every task output is compared to this.
#
exec $bin_cat /etc/hosts > $file_in
exec $bin_cat /etc/passwd >> $file_in
set stdin_lines [get_line_cnt $file_in]

#
# Generate one input file per task: each task cats the shared input into
# its own file named by the file_in_task pattern.
#
set timeout $max_job_delay
spawn $srun -e /dev/null -i $file_in -o $file_in_task -N$node_cnt -n$task_cnt $other_opts -t1 $bin_cat
expect {
	-re "Unable to contact" {
		fail "Slurm appears to be down"
	}
	eof {
		# Reap the spawned srun process
		wait
	}
	timeout {
		fail "srun not responding"
	}
}
#
# Main stress loop: for each of cycle_count cycles, run one job that copies
# every task's input file to its output file, then verify that each output
# file has as many lines as the original input.
#
set success_cnt 0
set timeout $max_job_delay
for {set inx 0} {$inx < $cycle_count} {incr inx} {
	# Remove output files left over from the previous cycle
	for {set tsk 0} {$tsk < $task_cnt} {incr tsk} {
		exec $bin_rm -f "$test_name.$tsk.output"
	}

	run_cat_job $file_in_task $file_out_task

	# Check the line count of every task's output file
	for {set tsk 0} {$tsk < $task_cnt} {incr tsk} {
		set file_out_glob "$test_name.$tsk.output"
		wait_for_file $file_out_glob

		set stdout_lines [get_line_cnt $file_out_glob]
		if {$stdout_lines != $stdin_lines} {
			# Give the file one more second to reach full size, then recount
			exec $bin_sleep 1
			set stdout_lines [get_line_cnt $file_out_glob]
		}

		if {$stdout_lines == $stdin_lines} {
			incr success_cnt
			continue
		}

		# TODO: Temporary debug for bug 15302 (remove once fixed)
		set hostname [string trimright [run_command_output "hostname -s"]]
		run_command "sudo dmesg -T | grep -i 'killed process'"
		run_command "free -h"
		run_command "top -o %MEM -n 1"

		if {$stdout_lines == 0} {
			fail "stdout is empty. Is current working directory writable from compute nodes?"
		} else {
			fail "stdout is incomplete"
		}
	}
}
|