File: inc21.21_tests

package info (click to toggle)
slurm-wlm 22.05.8-4%2Bdeb12u3
links: PTS, VCS
area: main
in suites: bookworm
size: 48,492 kB
sloc: ansic: 475,246; exp: 69,020; sh: 8,862; javascript: 6,528; python: 6,444; makefile: 4,185; perl: 4,069; pascal: 131
file content (454 lines) | stat: -rw-r--r-- 13,157 bytes
parent folder | download | duplicates (2)
#!/usr/bin/env expect
############################################################################
# Purpose: Test for accounting records of specific job names with their ID
############################################################################
# Copyright (C) 2015 SchedMD LLC.
# Written by Nathan Yee <nyee32@schedmd.com>
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################


#
# Supplemental function to test21.21 that tests a job with
# resources within the allowed limit in the association
#
proc inc21_21_good { test_type limit } {
	# Run a job whose request is within the association limit and check
	# that it launches.
	#
	# test_type - name of the limit under test (e.g. "maxcpus", "maxnode");
	#             selects the skip checks and the extra srun option
	# limit     - two-element list: srun option prefix and limit value; the
	#             two elements are concatenated to form the srun argument
	#
	# Sets the global is_skip and returns early when the configuration
	# cannot run this test.
	global number bin_id ta srun test_node selectparam nthreads is_skip re_word_str

	set job_id 0
	set add ""

	# Wait for old jobs to clean up
	sleep 2

	log_info "====== Test $test_type ======"

	if {($test_type eq "maxcpus" || $test_type eq "maxcpumins") && [default_part_exclusive] != 0} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}
	set select_type_param [get_select_type_params]
	if { [string first "CR_SOCKET" $select_type_param] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}
	# Node-limit tests request whole nodes; every other test is pinned to
	# $test_node.
	if {$test_type eq "maxnode"} {
		set add "--exclusive"
	} else {
		set add "-w$test_node"
	}

	set matches 0
	spawn $srun -v -t1 $add [lindex $limit 0][lindex $limit 1] \
	    --account=$ta $bin_id
	expect {
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			incr matches
			exp_continue
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}
	# Only wait on a job that srun actually reported. With no job id there
	# is nothing to wait for, and the subtest below reports the failure.
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}

	subtest -fail { $matches == 1 } "Job launches with correct limit"
}

#
# Supplemental function to test21.21 that tests a job with
# resources larger than the allowed limit in the association
#
proc inc21_21_bad { test_type limit } {
	# Run a job that requests one more than the association limit and check
	# that it is not launched.
	#
	# test_type - name of the limit under test; "maxnode" selects
	#             --exclusive, anything else pins the job to $test_node
	# limit     - two-element list: srun option prefix and limit value; the
	#             job is submitted with the value plus one
	global number bin_id ta srun test_node nthreads selectparam re_word_str

	set job_id 0
	# Braced so the limit value is substituted once, by expr itself
	set over_lim [expr {[lindex $limit 1] + 1}]
	set add ""

	log_info "====== Test $test_type ======"

	if {$test_type eq "maxnode"} {
		set add "--exclusive"
	} else {
		set add "-w$test_node"
	}

	set matches 0
	# srun is run with -I here
	# NOTE(review): presumably so an over-limit job is rejected at once
	# instead of pending - confirm
	spawn $srun -v $add -t1 [lindex $limit 0]$over_lim --account=$ta \
	    -I $bin_id
	expect {
		-re "Job violates accounting/QOS policy" {
			log_info "This error is expected, not a problem"
			exp_continue
		}
		-re "launching StepId=($number)\\.$re_word_str" {
			set job_id $expect_out(1,string)
			fail "Job ($job_id) should not have run"
		}
		timeout {
			fail "srun not responding"
		}
		eof {
			wait
		}
	}
	# If a job slipped through, wait for it to finish before continuing
	if {$job_id != 0} {
		wait_for_job -fail $job_id "DONE"
	}
}

proc inc21_21_grp_test { test_type limit } {
	# Check a group (grp*) association limit: submit as many one-unit jobs
	# as the limit allows, wait for them to run, then submit one more and
	# check that exactly that one stays pending.
	#
	# test_type - name of the limit under test; "grpcpus", "grpcpumins" and
	#             "grpcpurunmins" are handled as CPU limits, anything else
	#             uses exclusive node allocations
	# limit     - two-element list: sbatch option prefix and limit value
	#
	# Sets the global is_skip and returns early when the configuration
	# cannot run this test. Submitted job ids are appended to the global
	# job_list.
	global number bin_id ta srun sbatch test_node selectparam nthreads is_skip
	global file_in squeue scancel bin_bash bin_chmod job_list

	set val 0
	set exclusive ""

	log_info "===== Test $test_type ====="

	if  { $test_type eq "grpcpumins" &&
	      ![param_contains [get_config_param "AccountingStorageEnforce"] "safe"] } {
		log_warn "This test can't be run without AccountingStorageEnforce having \"safe\" in it"
		set is_skip 1
		return
	}

	if { [default_part_exclusive] != 0} {
		log_warn "This test can't be run Exclusive node allocations"
		set is_skip 1
		return
	}

	set select_type_param [get_select_type_params]
	if { [string first "CR_SOCKET" $select_type_param] != -1} {
		log_warn "This test can't be run SelectTypeParameters=CR_SOCKET"
		set is_skip 1
		return
	}

	# Check and see if it is a CPU test
	if {$test_type eq "grpcpus" || $test_type eq "grpcpumins" || $test_type eq "grpcpurunmins"} {
		if {$selectparam} {
			# Scale the number of jobs down by the thread count
			# (braced so the operands are substituted once, by expr)
			set val [expr {[lindex $limit 1] / $nthreads}]
		} else {
			set val [lindex $limit 1]
		}
	} else {
		set exclusive "#SBATCH --exclusive"
		set val [lindex $limit 1]
	}

	make_bash_script $file_in "
$exclusive
sleep 10"

	#
	# First we will submit n jobs that should be below the association limit
	# and should run. We wait for these to start before submitting the
	# over-limit job. If we were to submit them all at once, periodically the
	# earlier submitted jobs can take longer to start than later submitted jobs
	# such as when an epilog is still in progress on the assigned nodes.
	#
	for {set inx 0} {$inx < $val} {incr inx} {
		set job_id($inx) [submit_job -fail "-t1 [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
		lappend job_list $job_id($inx)
	}

	# Wait for the expected jobs to start running
	for {set inx 0} {$inx < $val} {incr inx} {
		wait_for -fail -timeout 30 -pollinterval .2 {$state eq "RUNNING"} {
			set state [get_job_param $job_id($inx) "JobState"]
		}
	}

	#
	# Submit an additional job. This job should pend since it will be past the
	# association limit. This job gets a longer time limit to avoid having it
	# prematurely start after _decay_thread() runs and decays the values of the
	# other running jobs.
	#
	# $inx equals $val here, so the extra job is stored just past the others
	set job_id($inx) [submit_job -fail "-t$val [lindex $limit 0]1 --account=$ta --output=/dev/null --error=/dev/null $file_in"]
	lappend job_list $job_id($inx)

	# Count the account's jobs by state: any pending job, and running jobs
	# whose reason is None
	set pending 0
	set running 0
	spawn $squeue -A $ta -h -o "\%t \%r"
	expect {
		-re "PD." {
			incr pending
			exp_continue
		}
		-re "R.None" {
			incr running
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	subtest -fail { $pending == 1 && $running == $val } "$test_type limit" "Found $pending jobs pending and $running jobs running while expecting 1 and $val"

	#
	# Cancel test jobs
	#
	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}
}

#
# Supplemental function to test21.21 that tests the max/grp
# submit and job limits
#
proc inc21_21_submit_test { limit } {
	# Check the paired job/submit limits of an association, first with
	# plain batch jobs and then with single-task job arrays.
	#
	# limit - "maxjobsub" tests maxjob/maxsubmit; any other value tests
	#         grpjob/grpsubmit. It is also the key into acct_mod_assoc_vals,
	#         whose value is read as a two-element list: job limit, then
	#         submit limit.
	#
	# Sets the global is_skip and returns early for "grpjobsub" on a
	# partition with exclusive node allocations. Submitted job ids are
	# appended to the global job_list. On exit both limits are reset to
	# "-1" in acct_mod_assoc_test_vals.
	global file_in srun sbatch squeue scancel bin_id number bin_sleep is_skip
	global bin_rm ta maxjob_lim maxsub_lim
	global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals
	global acct_mod_assoc_test_vals job_list

	set limit_job ""
	set limit_sub ""

	if {$limit eq "grpjobsub" && [default_part_exclusive] != 0} {
		log_warn "Unable to perform test with exclusive node allocations"
		set is_skip 1
		return
	}

	if {$limit eq "maxjobsub"} {
		set limit_job "maxjob"
		set limit_sub "maxsubmit"

	} else {
		set limit_job "grpjob"
		set limit_sub "grpsubmit"
	}

	set acct_mod_assoc_test_vals($limit_job) \
	    [lindex $acct_mod_assoc_vals($limit) 0]
	set acct_mod_assoc_test_vals($limit_sub) \
	    [lindex $acct_mod_assoc_vals($limit) 1]
	if {[mod_acct $ta [array get acct_mod_desc] \
				 [array get acct_mod_assoc_test_vals] \
				 [array get acct_mod_acct_vals]]} {
		fail "Unable to modify account ($ta)"
	}

	make_bash_script $file_in "
	$bin_sleep 10
	"

	# Test to make sure that the grpsubmit and maxsubmit
	# are enforced with jobs
	log_info "==== Test $limit ===="

	# Submit jobs to test the limit set in the association
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		set job_id($inx) [submit_job -fail "-N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if { !$job_id($inx) } {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $job_id($inx)
		# We need to sleep because of the way the scheduler works.
		# NOTE(review): presumably without the pause the jobs could start
		# out of submission order; the squeue checks below expect
		# job_id(0)/job_id(1) running and job_id(2)/job_id(3) pending.
		sleep 1
	}

	# Wait for the allowed jobs to start running
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_job)} {incr inx} {
		wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
	}

	# Then submit one more over the limit and it should fail
	set result [run_command "$sbatch -N1 -n1 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	# A non-zero exit code makes the subtest condition true
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

	# The patterns hard-code job indices 0-3 and 4 matches are required
	# below, i.e. a job limit of 2 and a submit limit of 4 are assumed
	# - TODO confirm against the values set by the caller
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($job_id(2)|$job_id(3)).PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(2)|$job_id(3)).PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(0)|$job_id(1)).R.None" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif { $matches != 4 } {
		fail "Jobs are not in the expected state (found $matches matching jobs, expected 4)"
	}

	# Test to make sure that the grpsubmit and maxsubmit
	# are enforced with job arrays

	log_info "==== Test $limit with job arrays ===="

	# Submit jobs to test the limit set in the association
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_sub)} {incr inx} {
		set job_id($inx) [submit_job -fail "-N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
		if { !$job_id($inx) } {
			fail "sbatch didn't return jobid"
		}
		lappend job_list $job_id($inx)

		# We need to sleep because of the way the scheduler works.
		# NOTE(review): presumably without the pause the arrays could
		# start out of submission order; the squeue checks below depend
		# on that order.
		sleep 1
	}

	# Wait for the allowed job arrays to start running
	for {set inx 0} {$inx < $acct_mod_assoc_test_vals($limit_job)} {incr inx} {
		wait_for_job -fail -timeout 10 -pollinterval .1 $job_id($inx) "RUNNING"
	}

	# Then submit one more over the limit and it should fail
	set result [run_command "$sbatch -N1 -a0 --account=$ta --output=/dev/null --error=/dev/null -t5 $file_in"]
	subtest [dict get $result exit_code] "Job submitted in excess of $limit limit should fail"
	subtest {[regexp {Job violates accounting/QOS policy} [dict get $result output]]} "Job submitted in excess of $limit limit should display policy violation error"

	# Pending array tasks are matched as "<id>_[0]", running ones as "<id>_0"
	set matches 0
	spawn $squeue -A$ta -h -o "\%i \%t \%r"
	expect {
		-re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocMaxJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(2)|$job_id(3))_\\\[0\\\].PD.AssocGrpJobsLimit" {
			incr matches
			exp_continue
		}
		-re "($job_id(0)|$job_id(1))_0.R.None" {
			incr matches
			exp_continue
		}
		timeout {
			fail "squeue not responding"
		}
		eof {
			wait
		}
	}

	spawn $scancel --quiet --account=$ta
	expect {
		eof {
			wait
		}
	}

	if {$limit eq "maxjobsub" && $matches > 0 && $matches < 4 && [default_part_exclusive] != 0} {
		log_warn "Only started $matches of 4 possible jobs"
	} elseif { $matches != 4 } {
		fail "Jobs are not in the expected state (found $matches matching jobs, expected 4)"
	}

	# Clear the limits
	set acct_mod_assoc_test_vals($limit_job) "-1"
	set acct_mod_assoc_test_vals($limit_sub) "-1"
}

#
# Function that tests an association's grpwall limit
#
proc inc21_21_grpwall { test_type limit } {
	# Run enough jobs to use 110% of the grpwall limit between them, then
	# check that one more job pends with reason AssocGrpWallLimit.
	#
	# test_type - name of the limit under test; only used in the log banner
	# limit     - list whose second element is the grpwall limit; the log
	#             message below reports it in minutes
	#
	# Submitted job ids are appended to the global job_list.
	global bin_sleep ta test_qos job_list

	set local_job_list  [list]
	# An integer count is enough: the "* 1.1" factor below already makes
	# the division a floating-point one.
	set jobs            5
	set grpwall_num     [lindex $limit 1]
	# Share of the limit (plus 10%) each job has to consume
	set grpwall_per_job [expr {$grpwall_num * 1.1 / $jobs}]
	# Per-job sleep, rounded up after multiplying by 60
	set sleep_time      [expr {int(ceil($grpwall_per_job * 60))}]
	# Per-job time limit passed to -t, rounded up to a whole number
	set job_time        [expr {int(ceil($grpwall_per_job))}]
	# NOTE(review): no expect command in this proc reads the local
	# timeout; presumably vestigial - confirm before removing
	set timeout         120

	log_info "====== Test $test_type ======"

	# Wait for old jobs to clean up
	sleep 2

	# Since wall is a decayed variable lets reset it to make sure the test
	# gets exactly what we would expect.
	reset_qos_usage "" $test_qos

	log_debug "Running $jobs jobs of $sleep_time seconds of duration to ensure that we reach the Grpwall limit of $grpwall_num minutes"
	for {set i 0} {$i < $jobs} {incr i} {
		set job_id [submit_job -fail "--account=$ta -N1 -t$job_time --wrap '$bin_sleep $sleep_time' -o /dev/null -e /dev/null"]
		lappend local_job_list $job_id
		lappend job_list $job_id
	}

	foreach job_id $local_job_list {
		if {[wait_job_reason $job_id COMPLETED] != $::RETURN_SUCCESS} {
			fail "Job ($job_id) did not complete"
		}
	}

	log_debug "Submitting the final job and check that it is set Pending with Reason AssocGrpWallLimit"
	set job_id [submit_job -fail "--account=$ta -N1 -t$job_time --wrap '$bin_sleep $sleep_time' -o /dev/null -e /dev/null"]
	lappend local_job_list $job_id
	lappend job_list $job_id
	# Subtest of the limit
	if {[wait_job_reason $job_id PENDING AssocGrpWallLimit] != $::RETURN_SUCCESS} {
		# Cancel the jobs before reporting the failure
		cancel_job $local_job_list
		fail "Job should not have run"
	}

	# Cancel jobs
	cancel_job $local_job_list
}