1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562
|
#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Test sreport cluster utilization. Also test for
# sreport -M option
############################################################################
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Danny Auble <da@llnl.gov>
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./globals_federation
source ./inc22.1.1
source ./inc22.1.2
source ./inc22.1.3
source ./inc22.1.4
source ./inc22.1.5
source ./inc22.1.fed
# ---------------------------------------------------------------------------
# Names, paths and sizes shared by this test and the inc22.1.* sub-tests.
# Every entity created in the accounting database is prefixed with $test_nu.
# ---------------------------------------------------------------------------
set test_nu "test22-1"
set matches 0
set not_support 0
set access_err 0
set timeout 120
set fedname "test22_1"

# SQL files: one load (in) and one removal (rem) script per cluster
set sql_in_clus1 "${test_dir}/clus1-in.sql"
set sql_rem_clus1 "${test_dir}/clus1-rem.sql"
set sql_in_clus2 "${test_dir}/clus2-in.sql"
set sql_rem_clus2 "${test_dir}/clus2-rem.sql"
set sql_out "${test_name}-out.sql"

# Clusters, accounts, wckey and users
set cluster1 "${test_nu}clus1"
set cluster2 "${test_nu}clus2"
set account1 "${test_nu}acct1"
set account2 "${test_nu}acct2"
set account3 "${test_nu}acct3"
set accounts "$account1,$account2,$account3"
set wckey1 "${test_nu}wckey"
set user1 "${test_nu}user1"
set user2 "${test_nu}user2"
set users "$user1,$user2"

# Two nodes named after cluster1, two CPUs each
set node0 "${cluster1}0"
set node1 "${cluster1}1"
set node_list "${cluster1}\[0-1\]"
set node0_cpus 2
set node1_cpus 2
set cluster_cpus [expr {$node0_cpus + $node1_cpus}]

# Job names written into the job table
set test_job1 "${test_name}_job1"
set test_job2 "${test_name}_job2"
set test_job3 "${test_name}_job3"
set test_job4 "${test_name}_job4"

# Owner recorded for the fake jobs
set uid [get_my_uid]
set gid [get_my_gid]
# Options passed to add_cluster: none
array set clus_req {}

# Options for account1 and account2: created on both clusters
array set acct_req1 [list cluster "$cluster1,$cluster2"]

# Options for account3: created on both clusters as a child of account2
array set acct_req2 [list cluster "$cluster1,$cluster2" parent $account2]

# Options for the users: both clusters, all three accounts, one wckey
array set user_req [list cluster "$cluster1,$cluster2" account $accounts wckey $wckey1]
# Thu Jan 31 00:00:00 2008 - only the 00:00:00 time of day matters: a run
# time in seconds is added to it and formatted with %X to obtain the
# elapsed-time strings below.
set midnight_time "Thu Jan 31 00:00:00 2008"
set midnight [exec date -d $midnight_time +%s]

# Reporting period: Mon Dec 31 23:00:00 2007 through Thu Jan 31 23:59:59 2008
set start_date "Mon Dec 31 23:00:00 2007"
set period_start [exec date -d $start_date +%s]
set end_date "Thu Jan 31 23:59:59 2008"
set period_end [exec date -d $end_date +%s]
set start_str [timestamp -format %Y-%m-%dT%X -seconds $period_start]
set end_str [timestamp -format %Y-%m-%dT%X -seconds $period_end]

# job0 - meant to look like job1, but running right before it
set job0_start $period_start
set job0_run 1200
set job0_end [expr {$job0_start + $job0_run}]

# job1 - starts 1200 seconds into the period and runs for 2700 seconds on node1
set job1_start [expr {$period_start + 1200}]
set job1_run 2700
set job1_end [expr {$job1_start + $job1_run}]
# Elapsed time of job1 as a string
set job1_diff_str [timestamp -format %X -seconds [expr {$midnight + $job1_run}]]
set job1_start_str [timestamp -format %Y-%m-%dT%X -seconds $job1_start]
set job1_end_str [timestamp -format %Y-%m-%dT%X -seconds $job1_end]
set job1_nodes $node1
set job1_cpus $node1_cpus
# job0 is charged together with job1
set job1_alloc [expr {($job1_run + $job0_run) * $job1_cpus}]
set job1_acct $account1

# job2 - eligible one hour into the period, starts 65 minutes after that
# (so there is reserved time to check) and runs for a full day on both nodes
set job2_elig [expr {$period_start + 3600}]
set job2_start [expr {$job2_elig + 3900}]
set job2_run 86400
set job2_end [expr {$job2_start + $job2_run}]
# Elapsed time of job2 as a string
set job2_diff_str "1-00:00:00"
set job2_start_str [timestamp -format %Y-%m-%dT%X -seconds $job2_start]
set job2_end_str [timestamp -format %Y-%m-%dT%X -seconds $job2_end]
set job2_nodes "${cluster1}\[0-1\]"
set job2_cpus [expr {$node0_cpus + $node1_cpus}]
set job2_alloc [expr {$job2_run * $job2_cpus}]
set job2_acct $account3

# job3 - eligible one hour before job2 ends, starts when job2 ends and
# runs for 65 minutes on node0 only
set job3_elig [expr {$job2_end - 3600}]
set job3_start $job2_end
set job3_run 3900
set job3_end [expr {$job3_start + $job3_run}]
# Elapsed time of job3 as a string
set job3_diff_str [timestamp -format %X -seconds [expr {$midnight + $job3_run}]]
set job3_start_str [timestamp -format %Y-%m-%dT%X -seconds $job3_start]
set job3_end_str [timestamp -format %Y-%m-%dT%X -seconds $job3_end]
set job3_nodes $node0
set job3_cpus $node0_cpus
set job3_alloc [expr {$job3_run * $job3_cpus}]
set job3_acct $account2

# job4 - never starts (start, run and end are 0), so all of its time is
# 'planned'. It is targeted at node0 only.
# NOTE(review): an earlier comment said this job becomes eligible "right at
# the end of job3", but the value used is job3's eligible time - confirm
# which one the expected values in the sub-tests rely on.
set job4_elig $job3_elig
set job4_start 0
set job4_run 0
set job4_end 0
set job4_diff_str "0"
set job4_nodes $node0
set job4_cpus $node0_cpus
set job4_alloc [expr {$job4_run * $job4_cpus}]
set job4_acct $account2

# Expected allocated CPU seconds. account3 was created with account2 as its
# parent, so the account2 figure includes the account3 usage.
set acct1_alloc $job1_alloc
set acct3_alloc $job2_alloc
set acct2_alloc [expr {$acct3_alloc + $job3_alloc}]
set total_alloc [expr {$job1_alloc + $job2_alloc + $job3_alloc}]
set wckey1_alloc [expr {$job1_alloc + $job2_alloc + $job3_alloc}]
set user1_wckey1_alloc [expr {$job1_alloc + $job3_alloc}]
set user2_wckey1_alloc $job2_alloc

# node0 is down from 45 to 75 minutes after the start of the period
set node0_down_start [expr {$period_start + 60 * 45}]
set node0_down_end [expr {$period_start + 60 * 75}]
set node0_start_str [timestamp -format %Y-%m-%dT%X -seconds $node0_down_start]
set node0_end_str [timestamp -format %Y-%m-%dT%X -seconds $node0_down_end]
#
# Check the accounting configuration and skip the test when it is unusable.
#
# The test needs accounting stored through slurmdbd.
if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "This test can't be run without a usable AccountingStorageType"
}

# wckey tracking is optional: without it the test only warns and records
# the fact in wc_key_track.
set wc_key_track true
if {[get_config_param -dbd "TrackWCKey"] eq "No"} {
	log_warn "This test can't be totally run without TrackWCKey set in slurmdbd.conf"
	set wc_key_track false
}

# Adding clusters, accounts and users requires accounting administrator rights.
if {[get_admin_level] ne "Administrator"} {
	skip "This test can't be run without being an Accounting administrator.\nUse: sacctmgr mod user \$USER set admin=admin"
}
# cleanup - undo everything this test added to the accounting database.
#
# Loads the two removal SQL files written by create_sql, then removes the
# test users, accounts and both clusters. Return values of the helpers are
# deliberately ignored so that every step is attempted.
proc cleanup {} {
	global sql_rem_clus1 sql_rem_clus2 users accounts
	global cluster1 cluster2

	# Truncate the per-cluster tables filled by create_sql
	archive_load $sql_rem_clus1
	archive_load $sql_rem_clus2

	# Remove the entities in the reverse order of their creation
	remove_user "" "" $users
	remove_acct "" $accounts
	remove_cluster "$cluster1"
	remove_cluster "$cluster2"
}
# Create both test clusters
if {[add_cluster $cluster1 [array get clus_req]]} {
	fail "Unable to add cluster 1"
}
if {[add_cluster $cluster2 [array get clus_req]]} {
	fail "Unable to add cluster 2"
}

# Create the two top-level accounts
if {[add_acct "$account1,$account2" [array get acct_req1]]} {
	fail "Unable to add accounts ($account1,$account2)"
}

# Create the child account (its parent is set in acct_req2)
if {[add_acct $account3 [array get acct_req2]]} {
	fail "Unable to add account ($account3)"
}

# Create the users
if {[add_user $users [array get user_req]]} {
	fail "Unable to add user ($users)"
}
# Look up the association id of every user/account pair; the ids are
# written into the job table by create_sql.
# NOTE(review): the ids are read from $cluster2 only, yet create_sql uses
# them for both clusters (and the wckey ids below are read from $cluster1).
# This presumably relies on both clusters handing out identical ids - confirm.
foreach {user1acct1 user1acct2 user1acct3 user2acct1 user2acct2 user2acct3} {0 0 0 0 0 0} break

eval spawn $sacctmgr -n -p list assoc users=$users account=$accounts cluster=$cluster2 format="User,account,id"
expect {
	-re "There was a problem" {
		fail "There was a problem with the sacctmgr command"
	}
	-re "$user1.$account1.($number)." {
		set user1acct1 $expect_out(1,string)
		exp_continue
	}
	-re "$user1.$account2.($number)." {
		set user1acct2 $expect_out(1,string)
		exp_continue
	}
	-re "$user1.$account3.($number)." {
		set user1acct3 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$account1.($number)." {
		set user2acct1 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$account2.($number)." {
		set user2acct2 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$account3.($number)." {
		set user2acct3 $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sacctmgr list associations not responding"
	}
	eof {
		wait
	}
}

# All six ids must have been found (an id of 0 means it was not reported)
if {!($user1acct1 && $user1acct2 && $user1acct3 && $user2acct1 && $user2acct2 && $user2acct3)} {
	fail "Didn't get one of the user associations ($user1acct1,$user1acct2,$user1acct3,$user2acct1,$user2acct2,$user2acct3)"
}
# Look up the wckey id of each user; the ids are written into the job table
# by create_sql. An id of 0 means sacctmgr did not report it.
set user1wckey1 0
set user2wckey1 0

eval spawn $sacctmgr -n -p list wckeys users=$users wckeys=$wckey1 cluster=$cluster1 format="user,wckey,id"
expect {
	-re "There was a problem" {
		fail "There was a problem with the sacctmgr command"
	}
	-re "$user1.$wckey1.($number)." {
		set user1wckey1 $expect_out(1,string)
		exp_continue
	}
	-re "$user2.$wckey1.($number)." {
		set user2wckey1 $expect_out(1,string)
		exp_continue
	}
	timeout {
		fail "sacctmgr list wckeys not responding"
	}
	eof {
		wait
	}
}

if {!$user1wckey1 || !$user2wckey1} {
	# Remove everything added so far. Both clusters were created above,
	# so both have to be removed here, not just the first one.
	remove_user "" "" $users
	remove_acct "" $accounts
	remove_cluster "$cluster1"
	remove_cluster "$cluster2"
	fail "Didn't get one of the user wckeys ($user1wckey1,$user2wckey1)"
}
# test_fed - run the federated sreport sub-test (inc22_1_fed).
#
# Steps:
#   1. Ask scontrol for the directory holding the active slurm.conf.
#   2. Create federation $fedname containing both test clusters.
#   3. Write a copy of slurm.conf with ClusterName forced to $cluster1 and
#      point SLURM_CONF at that copy.
#   4. Run inc22_1_fed inside a catch so that the federation is always
#      deleted and SLURM_CONF is always put back.
#
# Takes no arguments and returns nothing; problems are reported through fail.
proc test_fed {} {
	global scontrol re_word_str sacctmgr env bin_rm cluster1 cluster2 fedname test_dir

	set config_dir ""
	set new_conf $test_dir/slurm.conf.test22.1

	spawn $scontrol show config
	expect {
		-re "SLURM_CONF *= (/.*)/($re_word_str).*SLURM_VERSION" {
			set config_dir $expect_out(1,string)
			exp_continue
		}
		timeout {
			fail "scontrol is not responding"
		}
		eof {
			wait
		}
	}

	# Stop before touching the database when the directory is unknown;
	# otherwise sed below would be run against /slurm.conf with the
	# federation already created.
	if {$config_dir eq ""} {
		fail "Unable to determine the slurm.conf directory from scontrol show config"
	}

	delete_federations $fedname

	set cmd1 "$sacctmgr -i add federation $fedname"
	log_debug "$cmd1"
	log_debug "[eval exec $cmd1]"

	set cmd2 "$sacctmgr -i mod fed $fedname set clusters=$cluster1,$cluster2"
	log_debug "$cmd2"
	log_debug "[eval exec $cmd2]"

	log_debug "This is the config dir: $config_dir; new config: $new_conf "

	# NOTE(review): the file name reported by scontrol is ignored and
	# slurm.conf is assumed - confirm this holds for all test setups.
	set old_conf "$config_dir/slurm.conf"
	log_debug [exec sed "s/ClusterName\\s*=.*/ClusterName=$cluster1/Ig" $old_conf > $new_conf]

	# Remember the caller's SLURM_CONF (or its absence) so that exactly
	# that state can be restored afterwards.
	set had_slurm_conf [info exists env(SLURM_CONF)]
	if {$had_slurm_conf} {
		set saved_slurm_conf $env(SLURM_CONF)
	}
	set env(SLURM_CONF) $new_conf
	log_debug "ENV VARIABLE: [exec printenv SLURM_CONF] "

	# Execute the federation tests in a catch block in order to
	# run federation-specific cleanup
	set exception_code [catch {
		inc22_1_fed
	} message] ; # Store any error message in $message

	# Cleanup federation
	delete_federations $fedname

	# Put the environment back the way it was found
	if {$had_slurm_conf} {
		set env(SLURM_CONF) $saved_slurm_conf
	} else {
		unset env(SLURM_CONF)
	}

	# Convert any errors into failures (after cleaning up)
	if {$exception_code == 1} { ; # errors only
		fail "Failure testing federated sreport: $message"
	}
}
# create_sql - fill one cluster's accounting tables with known test data.
#
# Arguments:
#   cluster  name of the cluster the records belong to
#   sql_in   path of the SQL file to write and load (events and jobs)
#   sql_rem  path of the SQL file to write for later removal; it truncates
#            the cluster's event, job, step and usage tables
#
# After writing both files the procedure loads sql_in with archive_load,
# checks with sacct that jobs 1-3 are reported, checks with sacctmgr that
# both events are reported, and finally asks sacctmgr to roll up the
# period. Any mismatch is reported through fail.
proc create_sql {cluster sql_in sql_rem} {
	global bin_rm cluster_cpus period_start period_end node_list
	global node0 node0_cpus node0_down_start node0_down_end
	global user1acct1 wckey1 user1wckey1 uid gid debug job1_acct job0_start job0_end
	global job1_cpus job1_nodes job1_start job1_end user2acct3 user2wckey1
	global job2_acct job2_elig job2_start job2_end job2_cpus job2_nodes
	global user1acct2 job3_acct job3_elig job3_start job3_end job3_cpus
	global job3_nodes sacct start_str end_str account1 job1_start_str
	global job1_end_str job1_diff_str account3 job2_start_str
	global job2_end_str job2_diff_str account2 job3_start_str
	global job3_end_str job3_diff_str sacctmgr node0_start_str
	global node0_end_str matches users accounts
	global test_job1 test_job2 test_job3 test_job4
	global job4_elig job4_start job4_run job4_end job4_diff_str
	global job4_nodes job4_alloc job4_acct job4_cpus

	exec $bin_rm -f $sql_in
	set file [open $sql_in "w"]

	# DON'T MESS WITH THIS UNLESS YOU REALLY UNDERSTAND WHAT YOU ARE DOING!!!!!
	# THIS COULD SERIOUSLY MESS UP YOUR DATABASE IF YOU ALTER THIS INCORRECTLY
	# JUST A FRIENDLY REMINDER ;)

	# Register the cluster itself for the whole reporting period
	# (period_start through period_end) with its total CPU count.
	puts $file "insert into cluster_event_table (node_name, cluster, tres, period_start, period_end, reason, cluster_nodes) values"
	puts $file "('', '$cluster', '1=$cluster_cpus', $period_start, $period_end, 'Cluster processor count', '$node_list' )"

	# Put a node down for 30 minutes starting at 45 minutes after the start to make sure our rollups work so we should get 15 minutes on one hour and 15 on the other
	puts $file ", ('$node0', '$cluster', '1=$node0_cpus', $node0_down_start, $node0_down_end, 'down','')"
	#puts $file ", ('$node1', '$cluster', '1=$node1_cpus', $period_start, $period_end, 'down')"
	puts $file "on duplicate key update period_start=VALUES(period_start), period_end=VALUES(period_end);"

	# Now the jobs. Jobs 65536 and 65537 together run for an hour and
	# 5 minutes. Every value tuple after the first must start with a
	# comma, otherwise the multi-row insert is not valid SQL.
	puts $file "insert into job_table (jobid, associd, wckey, wckeyid, uid, gid, `partition`, blockid, cluster, account, eligible, submit, start, end, suspended, name, state, comp_code, priority, req_cpus, tres_alloc, nodelist, kill_requid, qos, deleted) values"
	puts $file "('65536', '$user1acct1', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job1_acct', $job0_start, $job0_start, $job0_start, $job0_end, '0', '$test_job1', '3', '0', '$job1_cpus', $job1_cpus, '1=$job1_cpus', '$job1_nodes', '0', '0', '0')"
	puts $file ", ('65537', '$user1acct1', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job1_acct', $job1_start, $job1_start, $job1_start, $job1_end, '0', '$test_job1', '3', '0', '$job1_cpus', $job1_cpus, '1=$job1_cpus', '$job1_nodes', '0', '0', '0')"
	puts $file ", ('65538', '$user2acct3', '$wckey1', '$user2wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job2_acct', $job2_elig, $job2_elig, $job2_start, $job2_end, '0', '$test_job2', '3', '0', '$job2_cpus', '$job2_cpus', '1=$job2_cpus', '$job2_nodes', '0', '0', '0')"
	puts $file ", ('65539', '$user1acct2', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job3_acct', $job3_elig, $job3_elig, $job3_start, $job3_end, '0', '$test_job3', '3', '0', '$job3_cpus', '$job3_cpus', '1=$job3_cpus', '$job3_nodes', '0', '0', '0')"
	puts $file ", ('65540', '$user1acct2', '$wckey1', '$user1wckey1', '$uid', '$gid', 'debug', '', '$cluster', '$job4_acct', $job4_elig, $job4_elig, $job4_start, $job4_end, '0', '$test_job4', '3', '0', '$job4_cpus', '$job4_cpus', '1=$job4_cpus', '$job4_nodes', '0', '0', '0')"
	puts $file "on duplicate key update id=LAST_INSERT_ID(id), eligible=VALUES(eligible), submit=VALUES(submit), start=VALUES(start), end=VALUES(end), associd=VALUES(associd), tres_alloc=VALUES(tres_alloc), wckey=VALUES(wckey), wckeyid=VALUES(wckeyid);"
	close $file

	# Write the removal script: one truncate statement per cluster table
	exec $bin_rm -f $sql_rem
	set file [open $sql_rem "w"]
	foreach table_suffix {
		event_table
		job_table
		step_table
		usage_day_table
		usage_hour_table
		usage_month_table
		assoc_usage_day_table
		assoc_usage_hour_table
		assoc_usage_month_table
		wckey_usage_day_table
		wckey_usage_hour_table
		wckey_usage_month_table
	} {
		puts $file "truncate table \"${cluster}_${table_suffix}\";"
	}
	close $file

	#
	# Use sacctmgr to load info
	#
	if {[archive_load $sql_in]} {
		fail "Unable to load archive"
	}

	#
	# Use sacct to see if the job loaded
	#
	set matches 0
	eval spawn $sacct -p -M $cluster --format=cluster,account,associd,wckey,wckeyid,start,end,elapsed --noheader --start=$start_str --end=$end_str
	expect {
		-re "There was a problem" {
			fail "There was a problem with the sacct command"
		}
		-re "$cluster.$account1.$user1acct1.$wckey1.$user1wckey1.$job1_start_str.$job1_end_str.$job1_diff_str." {
			log_debug "Got 1"
			incr matches
			exp_continue
		}
		-re "$cluster.$account3.$user2acct3.$wckey1.$user2wckey1.$job2_start_str.$job2_end_str.$job2_diff_str." {
			log_debug "Got 2"
			incr matches
			exp_continue
		}
		-re "$cluster.$account2.$user1acct2.$wckey1.$user1wckey1.$job3_start_str.$job3_end_str.$job3_diff_str." {
			log_debug "Got 3"
			incr matches
			exp_continue
		}
		timeout {
			fail "sacct not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 3} {
		fail "Job wasn't loaded correctly ($matches != 3)"
	}

	#
	# Use sacctmgr to see if node event loaded
	#
	log_debug "$cluster..$start_str.$end_str.$cluster_cpus.$node_list"
	set matches 0
	eval spawn $sacctmgr -p list events cluster=$cluster format=cluster,noden,start,end,cpu --noheader start=$start_str end=$end_str
	expect {
		-re "There was a problem" {
			fail "There was a problem with the sacctmgr command"
		}
		-re "($cluster..$start_str.$end_str.$cluster_cpus.)" {
			log_debug "Got 1"
			incr matches
			exp_continue
		}
		-re "($cluster.$node0.$node0_start_str.$node0_end_str.$node0_cpus.)" {
			log_debug "Got 2"
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr list events not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 2} {
		fail "Cluster env wasn't loaded correctly ($matches != 2)"
	}

	#
	# Use sacctmgr to roll up that time period
	#
	set matches 0
	eval spawn $sacctmgr -i roll $start_str $end_str
	expect {
		-re "There was a problem" {
			fail "There was a problem with the sacctmgr command"
		}
		-re "$cluster" {
			incr matches
			exp_continue
		}
		-re "SUCCESS" {
			incr matches
			exp_continue
		}
		timeout {
			fail "sacctmgr roll not responding"
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		fail "sacctmgr wasn't able to roll data"
	}
}
# Load the same test data into both clusters
create_sql $cluster1 $sql_in_clus1 $sql_rem_clus1
create_sql $cluster2 $sql_in_clus2 $sql_rem_clus2

#
# Execute the sub-tests: the federation test first, then the five
# sreport sub-tests in numerical order.
#
test_fed
foreach test22_1_subtest {inc22_1_1 inc22_1_2 inc22_1_3 inc22_1_4 inc22_1_5} {
	$test22_1_subtest
}
|