File: test.bats

CH_TEST_TAG=$ch_test_tag
load "${CHTEST_DIR}/common.bash"

# Note: If you get output like the following (piping through cat turns off
# BATS terminal magic):
#
#  $ ./bats ../examples/spark/test.bats | cat
#  1..5
#  ok 1 spark/configure
#  ok 2 spark/start
#  [...]/test/bats.src/libexec/bats-exec-test: line 329: /tmp/bats.92406.src: No such file or directory
#  [...]/test/bats.src/libexec/bats-exec-test: line 329: /tmp/bats.92406.src: No such file or directory
#  [...]/test/bats.src/libexec/bats-exec-test: line 329: /tmp/bats.92406.src: No such file or directory
#
# that means that mpirun is starting too many processes per node (you want 1).
# One solution is to export OMPI_MCA_rmaps_base_mapping_policy= (i.e., set but
# empty).
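#
# For example (assuming an Open MPI mpirun that reads MCA parameters from the
# environment), setting it before running the tests would look like:
#
#   $ export OMPI_MCA_rmaps_base_mapping_policy=
#   $ ./bats ../examples/spark/test.bats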

setup () {
    scope standard
    prerequisites_ok spark
    pmix_or_skip
    [[ $CH_TEST_PACK_FMT = *-unpack ]] || skip 'issue #1161'
    umask 0077

    # Unset these Java variables so the container doesn’t use host paths.
    unset JAVA_BINDIR JAVA_HOME JAVA_ROOT

    spark_dir=${TMP_}/spark  # runs before each test, so no mktemp
    spark_config=$spark_dir
    spark_log=/tmp/sparklog
    confbind=${spark_config}:/mnt/0
    if [[ $ch_multinode ]]; then
        # We use hostname to pick the interface for this test, avoiding the
        # complicated logic needed to determine which interface is the HSN.
        # In many environments this likely means the tests run over the
        # slower management interface, which is fine for testing but should
        # be avoided for large-scale runs.
        master_host="$(hostname)"
        # Start Spark workers using pdsh. We would really prefer to do this
        # using srun, but that doesn’t work; see issue #230.
        command -v pdsh >/dev/null 2>&1 || pedantic_fail "pdsh not in path"
        pernode="pdsh -R ssh -w ${SLURM_NODELIST} -- PATH='${PATH}'"
    else
        master_host=localhost
        pernode=
    fi
    master_url=spark://${master_host}:7077
    master_log="${spark_log}/*master.Master*.out"  # expand globs later
}


@test "${ch_tag}/configure" {
    # check for restrictive umask
    run umask -S
    echo "$output"
    [[ $status -eq 0 ]]
    [[ $output = 'u=rwx,g=,o=' ]]
    # create config
    $ch_mpirun_node mkdir -p "$spark_config"
    # We set JAVA_HOME in the Spark environment file, as this appears to be
    # the idiomatic way to ensure Spark finds the Java install.
    tee <<EOF > "${spark_config}/spark-env.sh"
SPARK_LOCAL_DIRS=/tmp/spark
SPARK_LOG_DIR=$spark_log
SPARK_WORKER_DIR=/tmp/spark
SPARK_LOCAL_IP=127.0.0.1
SPARK_MASTER_HOST=${master_host}
JAVA_HOME=/usr/lib/jvm/default-java/
EOF
    my_secret=$(tr -dc '0-9a-f' < /dev/urandom | head -c 48)
    tee <<EOF > "${spark_config}/spark-defaults.conf"
spark.authenticate true
spark.authenticate.secret ${my_secret}
EOF
    if [[ $ch_multinode ]]; then
        sbcast -f "${spark_config}/spark-env.sh" "${spark_config}/spark-env.sh"
        sbcast -f "${spark_config}/spark-defaults.conf" "${spark_config}/spark-defaults.conf"
    fi
}


@test "${ch_tag}/start" {
    # remove old master logs so new one has predictable name
    rm -Rf --one-file-system "$spark_log"
    # start the master
    ch-run -b "$confbind" "$ch_img" -- /opt/spark/sbin/start-master.sh
    sleep 15
    # shellcheck disable=SC2086
    cat $master_log
    # shellcheck disable=SC2086
    grep -Fq 'New state: ALIVE' $master_log
    # start the workers
    # shellcheck disable=SC2086
    $pernode ch-run -b "$confbind" "$ch_img" -- \
                    /opt/spark/sbin/start-worker.sh "$master_url"
    sleep 15
}


@test "${ch_tag}/worker count" {
    # Note that in the log, each worker shows up as 127.0.0.1, which might
    # lead you to believe that all the workers started on the same (master)
    # node. However, I believe this string is self-reported by the workers and
    # is an artifact of SPARK_LOCAL_IP=127.0.0.1 above, which AFAICT just
    # tells the workers to put their web interfaces on localhost. They still
    # connect to the master and get work OK.
    [[ -z $ch_multinode ]] && SLURM_NNODES=1
    # shellcheck disable=SC2086
    worker_ct=$(grep -Fc 'Registering worker' $master_log || true)
    echo "node count: $SLURM_NNODES; worker count: ${worker_ct}"
    [[ $worker_ct -eq "$SLURM_NNODES" ]]
}


@test "${ch_tag}/pi" {
    run ch-run -b "$confbind" "$ch_img" -- \
               /opt/spark/bin/spark-submit --master "$master_url" \
               /opt/spark/examples/src/main/python/pi.py 64
    echo "$output"
    [[ $status -eq 0 ]]
    # This computation converges quite slowly, so we only ask for two correct
    # digits of pi.
    [[ $output = *'Pi is roughly 3.1'* ]]
}


@test "${ch_tag}/stop" {
    $pernode ch-run -b "$confbind" "$ch_img" -- /opt/spark/sbin/stop-worker.sh
    ch-run -b "$confbind" "$ch_img" -- /opt/spark/sbin/stop-master.sh
    sleep 2
    # Any Spark processes left?
    # (Use egrep instead of fgrep so we don’t match the grep process.)
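    # (The [o]rg bracket trick means grep's own command line, which contains
    # the bracketed pattern, does not itself match the regex.)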
    # shellcheck disable=SC2086
    $pernode ps aux | ( ! grep -E '[o]rg\.apache\.spark\.deploy' )
}


@test "${ch_tag}/hang" {
    # If there are any test processes remaining, this test will hang.
    true
}