#!/bin/bash
# Helper for running tests on GPUs
#
# Author: Christian Kastner <ckk@kvr.at>
# License: MIT
# Print the command-line help to stderr.
#
# Arguments: none
# Outputs:   the help text below, on stderr (stdout stays untouched)
# The here-document delimiter is quoted, so the text is emitted literally:
# backticks and '$' inside it are not expanded.
usage() {
    cat >&2 <<"EOF"
usage: gpuisol-test-launcher [options] VENDOR CMD [ARGS ...]
Checks for availability of and access to a GPU from VENDOR, runs the test, and
exits with the exit code of the test, or with exit code 77 (which autopkgtest
interprets as "skipped") if no GPU was found. Optionally exports some system
data as autopkgtest artifacts.
VENDOR can currently be either 'amd' or 'nvidia'.
Use this helper to skip tests on ci.debian.net (which doesn't support GPUs) but
have them run on ci.ai.debian.net (which supports various GPUs from AMD and
NVIDIA), and on ci.rocm.debian.net (which has numerous AMD GPUs).
To run the autopkgtests on your own system, in a QEMU VM or a rootless podman
container, you will need the utilities provided by package gpuisol-qemu
or gpuisol-podman, respectively.
Supported options:
-h, --help
Print this help
--cd-tmp
Change directory to AUTOPKGTEST_TMP before executing the test.
Supported environment variables:
GPUISOL_TEST_LAUNCHER_WITH_DMESG
If set, export dmesg before and after the test as an autopkgtest artifact.
The user in the testbed must have access to dmesg, so either the user needs
to be privileged, or dmesg must not be restricted. Restriction can be
lifted with `sudo sysctl kernel.dmesg_restrict=0`.
GPUISOL_TEST_LAUNCHER_WITH_AMD_DRI[=PATH]
If set, export firmware and possibly other GPU-specific information as an
autopkgtest artifact. The user in the testbed must have access to
"/sys/kernel/debug/dri/", which requires privileges. Alternatively, one
can bind-mount that directory to some user-readable path, e.g.:
`mount --bind /sys/kernel/debug/dri /tmp/foo`, and pass that path as
GPUISOL_TEST_LAUNCHER_WITH_AMD_DRI=/tmp/foo.
GPUISOL_TEST_LAUNCHER_WITH_AMD_ROCMINFO
If set, export the output of `rocminfo` as an autopkgtest artifact.
Examples for d/tests/control, for a test with an AMD GPU:
Simple:
Test-Command: gpuisol-test-launcher amd testRunner --verbose --skip fooTest
Depends: @, gpuisol-test-launcher
Restrictions: skippable
Architecture: amd64 arm64 ppc64el
Write your own test runner/wrapper, have gpuisol-test-launcher call it:
Test-Command: gpuisol-test-launcher amd debian/tests/my-runner
Depends: @, gpuisol-test-launcher
Restrictions: skippable
Architecture: amd64 arm64 ppc64el
EOF
}
# Defaults for everything the command line and the environment can switch on.
vendor=''
opt_cd_tmp=0
opt_with_dmesg=0
opt_with_amd_dri=0
opt_with_amd_rocminfo=0
dri_path=''
# Options are parsed by hand rather than with getopt: parsing has to stop at
# the first positional argument, because everything from there on (including
# anything that looks like an option) belongs to the test command.
until (($# == 0)); do
    case $1 in
        --cd-tmp)
            opt_cd_tmp=1
            shift
            ;;
        --help | -h)
            usage
            exit 0
            ;;
        --)
            # Explicit end of our options
            shift
            break
            ;;
        -*)
            echo "$0: unknown option: $1" >&2
            usage
            exit 1
            ;;
        *)
            # First positional argument: leave it in place for the caller
            break
            ;;
    esac
done
# The first positional argument names the GPU vendor. It is consumed here, so
# that afterwards "$@" holds only the test command and its arguments.
[ -n "$1" ] || {
    echo "Not enough arguments." >&2
    exit 1
}
case "$1" in
    amd | nvidia)
        vendor="$1"
        ;;
    *)
        echo "Unsupported vendor: $1" >&2
        exit 1
        ;;
esac
shift
# Test that each variable is actually set (null or not).
#
# The GPUISOL_TEST_LAUNCHER_* names are the ones the help text documents.
# The ROCM_TEST_LAUNCHER_* names are the only ones this script used to read,
# and GPUISOL_TEST_LANCHER_WITH_DMESG is a misspelling the help text has
# carried; all of them are honored so that existing setups keep working.
if [ -n "${GPUISOL_TEST_LAUNCHER_WITH_DMESG+x}" ] \
    || [ -n "${GPUISOL_TEST_LANCHER_WITH_DMESG+x}" ] \
    || [ -n "${ROCM_TEST_LAUNCHER_WITH_DMESG+x}" ]; then
    opt_with_dmesg=1
fi
# The value (possibly empty) is the path to read DRI debug info from; the
# documented variable takes precedence if both spellings are set.
if [ -n "${GPUISOL_TEST_LAUNCHER_WITH_AMD_DRI+x}" ]; then
    opt_with_amd_dri=1
    dri_path="${GPUISOL_TEST_LAUNCHER_WITH_AMD_DRI}"
elif [ -n "${ROCM_TEST_LAUNCHER_WITH_AMD_DRI+x}" ]; then
    opt_with_amd_dri=1
    dri_path="${ROCM_TEST_LAUNCHER_WITH_AMD_DRI}"
fi
if [ -n "${GPUISOL_TEST_LAUNCHER_WITH_AMD_ROCMINFO+x}" ] \
    || [ -n "${ROCM_TEST_LAUNCHER_WITH_AMD_ROCMINFO+x}" ]; then
    opt_with_amd_rocminfo=1
fi
# Skip (exit 77) unless a usable GPU of the requested vendor is visible.
case "$vendor" in
    amd)
        # /dev/kfd is the device node this script uses as the indicator for
        # a working 'amdgpu' setup.
        if [ ! -e /dev/kfd ]; then
            echo "/dev/kfd not present, system either lacks AMD GPU or 'amdgpu' driver is not loaded."
            echo "Skipping tests."
            # Magic number to signal 'skipped'
            exit 77
        fi
        if [ "$(id -u)" != "0" ] && [ ! -r /dev/kfd ]; then
            echo "/dev/kfd present but no read permission."
            echo "Skipping tests."
            exit 77
        fi
        ;;
    nvidia)
        # Any one of three indicators counts, tried in this order: the
        # control device node, a successful nvidia-modprobe call, or an
        # 'nvidia' entry in the lsmod output.
        nvidia_found=0
        if [ -e /dev/nvidiactl ] \
            || { [ -x /usr/bin/nvidia-modprobe ] && /usr/bin/nvidia-modprobe -c 0 -u &>/dev/null; } \
            || { [ -x /usr/bin/lsmod ] && lsmod | grep -Eq '^nvidia[[:space:]]+'; }; then
            nvidia_found=1
        fi
        if [ "$nvidia_found" -ne 1 ]; then
            echo "Either no NVIDIA GPU, or 'nvidia' driver is not loaded."
            echo "Skipping tests."
            # Magic number to signal 'skipped'
            exit 77
        fi
        ;;
esac
# Print the current time as "<seconds since epoch>.<nanoseconds>".
# Used as a filename prefix so that artifact files sort by creation time.
#
# Outputs: the timestamp on stdout
# Returns: the exit status of date
tstamp() {
    date '+%s.%N'
}
# Check whether sudo is installed at /usr/bin/sudo.
#
# Arguments: $1 - optional text describing what cannot be done without sudo;
#                 appended to the error message when given
# Outputs:   an error message on stderr if sudo is missing
# Returns:   0 if /usr/bin/sudo is executable, 1 otherwise
check_for_sudo() {
    local consequence
    consequence="$1"
    [ -x /usr/bin/sudo ] && return 0
    if [ -z "$consequence" ]; then
        echo "$0: sudo not available." >&2
    else
        echo "$0: sudo not available; $consequence" >&2
    fi
    return 1
}
# Save the kernel log as an autopkgtest artifact (best effort).
#
# Arguments: $1 - phase, either "before" or "after"; used as file suffix
# Globals:   AUTOPKGTEST_ARTIFACTS (read) - directory receiving the artifact
# Outputs:   diagnostics on stderr if the log could not be saved
# Returns:   0 whether or not the log could be saved; an unknown phase is a
#            programming error and exits the whole script with status 2
save_dmesg() {
    local phase
    local outfile
    phase="$1"
    case "$phase" in
        before | after) ;;
        *)
            echo "save_dmesg: unknown phase $phase" >&2
            exit 2
            ;;
    esac
    outfile="$AUTOPKGTEST_ARTIFACTS/$(tstamp).dmesg.$phase"
    # First, try regular dmesg, which works for root and all systems with
    # kernel.dmesg_restrict=0
    dmesg >"$outfile" && return
    if check_for_sudo "could not save dmesg"; then
        # shellcheck disable=SC2024 # we don't need privileged write
        sudo -n dmesg >"$outfile" && return
        echo "$0: failed to save dmesg." >&2
    fi
    # Every attempt failed. The redirections above have already created the
    # file, so remove it rather than leave an empty or truncated artifact.
    rm -f -- "$outfile"
    return 0
}
# Save every amdgpu_firmware_info file found one level below the DRI debug
# directory as an autopkgtest artifact (best effort).
#
# Arguments: $1 - DRI debug directory; defaults to /sys/kernel/debug/dri
#                 when empty or omitted
# Globals:   AUTOPKGTEST_ARTIFACTS (read) - directory receiving the artifacts
# Outputs:   one file "<tstamp>.amdgpu_firmware_info.<entry>" per entry that
#            has firmware info; diagnostics on stderr
# Returns:   0
save_amd_firmware() {
    local dripath
    local fwinfo
    local outfile
    local fwfound
    local subpath
    local index
    local -a entries
    dripath="${1:-/sys/kernel/debug/dri}"
    fwfound=0
    if [ -d "$dripath" ]; then
        for subpath in "$dripath"/*; do
            fwinfo="$subpath/amdgpu_firmware_info"
            [ -f "$fwinfo" ] || continue
            index="${subpath##*/}"
            outfile="$AUTOPKGTEST_ARTIFACTS/$(tstamp).amdgpu_firmware_info.$index"
            cat "$fwinfo" >"$outfile"
            fwfound=1
        done
    else
        # directory might be there, we just might not have permission
        check_for_sudo "could not read firmware info" || return 0
        if ! sudo -n [ -d "$dripath" ]; then
            echo "$0: Cannot access $dripath, cannot query firmware info." >&2
            return
        fi
        # We cannot glob a directory we may not read, so let a privileged ls
        # list it. Read the names line by line instead of word-splitting.
        mapfile -t entries < <(sudo -n ls "$dripath")
        for index in "${entries[@]}"; do
            # ls prints bare entry names; they must be anchored at $dripath,
            # otherwise the path would be resolved against the current
            # directory and nothing would ever be found.
            fwinfo="$dripath/$index/amdgpu_firmware_info"
            sudo -n [ -f "$fwinfo" ] || continue
            outfile="$AUTOPKGTEST_ARTIFACTS/$(tstamp).amdgpu_firmware_info.$index"
            # shellcheck disable=SC2024 # we don't need privileged write
            sudo -n cat "$fwinfo" >"$outfile"
            fwfound=1
        done
    fi
    if [ "$fwfound" -eq 0 ]; then
        echo "$0: No firmware info found. Is $dripath populated?" >&2
    fi
}
# Save the output of rocminfo as an autopkgtest artifact.
#
# Globals:   AUTOPKGTEST_ARTIFACTS (read) - directory receiving the artifact
# Outputs:   "<tstamp>.rocminfo.txt" in the artifacts directory; diagnostics
#            on stderr
# Returns:   0 if rocminfo ran, whether or not it succeeded. A missing
#            /usr/bin/rocminfo terminates the whole script with status 1.
save_rocminfo() {
    local artifact
    # sudo is not considered here: access to /dev/kfd has been verified by
    # the time this runs, and that should be all rocminfo needs
    [ -x /usr/bin/rocminfo ] || {
        echo "$0: rocminfo not available, not saving info." >&2
        exit 1
    }
    artifact="$AUTOPKGTEST_ARTIFACTS/$(tstamp).rocminfo.txt"
    rocminfo >"$artifact" || echo "$0: Could not save rocminfo." >&2
}
### Pre-test ###
# 16 = testbed failure
# Artifacts can only be saved if we know where to put them.
if { [ "$opt_with_dmesg" -eq 1 ] \
    || [ "$opt_with_amd_dri" -eq 1 ] \
    || [ "$opt_with_amd_rocminfo" -eq 1 ]; } && [ -z "$AUTOPKGTEST_ARTIFACTS" ]; then
    echo "AUTOPKGTEST_ARTIFACTS not set, cannot save requested artifacts." >&2
    exit 16
fi
if [ "$opt_cd_tmp" -eq 1 ]; then
    # bash treats 'cd ""' as a successful no-op, so an unset or empty
    # AUTOPKGTEST_TMP must be rejected explicitly; otherwise the test would
    # silently run in whatever directory we were started from.
    if [ -z "$AUTOPKGTEST_TMP" ]; then
        echo "AUTOPKGTEST_TMP not set, cannot change directory." >&2
        exit 16
    fi
    cd "$AUTOPKGTEST_TMP" || exit 16
fi
if [ "$opt_with_dmesg" -eq 1 ]; then
    save_dmesg "before"
fi
if [ "$opt_with_amd_dri" -eq 1 ]; then
    save_amd_firmware "$dri_path"
fi
if [ "$opt_with_amd_rocminfo" -eq 1 ]; then
    save_rocminfo
fi
### Test ###
# Run the test command exactly as it was passed to us and remember its exit
# status, so that the post-test steps cannot change what we report.
exitcode=0
"$@" || exitcode=$?
### Post-test ###
if [ "$opt_with_dmesg" -eq 1 ]; then
    save_dmesg "after"
fi
exit "$exitcode"