summary: Check that snapd failure handling works
details: |
Snapd has a failure handling mechanism that automatically reverts to the
previous snapd when a refresh fails. This test verifies that mechanism by
installing a broken snapd snap multiple times, so that a device which keeps
retrying a refresh to a broken snapd snap is able to revert and fall back
as many times as needed. It also checks that after each revert the snapd
snap revision is the same as before the broken one was installed.
prepare: |
# on UC16, transition to the snapd snap before running the test, because
# UC16 uses the core snap by default
# there is a different test for the transition between the core snap and the
# snapd snap
if os.query is-core16; then
# rebuild the core snap into the snapd snap and install it
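# (the snaps-state helper is assumed to repack the snapd bits shipped in the
# core snap into a standalone snapd-from-core.snap in the current directory)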
"$TESTSTOOLS"/snaps-state repack_core_snap_into_snapd_snap
snap install --dangerous snapd-from-core.snap
fi
# ensure we are in snapd snap world
snap list snapd
ls -l /snap/snapd/
restore: |
if os.query is-core16; then
echo "ensuring we reverted fully to core snap system"
not test -d /snap/snapd
# cleanup the snapd-from-core snap we built
rm -f snapd-from-core.snap
rm -f /etc/systemd/user/snapd.session-agent.service
rm -f /etc/systemd/user/snapd.session-agent.socket
rm -f /etc/systemd/user/sockets.target.wants/snapd.session-agent.socket
systemctl --user daemon-reload || true
rm -f /etc/dbus-1/session.d/snapd.session-services.conf
rm -f /etc/dbus-1/system.d/snapd.system-services.conf
fi
debug: |
# dump failure data
systemctl status snapd.socket
systemctl status snapd.service
"$TESTSTOOLS"/journal-state get-log -u snapd.failure.service
"$TESTSTOOLS"/journal-state get-log -u snapd.socket
if os.query is-core16; then
# might be useful to know what's up with the core snap too on uc16
ls -l /snap/core/ || true
fi
ls -l /snap/snapd/ || true
cat /etc/systemd/system/snapd.service || true
cat /etc/systemd/system/usr-lib-snapd.mount || true
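# dump the state and the most recent change: the last line of "snap debug
# state" is taken to carry the id of the latest change, which is then
# expanded with --change (a best-effort debugging aid, hence the || true)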
/snap/snapd/x1/usr/bin/snap debug state /var/lib/snapd/state.json || true
/snap/snapd/x1/usr/bin/snap debug state --change="$(/snap/snapd/x1/usr/bin/snap debug state /var/lib/snapd/state.json|tail -n1|awk '{print $1}')" /var/lib/snapd/state.json || true
execute: |
if [ "$SPREAD_REBOOT" = 0 ]; then
echo "Testing failover handling of the snapd snap"
# for debugging
snap list --all
# get the number of times that snapd.failure was started before we do
# anything
started_before="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Starting Failure handling of the snapd snap.' || true)"
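# (snapd.service is expected to declare OnFailure=snapd.failure.service,
# which runs snap-failure to roll back to the previous snapd; the journal
# counters above and below track how often that unit ran)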
# test that random signals don't trigger snapd.failure
current=$(readlink /snap/snapd/current)
# make sure that snapd is being executed within the cgroup assigned to snapd.service
echo "Ensure snapd is running as part of the snapd.service unit"
# shellcheck disable=SC2046,SC2002
cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service
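# (on cgroup v2 /proc/<pid>/cgroup typically holds a single line such as
# "0::/system.slice/snapd.service", so matching /snapd.service confirms the
# daemon is running under its own systemd unit)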
echo "Verify that a random signal does not trigger the failure handling"
echo "and snapd is just restarted"
if os.query is-core-le 22; then
systemctl kill --signal=SIGKILL snapd.service
else
# Work around systemctl kill issue, where it may fail when the cgroup
# is recycled after the main process is killed. This seems to be a bug
# in systemd made more common by faster cgroup recycling by the kernel.
# shellcheck disable=SC2046
kill -9 $(cat /sys/fs/cgroup/system.slice/snapd.service/cgroup.procs)
fi
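# give systemd a moment to react to the kill before sampling the journal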
sleep 5
started_after_rand_sig="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Starting Failure handling of the snapd snap.' || true)"
if os.query is-core16; then
# on UC16 systemd triggers OnFailure dependencies and snapd gets
# started by snap-failure
if [ "$started_after_rand_sig" -eq "$started_before" ] ; then
echo "snapd.failure did not start snapd"
exit 1
fi
# make sure that snapd is being executed within the cgroup assigned to snapd.service
echo "Verify behavior on UC16, where snapd process eventually becomes"
# snap-failure restarts the snapd service
# shellcheck disable=SC2046,SC2002,SC2016
retry -n 60 --wait 1 sh -e -c 'cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service'
# the socket access is still functional
snap list
else
# TODO: this check is incorrect and is essentially racing with systemd,
# which could still try to activate the failure service, as we're
# essentially testing that an event does not happen
if [ "$started_after_rand_sig" -gt "$started_before" ] ; then
echo "snapd.failure inadvertently started with a random signal!"
exit 1
fi
# make sure that snapd is being executed within the cgroup assigned to snapd.service
echo "Ensure snapd is still running as part of the snapd.service unit"
# shellcheck disable=SC2046,SC2002
cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service
fi
echo "Snap list is working still"
snap list | MATCH "^snapd .* $current .*"
SNAPD_SNAP=/var/lib/snapd/snaps/snapd_"$current".snap
echo "Break snapd"
unsquashfs -d ./snapd-broken "$SNAPD_SNAP"
echo "" > ./snapd-broken/usr/lib/snapd/snapd
(cd ./snapd-broken && snap pack .)
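# (the repacked snap still looks valid to "snap install", but its
# usr/lib/snapd/snapd binary is now empty, so the refreshed snapd cannot
# start after the restart and the failure handling is expected to kick in;
# snap pack writes the resulting snapd_*.snap into the current directory)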
# try to install a broken snapd snap multiple times, so that a device which
# keeps retrying a refresh to a broken snapd snap is able to revert and
# fall back as many times as needed
#shellcheck disable=SC2167
for _ in $(seq 1 2); do
# look for a log indicating that the failure service has finished
# running, such that we know that the failure recovery is complete
finished_before="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Finished Failure handling of the snapd snap.' || true)"
echo "Install the broken snapd"
if snap install --dangerous ./snapd-broken/snapd_*.snap; then
echo "installing a broken snapd should not work, test failed"
exit 1
fi
echo "Verify that snapd.failure was activated when we tried to install a broken snapd"
#shellcheck disable=SC2165
for _ in $(seq 60); do
# get the number of times that snapd.failure finished
finished_after="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Finished Failure handling of the snapd snap.' || true)"
if [ "$finished_after" -gt "$finished_before" ] ; then
break
fi
sleep 1
done
# snapd.failure restarts the snapd service
echo "Await snapd service to become active"
retry --maxmins 5.5 systemctl is-active snapd.service
echo "Double check the status of snapd.socket"
systemctl is-active snapd.socket
echo ".. and both sockets exist"
test -S /run/snapd.socket
test -S /run/snapd-snap.socket
# just because snapd.failure.service is active doesn't mean that we are
# fully ready; wait until the snap command responds again
echo "And verify that snap commands still work and snapd is reverted"
retry --maxmins 5.5 bash -c "snap list | MATCH '^snapd .* $current .*'"
echo "Verify we got the expected error message"
snap change --last=install|MATCH "there was a snapd rollback across the restart"
echo "Ensure snapd is running as part of the snapd.service unit"
# shellcheck disable=SC2046,SC2002
cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service
echo "restart snapd and ensure we can still talk to it"
systemctl restart snapd.socket snapd.service
# we should still have the snapd snap and it should have the revision we had
# before the broken one was installed
snap list | MATCH "^snapd .* $current .*"
done
# we cannot undo the core -> snapd installation we did in the prepare
# section during restore, because we can't reboot during restore, so instead
# do that restoration here
# TODO: move this to restore section when spread is fixed
# see https://github.com/snapcore/spread/pull/85
if os.query is-core16; then
echo "Manually uninstall the snapd snap on UC16"
systemctl stop snapd.service snapd.socket snapd.autoimport.service snapd.snap-repair.service snapd.snap-repair.timer
retry --wait 3 -n 5 sh -c "umount \"/snap/snapd/$(readlink /snap/snapd/current)\""
rm -f /etc/systemd/system/usr-lib-snapd.mount
rm -f /etc/systemd/system/snap-snapd-*.mount
rm -f /etc/systemd/system/snapd.{service,timer,socket}
rm -f /etc/systemd/system/snapd.*.{service,timer,socket}
rm -f /etc/systemd/system/*.wants/snapd.*.{service,timer,socket}
rm -f /etc/systemd/system/snapd.mounts.target.wants/snap-snapd-*.mount
rm -f /etc/systemd/system/multi-user.target.wants/snap-snapd-*.mount
systemctl daemon-reload
# /usr/lib/snapd currently has the snapd snap's usr/lib/snapd bind mounted
# on top of it (via usr-lib-snapd.mount); drop that so the core snap's copy
# is used again
umount --lazy /usr/lib/snapd
systemctl start snapd.service snapd.socket
systemctl status snapd|MATCH " /usr/lib/snapd/snapd"
echo "Reboot to get a clean system again"
REBOOT
fi
else # "$SPREAD_REBOOT" != 0
# technically this check is unnecessary because we only reboot during
# the test's execution on uc16, but just be extra safe
if os.query is-core16; then
# now remove the snapd snap, since we booted with the core snap again;
# first we need to unmount the snapd snap's mount points
for mntpnt in /snap/snapd/*; do
if mountpoint -q "${mntpnt}"; then
unit="$(systemd-escape --path "${mntpnt}").mount"
umount "${mntpnt}" || true
rm -f "/etc/systemd/system/${unit}"
fi
done
systemctl daemon-reload || true
snap list
snap remove snapd
# and we can still run the rsync snap after the reboot
rsync --help
fi
fi