summary: Check that snapd failure handling works
details: |
Snapd has a failure handling mechanism that automatically reverts to the
previous snapd when a refresh fails. This test verifies that mechanism by
installing a broken snapd snap multiple times, so that a device which keeps
retrying a refresh to a broken snapd snap is able to revert and fall back
as many times as needed. It also checks that after each revert the snapd
snap revision is the same as before the broken one was installed.
prepare: |
# on UC16, transition to the snapd snap before running the test, because
# UC16 uses the core snap by default
# there is a different test for the transition between the core snap and the
# snapd snap
if os.query is-core16; then
# rebuild the core snap into the snapd snap and install it
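# (the snaps-state helper is assumed to repack the snapd bits shipped in the
# core snap into a standalone snapd-from-core.snap in the current directory)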
"$TESTSTOOLS"/snaps-state repack_core_snap_into_snapd_snap
snap install --dangerous snapd-from-core.snap
fi
# ensure we are in snapd snap world
snap list snapd
ls -l /snap/snapd/
restore: |
if os.query is-core16; then
echo "ensuring we reverted fully to core snap system"
not test -d /snap/snapd
# cleanup the snapd-from-core snap we built
rm -f snapd-from-core.snap
rm -f /etc/systemd/user/snapd.session-agent.service
rm -f /etc/systemd/user/snapd.session-agent.socket
rm -f /etc/systemd/user/sockets.target.wants/snapd.session-agent.socket
systemctl --user daemon-reload || true
rm -f /etc/dbus-1/session.d/snapd.session-services.conf
rm -f /etc/dbus-1/system.d/snapd.system-services.conf
fi
debug: |
# dump failure data
systemctl status snapd.socket
systemctl status snapd.service
"$TESTSTOOLS"/journal-state get-log -u snapd.failure.service
"$TESTSTOOLS"/journal-state get-log -u snapd.socket
if os.query is-core16; then
# might be useful to know what's up with the core snap too on uc16
ls -l /snap/core/ || true
fi
ls -l /snap/snapd/ || true
cat /etc/systemd/system/snapd.service || true
cat /etc/systemd/system/usr-lib-snapd.mount || true
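# dump the state and the most recent change: the last line of "snap debug
# state" is taken to carry the id of the latest change, which is then
# expanded with --change (a best-effort debugging aid, hence the || true)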
/snap/snapd/x1/usr/bin/snap debug state /var/lib/snapd/state.json || true
/snap/snapd/x1/usr/bin/snap debug state --change="$(/snap/snapd/x1/usr/bin/snap debug state /var/lib/snapd/state.json|tail -n1|awk '{print $1}')" /var/lib/snapd/state.json || true
execute: |
if [ "$SPREAD_REBOOT" = 0 ]; then
echo "Testing failover handling of the snapd snap"
# for debugging
snap list --all
# get the number of times that snapd.failure was started before we do
# anything
started_before="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Starting Failure handling of the snapd snap.' || true)"
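# (snapd.service is expected to declare OnFailure=snapd.failure.service,
# which runs snap-failure to roll back to the previous snapd; the journal
# counters above and below track how often that unit ran)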
# test that random signals don't trigger snapd.failure
current=$(readlink /snap/snapd/current)
# make sure that snapd is being executed within the cgroup assigned to snapd.service
echo "Ensure snapd is running as part of the snapd.service unit"
# shellcheck disable=SC2046,SC2002
cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service
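# (on cgroup v2 /proc/<pid>/cgroup typically holds a single line such as
# "0::/system.slice/snapd.service", so matching /snapd.service confirms the
# daemon is running under its own systemd unit)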
echo "Verify that a random signal does not trigger the failure handling"
echo "and snapd is just restarted"
if os.query is-core-le 22; then
systemctl kill --signal=SIGKILL snapd.service
else
# Work around systemctl kill issue, where it may fail when the cgroup
# is recycled after the main process is killed. This seems to be a bug
# in systemd made more common by faster cgroup recycling by the kernel.
# shellcheck disable=SC2046
kill -9 $(cat /sys/fs/cgroup/system.slice/snapd.service/cgroup.procs)
fi
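# give systemd a moment to react to the kill before sampling the journal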
sleep 5
started_after_rand_sig="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Starting Failure handling of the snapd snap.' || true)"
if os.query is-core16; then
# on UC16 systemd triggers OnFailure dependencies and snapd gets
# started by snap-failure
if [ "$started_after_rand_sig" -eq "$started_before" ] ; then
echo "snapd.failure did not start snapd"
exit 1
fi
# make sure that snapd is being executed within the cgroup assigned to snapd.service
echo "Verify behavior on UC16, where snapd process eventually becomes"
# snap-failure restarts the snapd service
# shellcheck disable=SC2046,SC2002,SC2016
retry -n 60 --wait 1 sh -e -c 'cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service'
# the socket access is still functional
snap list
else
# TODO: this check is incorrect and is essentially racing with systemd,
# which could still try to activate the failure service, as we're
# essentially testing that an event does not happen
if [ "$started_after_rand_sig" -gt "$started_before" ] ; then
echo "snapd.failure inadvertently started with a random signal!"
exit 1
fi
# make sure that snapd is being executed within the cgroup assigned to snapd.service
echo "Ensure snapd is still running as part of the snapd.service unit"
# shellcheck disable=SC2046,SC2002
cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service
fi
echo "Snap list is working still"
snap list | MATCH "^snapd .* $current .*"
SNAPD_SNAP=/var/lib/snapd/snaps/snapd_"$current".snap
echo "Break snapd"
unsquashfs -d ./snapd-broken "$SNAPD_SNAP"
echo "" > ./snapd-broken/usr/lib/snapd/snapd
(cd ./snapd-broken && snap pack .)
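# (the repacked snap still looks valid to "snap install", but its
# usr/lib/snapd/snapd binary is now empty, so the refreshed snapd cannot
# start after the restart and the failure handling is expected to kick in;
# snap pack writes the resulting snapd_*.snap into the current directory)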
# try to install a broken snapd snap multiple times, so that a device which
# keeps retrying a refresh to a broken snapd snap is able to revert and
# fall back as many times as needed
#shellcheck disable=SC2167
for _ in $(seq 1 2); do
# look for a log indicating that the failure service has finished
# running, such that we know that the failure recovery is complete
finished_before="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Finished Failure handling of the snapd snap.' || true)"
echo "Install the broken snapd"
if snap install --dangerous ./snapd-broken/snapd_*.snap; then
echo "installing a broken snapd should not work, test failed"
exit 1
fi
echo "Verify that snapd.failure was activated when we tried to install a broken snapd"
#shellcheck disable=SC2165
for _ in $(seq 60); do
# get the number of times that snapd.failure finished
finished_after="$("$TESTSTOOLS"/journal-state get-log -u snapd.failure | grep -c 'Finished Failure handling of the snapd snap.' || true)"
if [ "$finished_after" -gt "$finished_before" ] ; then
break
fi
sleep 1
done
# snapd.failure restarts the snapd service
echo "Await snapd service to become active"
retry --maxmins 5.5 systemctl is-active snapd.service
echo "Double check the status of snapd.socket"
systemctl is-active snapd.socket
echo ".. and both sockets exist"
test -S /run/snapd.socket
test -S /run/snapd-snap.socket
# just because snapd.failure.service is active doesn't mean that we are
# fully ready; wait until the snap command responds again
echo "And verify that snap commands still work and snapd is reverted"
retry --maxmins 5.5 bash -c "snap list | MATCH '^snapd .* $current .*'"
echo "Verify we got the expected error message"
snap change --last=install|MATCH "there was a snapd rollback across the restart"
echo "Ensure snapd is running as part of the snapd.service unit"
# shellcheck disable=SC2046,SC2002
cat /proc/$(pidof snapd)/cgroup | MATCH /snapd.service
echo "restart snapd and ensure we can still talk to it"
systemctl restart snapd.socket snapd.service
# we should still have the snapd snap and it should have the revision we had
# before the broken one was installed
snap list | MATCH "^snapd .* $current .*"
done
# we cannot undo the core -> snapd installation we did in the prepare
# section during restore, because we can't reboot during restore, so instead
# do that restoration here
# TODO: move this to restore section when spread is fixed
# see https://github.com/snapcore/spread/pull/85
if os.query is-core16; then
echo "Manually uninstall the snapd snap on UC16"
systemctl stop snapd.service snapd.socket snapd.autoimport.service snapd.snap-repair.service snapd.snap-repair.timer
retry --wait 3 -n 5 sh -c "umount \"/snap/snapd/$(readlink /snap/snapd/current)\""
rm -f /etc/systemd/system/usr-lib-snapd.mount
rm -f /etc/systemd/system/snap-snapd-*.mount
rm -f /etc/systemd/system/snapd.{service,timer,socket}
rm -f /etc/systemd/system/snapd.*.{service,timer,socket}
rm -f /etc/systemd/system/*.wants/snapd.*.{service,timer,socket}
rm -f /etc/systemd/system/snapd.mounts.target.wants/snap-snapd-*.mount
rm -f /etc/systemd/system/multi-user.target.wants/snap-snapd-*.mount
systemctl daemon-reload
# /usr/lib/snapd currently has the snapd snap's usr/lib/snapd bind mounted
# on top of it (via usr-lib-snapd.mount); drop that so the core snap's copy
# is used again
umount --lazy /usr/lib/snapd
systemctl start snapd.service snapd.socket
systemctl status snapd|MATCH " /usr/lib/snapd/snapd"
echo "Reboot to get a clean system again"
REBOOT
fi
else # "$SPREAD_REBOOT" != 0
# technically this check is unnecessary because we only reboot during
# the test's execution on uc16, but just be extra safe
if os.query is-core16; then
# now remove the snapd snap, since we booted with the core snap again;
# first we need to unmount the snapd snap's mount points
for mntpnt in /snap/snapd/*; do
if mountpoint -q "${mntpnt}"; then
unit="$(systemd-escape --path "${mntpnt}").mount"
umount "${mntpnt}" || true
rm -f "/etc/systemd/system/${unit}"
fi
done
systemctl daemon-reload || true
snap list
snap remove snapd
# and we can still run the rsync snap after the reboot
rsync --help
fi
fi