File: failsafe

package info (click to toggle)
debootstick 2.8
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 296 kB
sloc: sh: 1,364; makefile: 34
file content (292 lines) | stat: -rw-r--r-- 6,972 bytes
# vim: filetype=sh

# failsafe mode handling
# ----------------------
# we want to leave the system in a clean state,
# whatever happens.
# for example, if a "disk full" error happens
# in the middle of the chrooted-customization
# step, we should be able to umount what has
# been mounted in the chroot, exit the chroot,
# umount things and remove peripherals created
# by debootstick outside the chroot, before
# exiting.
# we handle this by trapping the EXIT
# of the scripts. Also, each command creating
# a persistent artefact (mounts, devices, etc.)
# is recorded, in order to be able to 'undo the
# command' (i.e. remove the artefacts) if needed.

undo_all()
{
    # Undo every recorded failsafe command, most recent first.
    #
    # Each line of file 'FAILSAFE_COMMANDS' is a command that was
    # recorded by 'failsafe' (e.g. "mount -t proc none /proc").
    # Prefixing such a line with 'undo_' turns it into a call to
    # the matching undo_* function. 'tac' reverses the lines, so
    # that the last artefact created is the first one removed.
    #
    # The file path is quoted everywhere: it comes from mktemp,
    # which honours $TMPDIR, and that one may contain spaces.
    eval "$(tac "$FAILSAFE_COMMANDS" | \
            awk '{print "undo_" $0}')"

    # flush file 'FAILSAFE_COMMANDS'
    : > "$FAILSAFE_COMMANDS"
}

on_sigint()
{
    # handler of the INT signal (installed by start_failsafe_mode).
    # all handlers are disabled first, so that the cleanup
    # below is not triggered a second time by our own 'exit'.
    trap - INT EXIT USR1

    # same arguments as on_exit, with '--from-signal' prepended
    set -- --from-signal $*
    on_exit "$@"
    exit 1
}

on_sigusr1()
{
    # handler of the USR1 signal, which exit_wrong_user_input
    # sends to this shell: we are exiting because of wrong
    # user input.
    # all handlers are disabled first, so that the cleanup
    # below is not triggered a second time by our own 'exit'.
    trap - INT EXIT USR1

    # arguments are forwarded to on_exit as they are
    set -- $*
    on_exit "$@"
    exit 1
}

on_exit()
{   # Restore a clean state when the script terminates.
    #
    # usage: on_exit [--from-signal] [--toplevel] [<cleanup_function>]
    #   --from-signal       we were interrupted (passed by on_sigint)
    #   --toplevel          print status messages for the user
    #   <cleanup_function>  additional function called last, with the
    #                       saved exit code as its only argument
    # returns: the exit code saved on entry.

    # save exit code
    # (this must remain the very first command of the function,
    # any other command would overwrite $?)
    res=$?

    # get args
    toplevel=0
    fromsignal=0
    if [ "$1" = "--from-signal" ]
    then
        fromsignal=1
        shift
    fi
    if [ "$1" = "--toplevel" ]
    then
        toplevel=1
        shift
    fi
    cleanup_function=$1

    # if unexpected, inform user
    if [ $toplevel -eq 1 ]
    then
        if [ $fromsignal -eq 1 ]
        then
            echo
            echo "Interrupted."
        fi
        if [ $res -gt 0 ]
        then
            echo
            echo "E: an error occured." >&2
            echo "E: did you try 'debootstick --help-os-support'?" >&2
        fi
    fi

    # FAILSAFE_COMMANDS may still be unset or empty (e.g. if we get
    # here before the temporary file could be created): there is
    # nothing to undo in this case. Without this guard, the test
    # below would degenerate into '[ -s ]', which is always true.
    if [ -n "$FAILSAFE_COMMANDS" ]
    then
        if [ -s "$FAILSAFE_COMMANDS" ]    # if not empty
        then
            # inform user
            if [ $toplevel -eq 1 ]
            then
                echo -n "I: restoring a clean state... "
            fi

            # undo operations (remove artefacts)
            undo_all

            # inform user
            if [ $toplevel -eq 1 ]
            then
                echo "done"
            fi
        fi

        # the temporary file is useless from now on, remove it
        # (even if there was nothing left to undo)
        rm -f "$FAILSAFE_COMMANDS"
    fi

    # call an additional cleanup function
    # if provided.
    if [ -n "$cleanup_function" ]
    then
        $cleanup_function $res
    fi

    return $res
}

start_failsafe_mode()
{
    # stop if an error occurs
    set -e
    # clean remaining artefacts before exitting
    trap "on_exit $*" EXIT
    trap "on_sigint $*" INT
    trap "on_sigusr1 $*" USR1

    # allow with constructs (see f_with function)
    alias with="while f_with"

    # create a temporary file to save commands
    FAILSAFE_COMMANDS=$(mktemp)

    # bash does not expand aliases by default,
    # when running a script.
    # busybox sh does, and has no such configuration
    # option (thus the error ignoring construct)
    shopt -s expand_aliases 2>/dev/null || true
}

exit_wrong_user_input()
{
    # send the USR1 signal to the shell whose process id is $$;
    # start_failsafe_mode binds this signal to on_sigusr1,
    # which cleans up and exits with status 1.
    kill -s USR1 $$
}

undo_mount_with_prefix()
{
    # usage: undo_mount_with_prefix <prefix> <mount arguments...>
    # <prefix> is put in front of the 'umount' command; it is
    # expanded unquoted, so an empty prefix simply vanishes.

    # the mountpoint is the last argument
    for last in "$@"; do :; done

    # umount may fail, reporting that 'device is busy', for
    # 2 usual reasons: a process has its current directory on
    # this mount, or cached data was not written to disk yet.
    # Both are handled here.

    # 1) make sure we are not standing on the mountpoint
    cd /

    # 2) flush cached data. Some say that a sync request is
    # treated asynchronously, but that a second request forces
    # the first one to complete. Thus the 2 requests in a row.
    sync
    sync

    $1 umount "$last"

    # go back to the previous directory, if still possible
    cd - >/dev/null 2>&1 || :
}

undo_mount()
{
    # revert a 'mount' command: plain 'umount', i.e.
    # with an empty command prefix
    set -- "" $*
    undo_mount_with_prefix "$@"
}

undo_busybox_mount()
{
    # revert a 'busybox_mount' command: 'umount' is run
    # with the value of variable 'busybox_path' as its prefix
    set -- "$busybox_path" $*
    undo_mount_with_prefix "$@"
}

undo_mkdir()
{
    # revert a 'mkdir' command: the directory to remove
    # (with its content) is the last argument
    for last in "$@"; do :; done
    rm -r -f "$last"
}

undo_losetup()
{   # the failsafe command is assumed to be
    # $ failsafe losetup --sector-size <sector-size> <loop_device> <file>
    # thus the loop device to detach is our 3rd argument.
    set -- "$3"
    losetup -d "$1"
}

undo_partx()
{   # the failsafe command is assumed to be
    # $ failsafe partx -a <disk_device>
    # thus <disk_device> is our 2nd argument.
    disk_device="$2"

    # lvm devices associated to <disk_device> keep the related
    # partitions in a busy state, so they must be detached first.
    # Finding them takes a few steps:
    # partitions -> volume groups -> logical volumes.
    partitions=$(get_loop_device_partitions $disk_device)
    vg_names=$(pvs -o vg_name --noheadings $partitions 2>/dev/null || true)
    if [ -n "$vg_names" ]
    then
        # one '/dev/<vg_name>/<lv_name>' path per logical volume
        lv_devices=$(lvs -o vg_name,lv_name --noheadings $vg_names | \
                        awk '{printf "/dev/%s/%s\n", $1, $2}')
        for lv_device in $lv_devices
        do
            # deactivate the volume...
            lvchange -an $lv_device
            # ...and if its device is still there, remove it
            if [ -e $lv_device ]
            then
                dmsetup remove $lv_device
            fi
        done
    fi

    # the kernel can now be requested to remove
    # <disk_device> partitions
    partx -d "$disk_device"

    # refresh what lvm knows about physical volumes
    pvscan --cache >/dev/null
}

undo_chroot()
{
    # revert a recorded 'chroot' command: terminate the current shell.
    # NOTE(review): presumably this is run by the script executing
    # inside the chroot, so that terminating it means leaving the
    # chroot -- to be confirmed against the callers.
    # 'exit' is deliberately left without an argument here: the
    # exit status is the one the shell selects by itself.
    exit
}

failsafe()
{
    # usage: failsafe <command> [<args>...]
    # run <command> and, if it succeeded, record it in file
    # 'FAILSAFE_COMMANDS', in order to be able to undo it later
    # (by calling undo_<command> with the same arguments).
    # if <command> fails, nothing is recorded and its status
    # is returned.
    # note: the command line is stored as a flat string, so
    # arguments are split again on whitespace, both here and
    # when the undo function is called.
    # the file path is quoted: it comes from mktemp, which
    # honours $TMPDIR, and that one may contain spaces.
    $* && printf '%s\n' "$*" >> "$FAILSAFE_COMMANDS"
}

# work around the fact that losetup sometimes fails with a
# 'resource temporarily unavailable' error
# (sometimes seen when using the "--sector-size 4096" option):
# the command is retried after a short delay.
# usage: failsafe_losetup <losetup args...>
# The number of attempts is bounded (100 by default, which can
# be overridden with variable FAILSAFE_LOSETUP_MAX_TRIES), so
# that a permanent failure (wrong arguments, missing file...)
# is reported instead of looping forever. In this case nothing
# is recorded and 1 is returned.
failsafe_losetup()
{
    losetup_tries_left=${FAILSAFE_LOSETUP_MAX_TRIES:-100}
    while ! losetup "$@" 2>/dev/null
    do
        losetup_tries_left=$((losetup_tries_left - 1))
        if [ "$losetup_tries_left" -le 0 ]
        then
            echo "E: 'losetup $*' keeps failing, giving up." >&2
            return 1
        fi
        sleep 0.1   # retry after a short delay
    done
    # success: record the command, for undo_losetup
    printf '%s\n' "losetup $*" >> "$FAILSAFE_COMMANDS"
}

undo()
{
    # undo-ing one failsafe operation only
    # usage: undo <command> [<args>...]
    # (same words as those given to 'failsafe')

    # we have to remove this operation from
    # file 'FAILSAFE_COMMANDS'.
    # first, we escape it in order to use it as a
    # regular expression in the sed statement below:
    # every character which is special in such an
    # expression ( ] [ \ . * ^ $ ) as well as the '/'
    # delimiter gets a backslash. Otherwise a command
    # such as "mount a.b /mnt" would also match (and
    # remove) the record of "mount aXb /mnt".
    escaped_cmd="$(
        printf '%s\n' "$*" | \
            sed -e 's|[][\\/.*^$]|\\&|g')"
    # and now we remove it
    # (every line equal to this command is removed)
    sed -i -e "/^$escaped_cmd\$/d" "$FAILSAFE_COMMANDS"

    # of course we really undo it
    eval "undo_$*"
}

# the function f_with() allows constructs such as:
#
# with mount [...]; do
#   [...]
# done
#
# The unmount-ing will be done at the end of the
# block regardless of what happens inside (whether
# an error is raised or not).
#
# 'with' is actually an alias involving this function
# and a while loop:
# with -> while f_with   (see 'start_failsafe_mode')
#
# we ensure that the while loop stops at the 2nd
# iteration.
f_with()
{
    # usage (through the 'with' alias): with <command>; do [...]; done
    # 1st call: run <command> (recorded by 'failsafe'), return 0
    #           so that the body of the loop is executed;
    # 2nd call: undo <command>, return 1 to stop the loop.

    # save the command
    cmd=$*
    # we need an id to recognise this construct
    # (computed from the quoted command string, so that it does
    # not depend on how the shell would glob or split it)
    with_id=$(printf '%s\n' "$cmd" | md5sum | awk '{print $1}')
    # let's load the stack of ids we have
    set -- $with_ids_stack

    # if this is a new id...
    if [ "$1" != "$with_id" ]
    then
        # this is a new 'with' construct
        # perform the command requested.
        # we are evaluated as the condition of a 'while' loop,
        # a context where 'set -e' is ignored: a failure has to
        # be handled explicitly, otherwise the body of the loop
        # would be run although the command failed.
        failsafe $cmd || exit $?
        # update the stack
        with_ids_stack="$with_id $with_ids_stack"
        return 0    # continue the while loop
    else
        # second (and last) time through this 'with' construct
        # pop this id from the stack
        shift; with_ids_stack=$*
        # revert the command
        undo $cmd
        return 1    # stop the while loop
    fi
}