File: heri-eval

package info (click to toggle)
herisvm 0.9.0-2
links: PTS, VCS
area: main
in suites: bookworm, bullseye, forky, sid, trixie
size: 328 kB
sloc: sh: 1,471; ruby: 615; makefile: 28
file content (412 lines) | stat: -rwxr-xr-x 11,064 bytes
#!/usr/bin/env bash

# Copyright (c) 2015 Alexandra Figlovskaya <fglval@gmail.com>
# Copyright (c) 2015-2019 Aleksey Cheusov <vle@gmx.net>
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Commands and directories the user may override through the environment;
# each default applies only when the variable is unset or empty.
SVM_TRAIN_CMD=${SVM_TRAIN_CMD:-svm-train}
SVM_PREDICT_CMD=${SVM_PREDICT_CMD:-svm-predict}

SVM_HERI_STAT_CMD=${SVM_HERI_STAT_CMD:-heri-stat}
SVM_HERI_STAT_ADDONS_CMD=${SVM_HERI_STAT_ADDONS_CMD:-heri-stat-addons}
SVM_HERI_SPLIT_CMD=${SVM_HERI_SPLIT_CMD:-heri-split}

TMPDIR=${TMPDIR:-/tmp}

############################################################
# Stop at the first failing command and run every tool in the C locale.
set -o errexit
LC_ALL=C
export LC_ALL

# Indent every non-empty line by two spaces; empty lines pass through as is.
# Arguments: optional input files (standard input is read when none are given)
indent2 (){
    sed -e 's/^./  &/' "$@"
}

# Trap body for a signal: run the exit-time cleanup, restore the default
# disposition of the signal and then send that same signal to this shell.
# Arguments: $1 - signal name as understood by trap and kill (e.g. INT)
sig_handler (){
    local signame="$1"

    on_exit
    trap - "$signame"
    kill -s "$signame" "$$"
}

# Exit-time cleanup: print any captured stderr of the train/predict commands,
# then either remove the temporary directory or, when keep_tmp is set,
# report its location on standard error.
# Globals: keep_tmp (read), tmp_dir (read)
on_exit(){
    show_stderr
    if test -n "$keep_tmp"; then
	echo "Temporary files are here $tmp_dir" 1>&2
    elif test -n "$tmp_dir"; then
	# tmp_dir is still empty when the script stops before mktemp ran
	rm -rf "$tmp_dir"
    fi
}

# Print the largest numeric value found in fields 2..NF of the input, plus one.
# For "index:value" fields awk's numeric conversion keeps only the leading
# number, so the result is the highest feature index plus one.
# Arguments: input files (standard input is read when none are given)
# Output:    a single integer on stdout; 1 when no such field is seen
calculate_feature_count (){
    awk '
    {
	for (i = 2; i <= NF; ++i) {
	    # "$i + 0" forces a numeric reading of the field
	    if ($i + 0 > m)
		m = $i + 0
	}
    }
    END {
	print m+1
    }' "$@"
}

# Print the "outcome prediction" lines of the testing sets, one per object.
#
# Without a non-empty testing_fold.txt the single outcome_and_prediction1.txt
# file is printed unchanged.  Otherwise line K of testing_fold.txt names the
# fold of object K, and the J-th line of the N-th file listed in
# $prediction_all is taken as the J-th object assigned to fold N; the lines
# are printed in object order.
# Globals: tmp_dir (read), prediction_all (read, split into file names)
# Returns: status of cat/awk; awk exits with 12 when the total number of
#          input lines is odd
predictions_from_testing_sets (){
    if test -s "$tmp_dir/testing_fold.txt"; then
	awk '
	NR == FNR {
	    # first file: remember which object is the k-th member of its fold
	    fold = $1
	    seen_in_fold[fold]++
	    object_of[fold, seen_in_fold[fold]] = NR
	    next
	}

	# every further file holds the predictions of the next fold
	FNR == 1 {
	    current_fold++
	}

	{
	    result[object_of[current_fold, FNR]] = $0
	}

	END {
	    # fold assignments and predictions must come in equal numbers
	    if (NR % 2) {
		print "internal error!" > "/dev/stderr"
		exit 12
	    }
	    total = NR / 2
	    for (k = 1; k <= total; k++)
		print result[k]
	}' "$tmp_dir/testing_fold.txt" $prediction_all
    else
	cat "$tmp_dir/outcome_and_prediction1.txt"
    fi
}

# Copy the captured stderr of the train and predict commands to standard
# error, fold by fold, each non-empty file preceded by a header line.
# Globals: last (read; nothing is done while it is empty), tmp_dir (read)
# Returns: 0 unless cat fails
show_stderr (){
    local i fn kind

    if test -z "$last"; then
	# Explicit status: this function runs from the EXIT trap, where a bare
	# "return" may yield the pre-trap exit status in some bash versions;
	# under "set -e" that would stop the cleanup that follows the call.
	return 0
    fi
    for i in $(seq "$last"); do
	for kind in train predict; do
	    fn="$tmp_dir/${kind}_stderr${i}.txt"
	    if test -s "$fn"; then
		echo "---- $kind stderr $i ----" 1>&2
		cat -- "$fn" 1>&2
	    fi
	done
    done
}

# Wait for the background jobs recorded in pid[1..last].
# Globals: last (read), pid (read)
# Returns: 0 when every job succeeded, otherwise the exit status of the
#          last job (in index order) that failed
wait_all (){
    local job
    local status=0

    for job in $(seq $last); do
	wait ${pid[$job]} || status=$?
    done
    return "$status"
}

#    heri-eval -t10 -n 5 dataset.libsvm          # 10*5-fold cross-validation
# Print the command-line help (options, environment variables, examples)
# to standard error.  Nothing is written to standard output.
usage(){
    cat 1>&2 <<'EOF'
usage: heri-eval [OPTIONS] training_set [-- SVM_TRAIN_OPTIONS]

OPTIONS:
      -h                Help message

      -n <N>            The number of folds for T*N-fold cross-validation
      -r <ratio>        The ratio (in percents) for training set for hold-out
      -e testing_set    Testing set for hold-out

      -t <T>            The number of runs for T*N-fold cross-validation
                        or holdouts

      -T <threshold>    Threshold for score

      -o <filename>     Save predictions from testing sets
                        to the specified file
                        (outcome_tag prediction_tag [score])
      -O <filename>     Save incorrectly classified objects
                        to the specified file
                        (#object_number: outcome_tag prediction_tag [score])
      -m <filename>     Save confusion matrix to the specified file
                        (frequency : outcome_tag prediction_tag)

      -f                Enable output of per-fold statistics (see -Mf)
      -M <chars>        Output mode:
                           t -- output total statistics,
                           f -- output per-fold statistics,
                           c -- output cross-fold statistics.
      -s <split_opts>   Options passed to heri-split(1)
      -p <stat_opts>    Options passed to heri-stat(1)
      -S <seed>         Seed value passed to heri-split(1).
                        If it is not specified, the dataset is splitted
                        into training and testing datasets randomly.
      -K                Keep temporary directory after exiting
      -D                Debugging mode, implies -K

SVM_TRAIN_OPTIONS: options passed to svm-train(1) and alike

Environment variables:
  SVM_TRAIN_CMD   -- training utility, e.g., liblinear-train
                     (the default is svm-train)
  SVM_PREDICT_CMD -- predicting utility, e.g., liblinear-predict
                     (the default is svm-predict)
  TMPDIR          -- temporary directory (the default is /tmp)

Examples: 
  Ex1: heri-eval -e testing_set.libsvm training_set.libsvm -- -s 0 -t 0
  Ex2: export SVM_TRAIN_CMD='liblinear-train'
       export SVM_PREDICT_CMD='liblinear-predict'
       heri-eval -p '-mr' -n 5 training_set.libsvm -- -s 4 -q

  Ex3: export SVM_TRAIN_CMD='scikit_rf-train --estimators=400'
       export SVM_PREDICT_CMD='scikit_rf-predict'
       heri-eval -p '-c' -Mt -t 50 -r 70 dataset.libsvm
EOF
}

# Defaults; -S, -t and -M/-f below may override seed, times and output_mode.
seed=$RANDOM
runs=1
output_mode=tc
times=1

while getopts De:fhKm:M:n:o:O:p:r:s:S:t:T: opt; do
    case "$opt" in
	'?')	usage; exit 1;;
	h)	usage; exit 0;;

	# evaluation scheme
	n)	number_of_folds="$OPTARG";;
	e)	testing_set="$OPTARG";;
	r)	ratio="$OPTARG";;
	t)	times="$OPTARG";;
	S)	seed="$OPTARG";;

	# options accumulated for heri-stat and heri-split
	T)	heristat_args="$heristat_args -t$OPTARG";;
	p)	heristat_args="$heristat_args $OPTARG";;
	s)	herisplit_args="$herisplit_args $OPTARG";;

	# output files
	m)	confusion_matrix="$OPTARG";;
	o)	predictions="$OPTARG";;
	O)	incorrect_predictions="$OPTARG";;

	# output mode: -f adds "f" to the current mode, -M replaces it
	f)	output_mode="f$output_mode";;
	M)	output_mode="$OPTARG";;

	# temporary files and debugging
	K)	keep_tmp=1;;
	D)	keep_tmp=1
		debug=1;;
    esac
done
shift $((OPTIND - 1))

# Collect the training-set names that precede "--" into $files, each one
# shell-quoted with printf %q; whatever follows "--" stays in "$@".
while (( $# > 0 )); do
    if [[ "$1" == "--" ]]; then
	shift
	break
    fi
    files="$files $(printf '%q' "$1")"
    shift
done

# Clean up on Ctrl-C and on every exit of the script.
trap 'sig_handler INT' INT
trap on_exit EXIT

# One of -n, -r or -e has to select the evaluation scheme.
if [[ -z "$number_of_folds" && -z "$testing_set" && -z "$ratio" ]]; then
    echo 'Either -n or -r or -e must be specified, run heri-eval -h for details' 1>&2
    exit 1
fi

if [[ -z "$files" ]]; then
    echo 'Training set is mandatory, run heri-eval -h for details' 1>&2
    exit 1
fi

# Private working directory under $TMPDIR for all intermediate files.
tmp_dir=$(mktemp -d "$TMPDIR"/svm.XXXXXX)

# Perform one run: build the training/testing sets in $tmp_dir, train a model
# per fold in parallel, then predict every testing set in parallel.
#
# The sets come from heri-split with -c (when number_of_folds is set) or with
# -R (when ratio is set); otherwise the training files are concatenated and
# $testing_set is copied.  The results are left as train<i>.libsvm,
# test<i>.libsvm, model<i>.bin, prediction<i>.txt and the per-command
# stdout/stderr captures, for i in 1..last.
#
# Arguments: options passed to the training command
# Globals:   files, herisplit_args, number_of_folds, ratio, testing_set,
#            tmp_dir, SVM_HERI_SPLIT_CMD, SVM_TRAIN_CMD, SVM_PREDICT_CMD (read);
#            seed (incremented after each heri-split call), last, pid (written)
training_testing (){
    local i
    local -a input_files

    # $files is a list of printf-%q-quoted words; re-parsing it here yields
    # the exact original names, so a name with blanks stays one argument.
    eval "input_files=($files)"

    if test -n "$number_of_folds"; then
	${SVM_HERI_SPLIT_CMD} $herisplit_args -c "$number_of_folds" -d "$tmp_dir" -s "$seed" "${input_files[@]}"
	seed="$((seed+1))"
	last="$number_of_folds"
	for i in $(seq $number_of_folds); do
	    mv "$tmp_dir/test$i.txt" "$tmp_dir/test$i.libsvm"
	    mv "$tmp_dir/train$i.txt" "$tmp_dir/train$i.libsvm"
	done
    elif test -n "$ratio"; then
	${SVM_HERI_SPLIT_CMD} $herisplit_args -R "$ratio" -d "$tmp_dir" -s "$seed" "${input_files[@]}"
	mv "$tmp_dir/test.txt" "$tmp_dir/test1.libsvm"
	mv "$tmp_dir/train.txt" "$tmp_dir/train1.libsvm"
	rm "$tmp_dir/testing_fold.txt"
	seed="$((seed+1))"
	last=1
    else
	cat -- "${input_files[@]}" > "$tmp_dir/train1.libsvm"
	cp "$testing_set" "$tmp_dir/test1.libsvm"
	last=1
    fi

    # the command variables are expanded unquoted on purpose: they may carry
    # options (e.g. 'scikit_rf-train --estimators=400')
    for i in $(seq $last); do
	${SVM_TRAIN_CMD} "$@" "$tmp_dir/train$i.libsvm" "$tmp_dir/model$i.bin" \
	    2> "$tmp_dir/train_stderr${i}.txt" \
	    >  "$tmp_dir/train_stdout${i}.txt" &
	pid[$i]=$!
    done

    wait_all

    for i in $(seq $last); do
	${SVM_PREDICT_CMD} "$tmp_dir/test$i.libsvm" "$tmp_dir/model$i.bin" \
	    "$tmp_dir/prediction${i}.txt" \
	    2> "$tmp_dir/predict_stderr${i}.txt" \
	    >  "$tmp_dir/predict_stdout${i}.txt" &
	pid[$i]=$!
    done

    wait_all

    rm -f "$tmp_dir/outcome.txt" "$tmp_dir/prediction.txt"
}

# For every run t and fold i: join the first column of test<t>_<i>.libsvm with
# prediction<t>_<i>.txt into outcome_and_prediction<t>_<i>.txt (blank
# separated), store the heri-stat -1R report in stats<t>_<i>.txt and, when the
# output mode contains "f", print the per-fold statistics.
# Globals: times, last, tmp_dir, heristat_args, output_mode,
#          SVM_HERI_STAT_CMD (read); t, i (loop variables, left set);
#          prediction_all (written: joined files of the last run only)
show_stat (){
    local tag merged

    for t in $(seq $times); do
	prediction_all=''
	for i in $(seq $last); do
	    tag="${t}_${i}"
	    merged="$tmp_dir/outcome_and_prediction${tag}.txt"

	    awk '{print $1}' "$tmp_dir/test${t}_$i.libsvm" > "$tmp_dir/outcome${tag}.txt"
	    paste "$tmp_dir/outcome${tag}.txt" "$tmp_dir/prediction${tag}.txt" |
		tr '\t' ' ' > "$merged"

	    ${SVM_HERI_STAT_CMD} -1R $heristat_args "$merged" > "$tmp_dir/stats${tag}.txt"

	    if [[ "$output_mode" == *f* ]]; then
		echo "Fold ${t}x$i statistics"
		${SVM_HERI_STAT_CMD} -1 $heristat_args "$merged" | indent2
		echo ''
	    fi

	    # hard link under the run-less name, overwritten by later runs
	    ln -f "$merged" "$tmp_dir/outcome_and_prediction${i}.txt"
	    prediction_all="$prediction_all $merged"
	done
    done
}

# Export the feature count of the training files (plus the -e testing set)
# as HERISVM_FC.
# $files is a list of printf-%q-quoted words, so it is re-parsed with eval
# into an array; that way every name reaches awk as exactly one argument.
eval "feature_count_inputs=($files)"
# The testing set is appended only when given: an empty argument would be
# taken as a file name.  As before, a failure of the helper does not stop
# the script here, because export hides the substitution's status.
export HERISVM_FC=$(calculate_feature_count "${feature_count_inputs[@]}" ${testing_set:+"$testing_set"})

# Perform $times runs.  After each run the per-fold files are renamed from
# <name><fold> to <name><run>_<fold>, so the next run cannot overwrite them.
for t in $(seq $times); do
    training_testing "$@"

    for i in $(seq $last); do
	run_fold="${t}_$i"

	for set_name in train test; do
	    mv "$tmp_dir/${set_name}${i}.libsvm" "$tmp_dir/${set_name}${run_fold}.libsvm"
	done

	mv "$tmp_dir/prediction${i}.txt" "$tmp_dir/prediction${run_fold}.txt"

	for log_name in train_stderr train_stdout predict_stderr predict_stdout; do
	    mv "$tmp_dir/${log_name}${i}.txt" "$tmp_dir/${log_name}${run_fold}.txt"
	done

	# the model file is renamed only when it exists
	if [[ -f "$tmp_dir/model${i}.bin" ]]; then
	    mv "$tmp_dir/model${i}.bin" "$tmp_dir/model${run_fold}.bin"
	fi
    done
done

# Build the per-fold outcome/prediction and statistics files.
show_stat

# Gather the testing-set predictions into one temporary file that feeds the
# -o, -O and -m outputs below.
merged_predictions="$tmp_dir/prediction.txt"
predictions_from_testing_sets > "$merged_predictions"

# -o: save the predictions as they are
if [[ -n "$predictions" ]]; then
    cp "$merged_predictions" "$predictions"
fi

# -O: save the lines whose first two columns differ, prefixed with
# "#" and the line number
if [[ -n "$incorrect_predictions" ]]; then
    awk '$1 != $2 {print "#" NR, $0}' "$merged_predictions" \
	> "$incorrect_predictions"
fi

# -m: count every differing pair of first two columns, most frequent first
if [[ -n "$confusion_matrix" ]]; then
    awk '$1 != $2 {print $1, $2}' "$merged_predictions" |
	sort |
	uniq -c |
	sort -rn |
	awk '{print $1, ":", $2, $3}' > "$confusion_matrix"
fi

rm "$merged_predictions"

# Output mode "t": statistics over the outcome/prediction files of all runs
# and folds.
if [[ "$output_mode" == *t* ]]; then
    echo 'Total statistics'
    ${SVM_HERI_STAT_CMD} -1 $heristat_args "$tmp_dir"/outcome_and_prediction*_*.txt | indent2
    echo ''
fi

# Output mode "c": statistics across the per-fold reports, printed only for
# cross-validation (-n).
if [[ -n "$number_of_folds" && "$output_mode" == *c* ]]; then
    echo 'Total cross-folds statistics'
    ${SVM_HERI_STAT_ADDONS_CMD} "$tmp_dir"/stats*.txt | indent2
fi