1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
|
#!/bin/sh
#
#
#___INFO__MARK_BEGIN__
##########################################################################
#
# The Contents of this file are made available subject to the terms of
# the Sun Industry Standards Source License Version 1.2
#
# Sun Microsystems Inc., March, 2001
#
#
# Sun Industry Standards Source License Version 1.2
# =================================================
# The contents of this file are subject to the Sun Industry Standards
# Source License Version 1.2 (the "License"); You may not use this file
# except in compliance with the License. You may obtain a copy of the
# License at http://gridengine.sunsource.net/Gridengine_SISSL_license.html
#
# Software provided under this License is provided on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
# WITHOUT LIMITATION, WARRANTIES THAT THE SOFTWARE IS FREE OF DEFECTS,
# MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE, OR NON-INFRINGING.
# See the License for the specific provisions governing your rights and
# obligations concerning the Software.
#
# The Initial Developer of the Original Code is: Sun Microsystems, Inc.
#
# Copyright: 2001 by Sun Microsystems, Inc.
#
# All Rights Reserved.
#
##########################################################################
#___INFO__MARK_END__
# Explicitly switch off "nounset" and "errexit": unset parameters expand to
# the empty string and failing commands do not abort the script.
set +u # don't treat unset parameters as an error
set +e # don't exit on bad command status

# The third positional argument is the checkpoint directory; all files this
# script creates are placed underneath it.
ckpt_dir=$3

# Create the directory-wide log file on first use. Mode 666 makes it
# readable and writable by every user (presumably so jobs of different
# owners can append to it -- confirm).
# The path is quoted so a directory name containing whitespace or glob
# characters is still treated as a single operand.
if [ ! -f "$ckpt_dir/ckpt.log" ]; then
  touch "$ckpt_dir/ckpt.log"
  chmod 666 "$ckpt_dir/ckpt.log"
fi
# Derive the identifiers used for file and directory names:
#   jobid  - names the checkpoint directory and restart file
#   jobdir - names the job's sub-directory under active_jobs
# When SGE_TASK_ID is empty or the literal string "undefined", the job id
# alone is used and the directory name gets the fixed suffix ".1";
# otherwise both are "<JOB_ID>.<SGE_TASK_ID>".
case "$SGE_TASK_ID" in
  undefined | "")
    jobid=$JOB_ID
    jobdir=$JOB_ID.1
    ;;
  *)
    jobdir=$JOB_ID.$SGE_TASK_ID
    jobid=$jobdir
    ;;
esac
#
# create temp directory for holding checkpoint info
#
tmpdir=$ckpt_dir/ckpt.$jobid
mkdir -p "$tmpdir"
# The rest of the script reads and moves files by relative name (job_pid,
# osjobid, chkpnt_*). If the directory cannot be entered, stop here instead
# of operating on whatever the current working directory happens to be.
cd "$tmpdir" || {
  echo "$(basename "$0"): cannot change directory to $tmpdir" >&2
  exit 1
}
#
# create log file
#
# From here on, stdout and stderr of this script and of every command it
# starts are appended to the per-job log file.
F=$tmpdir/checkpoint.log
touch "$F"
exec >> "$F" 2>&1
# Write a banner for this invocation: script name, time, calling user and
# the arguments the script was started with. "$*" is quoted so arguments
# containing glob characters are logged literally instead of being expanded
# against the current directory.
echo -------------------------------------------------------------
echo "$(basename "$0") called at $(date)"
echo "called by: $(id)"
echo "with args: $*"
#
# Get original job_pid and osjobid
# (read from the files "job_pid" and "osjobid" in the current directory)
#
job_pid=$(cat job_pid)
osjobid=$(cat osjobid)
# restore the O.S. job identifier to the jobs directory
# NOTE: do we need to restore osjobid?
# NOTE: do we need to restore job_pid?
# The job's active directory is located relative to the directory that
# holds $JOB_SCRIPT.
job_dir=$(dirname "$JOB_SCRIPT")/../active_jobs/$jobdir
echo "original job_pid=$job_pid"
echo "original osjobid=$osjobid"
# Log what the active job directory currently contains, next to the original
# values printed above.
ls -la "$job_dir/job_pid"
cat "$job_dir/job_pid"
ls -la "$job_dir/osjobid"
cat "$job_dir/osjobid"
# Writing the original values back is disabled (see the NOTE lines above).
#echo $job_pid > $job_dir/job_pid
#echo $osjobid > $job_dir/osjobid
# Record the restart in the directory-wide checkpoint log.
echo "$(date +"%D %T") Job $jobid (job_pid=$job_pid, osjobid=$osjobid) restarting" >> "$ckpt_dir/ckpt.log"
#
# If previous restart file exists, we assume that we
# died during a checkpoint and we should recover using
# this file
#
# A leftover "chkpnt_<jobid>.save" replaces "chkpnt_<jobid>" before the
# restart is attempted. File names are quoted so they are always passed as a
# single operand.
if [ -f "chkpnt_$jobid.save" ]; then
  mv "chkpnt_$jobid.save" "chkpnt_$jobid"
fi
#
# Register restart file, just in case it's not registered. This could
# happen if the host died during the last checkpoint
#
# The command line is echoed first so it shows up in the log; the exit
# status of rsvresf is not checked.
echo "/usr/bin/rsvresf chkpnt_$jobid"
/usr/bin/rsvresf "chkpnt_$jobid"
#
# Now restart the job and wait for it to complete
#
# Echo the command line into the log, run the restart and remember its exit
# status; that status becomes the exit status of this script.
echo "/usr/bin/restart -f -i -w chkpnt_$jobid"
/usr/bin/restart -f -i -w "chkpnt_$jobid"
exit_status=$?
echo "Exit status of restart command: $exit_status"
# Now be careful: The restart command is the parent process of the restarted
# job. Grid Engine is the parent process of the restart command.
# If the job was killed (probably due to a migration request), our parent
# needs to learn about it. Grid Engine will detect an exit status > 128
# analogous to a KILL.
# NOTE(review): earlier comments here spoke of "killing ourselves", but no
# code in this script sends a signal; the status of the restart command is
# simply passed through unchanged.
#
# Disabled: translation of exit status 1 using accounting data.
# NOTE(review): the disabled code references $job_id, which is not set
# anywhere in this script (it uses $jobid and $JOB_ID) -- fix before
# re-enabling.
#if [ $exit_status = 1 ]
#then
# jstat=`/bin/acctcom -j $job_id -b -p -Z -f -v /usr/adm/acct/day/pacct | $SGE_ROOT/ckpt/cray_parse_job_status $2`
# echo "jobstatus $job_id $2 = $jstat"
# if [ "$jstat" = "" ]
# then
# exit_status=100
# elif [ "$jstat" = "0" ]
# then
# exit_status=0
# else
# exit_status=`expr $jstat + 128`
# fi
#fi
# Exit with the exit status of our child, after recording it in the
# directory-wide checkpoint log and in the per-job log.
echo "$(date +"%D %T") Job $jobid (job_pid=$job_pid, osjobid=$osjobid) exiting, status=$exit_status" >> "$ckpt_dir/ckpt.log"
echo "Exiting with exit status: $exit_status"
exit "$exit_status"
|