File: tstmachines.in

package info (click to toggle)
mpich 1.1.0-3
  • links: PTS
  • area: main
  • in suites: hamm
  • size: 22,116 kB
  • ctags: 27,349
  • sloc: ansic: 193,435; sh: 11,172; fortran: 6,545; makefile: 5,801; cpp: 5,020; tcl: 3,548; asm: 3,536; csh: 1,079; java: 614; perl: 183; awk: 168; sed: 70; f90: 62
file content (261 lines) | stat: -rwxr-xr-x 7,499 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#! /bin/sh
#
# This script tests each of the machines in the machines/machine.$1 list
# to make sure that it is accessible and can run programs.  This is only
# a partial test
#
# In order for this to run in the background, we're careful to use -n 
# in the remote shell commands
#
# Defaults.  The command line may change verbose and supply arch.
rsh="#RSH_COMMAND#"
verbose=0
# Could use #DEFAULT_ARCH#
# and #DEFAULT_DEVICE# to see if we should check....
#
# Walk over the command-line words:
#   -echo   switch on shell tracing (set -x)
#   -v      announce each test as it is started
#   -help   print the usage text and leave with status 1
# Any other word is taken as the architecture name; if several are given
# the last one is kept.
for arg
do
    if [ "x$arg" = "x-echo" ] ; then
        set -x
    elif [ "x$arg" = "x-v" ] ; then
        verbose=1
    elif [ "x$arg" = "x-help" ] ; then
        cat <<.
Usage: 
     tstmachines [ -echo ] [ -v ] architecture
Tests that you can run remote shells with $rsh on the systems in 
your machines file and that executables are all cross-mounted.  To run this
test, you must be able to create files in the current directory.

If there are problems with some of the machines, this test may take several
minutes per problem machine.   If there are no problems, there will be no
output.  The option -v may be used to see what tests are being run.
.
        exit 1
    else
        arch=$arg
    fi
done
#
# Locate the machines file for the requested architecture and turn it into
# a blank-separated list of entries in $list.
MPIR_HOME="#MPIR_HOME#"
# No architecture was given on the command line: use whatever the tarch
# program under $MPIR_HOME/bin prints.
if [ -z "$arch" ] ; then
    arch=`"$MPIR_HOME/bin/tarch"`
fi
#
machineFile="${MPIR_HOME}/util/machines/machines.${arch}"
# The path is quoted so that an installation directory whose name contains
# blanks or wildcard characters is still tested as one file name.
if [ ! -f "$machineFile" ] ; then
    echo "Cannot read list of nodes $machineFile"
    exit 1
fi
# Delete every line that contains a '#', then join the remaining lines
# with single blanks (tr -s squeezes runs of newlines into one blank).
list=`sed -e '/\#/d' "$machineFile" | tr -s '\012' ' '`
# Create a small scratch file in the current directory.  The tests further
# down refer to it as $PWD_TRIAL/mpichfoo when they run /bin/ls on each host.
/bin/rm -f mpichfoo
echo "A test" > mpichfoo
# Use same mechanism as in mpirun to get the value of pwd...
# NOTE(review): the string tested on the next line is a template
# placeholder; presumably it is replaced by a filter command (or by an
# empty string) when this .in file is processed -- confirm.
if [ -n "#AUTOMOUNTFIX#" ] ; then
    # PWDtest is the output of pwd passed through that filter.
    PWDtest=`pwd | #AUTOMOUNTFIX#`
    # If the filtered name is not a directory here, fall back to plain pwd.
    if [ ! -d $PWDtest ] ; then
        PWDtest=`pwd`
    fi
    if [ -n "$PWD" ] ; then
        # PWD already has a value.  Write a marker file through the $PWD
        # name and look for it through the $PWDtest name.
        /bin/rm -f $PWDtest/.mpirtmp $PWD/.mpirtmp
        echo "test" > $PWD/.mpirtmp
        # Marker missing or empty under $PWDtest: remove it and switch PWD
        # over to $PWDtest.
        if [ ! -s $PWDtest/.mpirtmp ] ; then
	    /bin/rm -f $PWD/.mpirtmp
            PWD=$PWDtest
        fi
        # Clean up the marker under both names.
        /bin/rm -f $PWDtest/.mpirtmp $PWD/.mpirtmp
    else 
        # PWD is empty, so use the filtered name.
        PWD=$PWDtest
    fi
else
    # The tested string is empty: take the directory name straight from pwd.
    PWD=`pwd`
fi
#
# Pick the directory name used to build the paths handed to the remote
# hosts: PWD when it is non-empty, otherwise PWDtest.
if [ -n "$PWD" ] ; then
    PWD_TRIAL=$PWD
else
    PWD_TRIAL=$PWDtest
fi
# If that name is not a directory on this host, warn and use pwd instead.
if [ ! -d $PWD_TRIAL ] ; then 
    echo "Warning: your default path uses the automounter; this may"
    echo "cause some problems if you use other NFS-connected systems."
    PWD_TRIAL=`pwd`
fi
#
# First try running a simple program
# (test for access or stty/who am i problems)
#
# Every entry in $list is asked to run "true" through the remote shell
# command in $rsh.  Whatever the command prints (stdout and stderr are both
# captured) is treated as a failure, because a non-empty reply is reported
# as unexpected.  Hosts that answer with nothing are collected in
# $livelist; failures are counted in $errsimple and $errcnt.
myprog=$PWD_TRIAL/mpichfoo
errcnt=0
errsimple=0
livelist=""
printedheader=""

# tst_tty_hint COMMAND
# Print the advice shown when the remote reply mentions COMMAND, a command
# that only works with a terminal attached.  Writes five lines to stdout.
tst_tty_hint () {
    echo "You may have a command like"
    echo "    $1"
    echo "in your .login or .cshrc file.  This command can only be"
    echo "used when a process is attached to a terminal."
    echo "See the Users Manual for ways to fix this."
}

for machine in $list ; do
    # Strip cluster size from machine name (host:N becomes host)
    ntest=`expr "$machine" : '.*:\([0-9]*\)'`
    if [ -n "$ntest" ] ; then
        machine=`expr "$machine" : '\(.*\):.*'`
    fi
    if [ "$verbose" = 1 ] ; then
        echo "Trying true on $machine ..."
    fi
    # -n is passed so that the script can be run in the background (see the
    # note at the top of the file).  $rsh is left unquoted on purpose: it
    # is expanded as a command word.
    output=`$rsh "$machine" -n true 2>&1`
    if [ -n "$output" ] ; then
        if [ -z "$printedheader" ] ; then
            echo "Errors while trying to run true"
            printedheader=1
        fi
        echo "Unexpected response from $machine:"
        echo "--> $output"
        # Check for stty or who am i problems.  The reply is quoted so that
        # wildcard characters in the remote text are not expanded against
        # the local directory before grep looks at them.
        if echo "$output" | grep -i 'am i' > /dev/null ; then
            tst_tty_hint "who am i"
        fi
        if echo "$output" | grep -i stty > /dev/null ; then
            tst_tty_hint "stty ...."
        fi
        errcnt=`expr $errcnt + 1`
        errsimple=`expr $errsimple + 1`
    else
        livelist="$livelist $machine"
    fi
done
# Summarise once, after all hosts have been tried.
if [ $errsimple -gt 0 ] ; then
cat <<EOF
    The test of $rsh <machine> true  failed on some machines.
    This may be due to problems in your .login or .cshrc files; 
    some common problems are described when detected.  Look at the 
    output above to see what the problem is.

    If the problem is something like 'permission denied', then the 
    remote shell command $rsh does not allow you to run programs.
    See the documentation about remote shell and rhosts.

EOF
fi
# Next try running ls
# (Test for consistent filesystem)
#
# The scratch file is listed locally with /bin/ls, and every entry in
# $list is then asked to list the same path through $rsh.  A host passes
# only when its reply (stdout and stderr together) is identical to the
# local listing.
#
# NOTE(review): errcnt starts again from zero here and the loop walks all
# of $list rather than the hosts that passed the previous test, so a host
# that already failed is presumably counted here instead -- confirm that
# restarting the count is intended.
myprog=$PWD_TRIAL/mpichfoo
errcnt=0
errls=0
livelist=""
printedheader=""
#
# Reference listing.  /bin/ls is named explicitly to avoid any alias
# problems.
expected=`/bin/ls $myprog`
for machine in $list ; do
    # Strip cluster size from machine name
    ntest=`expr $machine : '.*:\([0-9]*\)'`
    test -n "$ntest" && machine=`expr $machine : '\(.*\):.*'`
    test $verbose = 1 && echo "Trying ls on $machine ..."
    output=`$rsh $machine -n /bin/ls $myprog 2>&1`
    # Matching reply: remember the host and go on to the next one.
    if [ "$output" = "$expected" ] ; then
        livelist="$livelist $machine"
        continue
    fi
    # Anything else is a failure; the heading is printed only once.
    test -n "$printedheader" || {
        echo "Errors while trying to run ls $myprog"
        printedheader=1
    }
    echo "Unexpected response from $machine:"
    echo "--> $output"
    errcnt=`expr $errcnt + 1`
    errls=`expr $errls + 1`
done
# The scratch file is not needed after this test.
/bin/rm -f mpichfoo
if test $errls -gt 0 ; then
cat <<EOF
    The ls test failed on some machines.
    This usually means that you do not have a common filesystem on 
    all of the machines in your machines list; MPICH requires this
    for mpirun (it is possible to handle this in a procgroup file; see
    the documentation for more details).

    Other possible problems include:
        The remote shell command $rsh does not allow you to run ls.
           See the documentation about remote shell and rhosts.
        You have a common file system, but with inconsistent names.
           See the documentation on the automounter fix.

EOF
fi
#
# Now, try running a simple USER program
#
# A do-nothing C program is built in the current directory and then started
# through $rsh on every host that passed the previous test ($livelist).
# The program prints nothing, so any reply from a host is reported as a
# failure and counted in $erruser and $errcnt.
#
# Remove leftovers first; in particular a stale tstfoo executable from an
# earlier run would make the -x check below succeed even if the build
# failed.
/bin/rm -f tstfoo tstfoo.c tstfoo.o
# main is declared with an explicit int return type: implicit int was
# removed from the language in C99 and is rejected by current compilers.
# The empty parameter list keeps the line acceptable to pre-ANSI compilers.
cat >tstfoo.c <<.
int main(){return 0;}
.
#CC# -c tstfoo.c
#CLINKER# -o tstfoo tstfoo.o
if [ ! -x tstfoo ] ; then
    echo "Could not build a sample program using #CC# and #CLINKER#!"
    # Do not leave the source or a partial object file behind.
    /bin/rm -f tstfoo.c tstfoo.o
    exit 1
fi
myprog=$PWD_TRIAL/tstfoo
# Only the hosts that survived so far are tried; livelist is rebuilt from
# the hosts that also pass this test.
list="$livelist"
livelist=""
printedheader=""
erruser=0
for machine in $list ; do
    if [ "$verbose" = 1 ] ; then
        echo "Trying user program on $machine ..."
    fi
    # $rsh stays unquoted: it is expanded as a command word.
    output=`$rsh "$machine" -n "$myprog" 2>&1`
    if [ -n "$output" ] ; then
        if [ -z "$printedheader" ] ; then
            echo "Errors while trying to run a simple C program"
            printedheader=1
        fi
        echo "Unexpected response from $machine:"
        echo "--> $output"
        errcnt=`expr $errcnt + 1`
        erruser=`expr $erruser + 1`
    else
        livelist="$livelist $machine"
    fi
done
/bin/rm -f tstfoo tstfoo.c tstfoo.o
if [ $erruser -gt 0 ] ; then
    cat <<EOF
    The simple program test failed.

    This test tries to run a simple program on the machines in your machines
    list with the command
        $rsh machinename -n program 

    This can fail if you do not have a common filesystem (this should have
    been detected above) or if the remote shell command $rsh does not allow 
    you to run the program on the indicated remote machines.  
        See the documentation about remote shell and rhosts for possible 
    fixes.
    
EOF
fi
#
#
# Final report.  Leave quietly with status 0 unless errors were counted;
# otherwise print the error count, list the hosts that are still in
# $livelist (or say that none are left) and leave with status 1.
[ $errcnt -gt 0 ] || exit 0
echo " "
echo "$errcnt errors were encountered while testing the machines list for $arch"
if [ -z "$livelist" ] ; then
    echo "No machines seem to be available!"
else
    echo "Only these machines seem to be available"
    for machine in $livelist ; do
        echo "    $machine"
    done
fi
exit 1