File: totalview.c

package info (click to toggle)
lam 7.1.2-1
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 54,668 kB
  • ctags: 17,034
  • sloc: ansic: 156,264; sh: 9,976; cpp: 7,699; makefile: 5,590; perl: 476; fortran: 260; asm: 83
file content (355 lines) | stat: -rw-r--r-- 9,217 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
/*
 * Copyright (c) 2001-2003 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 * $HEADER$
 *
 * $Id: totalview.c,v 6.9 2003/07/17 18:01:41 jsquyres Exp $
 *
 *	Function:	- functions required for totalview support
 *
 * You can also find a lot of useful documentation at
 * http://www-unix.mcs.anl.gov/mpi/mpi-debug/
 * http://www-unix.mcs.anl.gov/mpi/mpi-debug/mpich-attach.txt 
 *
 */

#include <lam_config.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#include <lam-totalview.h>
#include <freq.h>
#include <lam_network.h>
#include <app_mgmt.h>
#include <all_list.h>
#include <sfh.h>
#include <all_opt.h>
#include <all_list.h>
#include <app_schema.h>
#include <dl_inet.h>
#include <kio.h>
#include <etc_misc.h>

/* Structure definition for MPIR_PROCDESC.  It is placed here instead
   of in lam-totalview.h because the way to guarantee that TotalView
   has digested the struct definition of MPIR_PROCDESC before it is
   needed is to place the definition in the same file as
   MPIR_Breakpoint.

   One entry describes one process; build_tv_process_table() in this
   file allocates an array of these and fills every field. */

typedef struct {
  char *host_name;        /* heap copy of the node's host name (or dotted IP) */
  char *executable_name;  /* heap-allocated path of the program being run */
  int pid;  /* process id, copied from the gps_pid field of mpiworld[] */
} MPIR_PROCDESC;


/*
 * private functions 
 */
/* Fills MPIR_proctable (pid, host name, executable path per rank) */
static void build_tv_process_table(LIST *app); 
/* Writes the textual name of a node into the file-scope buffer
   "name"; writes the string "invalid node" when the id is unusable */
static void nodename(int node, int want_name);

/* 
 * external variables
 */
/* Defined outside this file; used here as the number of process
   table entries */
extern int world_n;
/* Defined outside this file; indexed per rank here to read the
   gps_pid and gps_node fields */
extern struct _gps *mpiworld;


/*
 * Symbols exported for TotalView to see (must be public symbols)
 */
/* Variables */
/* Compared against 1 in lam_tv_attach(); nothing in this file assigns
   it, so it is presumably set from outside by the debugger -- TODO
   confirm */
volatile int MPIR_being_debugged = 0;
/* Process table; allocated and filled by build_tv_process_table() */
volatile MPIR_PROCDESC *MPIR_proctable = NULL;
/* Number of entries in MPIR_proctable (assigned from world_n) */
volatile int MPIR_proctable_size = 0;
/* Set to PROCESSES_SPAWNED by lam_tv_attach() right before it calls
   MPIR_Breakpoint() */
volatile int MPIR_debug_state = 0;
/* The next two are only defined here and never read or written in
   this file; presumably part of the debugger interface -- TODO
   confirm their meaning against the interface documentation */
volatile int MPIR_i_am_starter = 0;
volatile int MPIR_partial_attach_ok = 0;

/* Functions */
/* Hook called by lam_tv_attach(); the definition is further down in
   this file */
void *MPIR_Breakpoint(void);

/*
 * private variables
 */
/* required to get node/link information from lamd: both are filled by
   ldogetlinks() in build_tv_process_table() and read by nodename() */
static int4 nlinks;
static struct dolink *links;
/* scratch buffer that nodename() writes its result into */
static char name[1024];
/* TV command line args if we have to exec TV: built by lam_tv_init(),
   used by lam_tv_check(), released by lam_tv_finalize() */
static int tv_argc = 0;
static char **tv_argv = NULL;


/*
 * lam_tv_init
 *
 * Function: Register the "tv" convenience option and prepare, ahead
 *           of time, the command line that would be used to launch
 *           TotalView:  totalview mpirun -a <user args minus "-tv">
 *
 * Accepts:  argc, argv - command line this program was started with
 *           opt        - option descriptor that receives the "tv" option
 */
void
lam_tv_init(int argc, char *argv[], OPT *opt)
{
  char *launcher[3];
  int k;
  int arg;

  /* Make the convenience option known to the option parser */
  ao_setopt(opt, "tv", 0, 0, 0);

  /* Fixed leading part of the debugger command line.  It is built
     unconditionally, even though it might never be used. */
  launcher[0] = "totalview";
  launcher[1] = "mpirun";
  launcher[2] = "-a";
  for (k = 0; k < 3; ++k) {
    sfh_argv_add(&tv_argc, &tv_argv, launcher[k]);
  }

  /* Forward every user argument (argv[0] excluded) except "-tv" */
  for (arg = 1; arg < argc; ++arg) {
    if (strcmp(argv[arg], "-tv") != 0) {
      sfh_argv_add(&tv_argc, &tv_argv, argv[arg]);
    }
  }
}


/*
 * Be good citizens and free the memory that we have allocated, since
 * it will no longer be used
 */
void
lam_tv_finalize(void)
{
  sfh_argv_free(tv_argv);
}


/*
 * lam_tv_check
 *
 * Function: If the convenience option "tv" was taken on the command
 *           line, replace this process with the debugger command
 *           line prepared by lam_tv_init().  Otherwise do nothing.
 *
 * Accepts:  opt - parsed option descriptor
 */
void
lam_tv_check(OPT *opt)
{
  if (ao_taken(opt, "tv")) {
    /* TotalView was requested: launch the prepared command line */
    execvp(tv_argv[0], tv_argv);

    /* execvp() only comes back when the launch did not work */
    show_help("mpirun", "totalview-exec-failed", NULL);
    exit(1);
  }
}


/*
 * lam_tv_attach
 *
 * Function: Publish the process table and, when a debugger has
 *           flagged that we are being debugged, hand control to it
 *           through MPIR_Breakpoint().
 *
 * Accepts:  app - application schema list of the launched processes
 */
void
lam_tv_attach(LIST *app)
{
  /* The table is built unconditionally so that debuggers which join
   * late (like Absoft's parallel debugger) can still find everyone
   */
  build_tv_process_table(app);

  if (MPIR_being_debugged == 1) {
    /* Tell the debugger that the processes exist, then stop at the
       hook so it can attach to the executable images on all nodes */
    MPIR_debug_state = PROCESSES_SPAWNED;
    MPIR_Breakpoint();
  }
}


/* Return a heap copy of str.  An allocation failure is reported and
 * the program exits, so the result is never NULL. */
static char *
tv_copy_string(const char *str)
{
  size_t nbytes = strlen(str) + 1;
  char *copy = (char *) malloc(nbytes);

  if (copy == NULL) {
    show_help(NULL, "system-call-fail", "malloc", NULL);
    exit(1);
  }
  memcpy(copy, str, nbytes);
  return copy;
}


/* Return a heap string holding "<dir>/<file>".  An allocation failure
 * is reported and the program exits, so the result is never NULL. */
static char *
tv_join_path(const char *dir, const char *file)
{
  size_t dir_len = strlen(dir);
  size_t file_len = strlen(file);
  /* +2: one byte for the '/' separator, one for the terminator */
  char *path = (char *) malloc(dir_len + file_len + 2);

  if (path == NULL) {
    show_help(NULL, "system-call-fail", "malloc", NULL);
    exit(1);
  }
  memcpy(path, dir, dir_len);
  path[dir_len] = '/';
  memcpy(path + dir_len + 1, file, file_len + 1);
  return path;
}


/* Build the process table which will be used by totalview to attach
 * to all ranks it wants to attach to.
 *
 * Accepts: app - application schema list; its entries are walked in
 *                order and entry j supplies the executable of rank j
 *
 * Side effects: fills the file-scope link table (links/nlinks), sets
 * MPIR_proctable_size to world_n and allocates MPIR_proctable.  Any
 * failure is reported through show_help() and ends the program.
 *
 * The table is zero-filled on allocation, so if the application list
 * turns out to be shorter than the table, the remaining
 * executable_name slots are NULL rather than uninitialized pointers.
 */
static void 
build_tv_process_table(LIST *app) 
{
  struct aschema *pp;  /* ptr process entry */
  char *argv0;         /* program name exactly as the user gave it */
  char *cwd;           /* directory that relative names are resolved in */
  char *found;         /* result of the path search on the remote node */
  int i;

  /* Get link table from lamd; nodename() reads it */
  if (ldogetlinks(&links, &nlinks)) {
    show_help(NULL, "lib-call-fail", "ldogetlinks", NULL);
    exit(1);
  }

  /* allocate the TV process table, zero-filled */
  MPIR_proctable_size = world_n;
  MPIR_proctable = (MPIR_PROCDESC *) calloc((size_t) MPIR_proctable_size,
                                            sizeof(MPIR_PROCDESC));
  if (MPIR_proctable == NULL) {
    show_help(NULL, "system-call-fail", "malloc", NULL);
    exit(1);
  }

  /* fill pid and hostname for each rank */
  for (i = 0; i < MPIR_proctable_size; ++i) {
    MPIR_proctable[i].pid = mpiworld[i].gps_pid;

    /* nodename() leaves its answer in the file-scope buffer "name" */
    nodename(mpiworld[i].gps_node, 1);
    if (strcmp(name, "invalid node") == 0) {
      show_help("mpirun", "totalview-invalid-hostname", NULL);
      exit(1);
    }
    MPIR_proctable[i].host_name = tv_copy_string(name);
  }

  /* fill executable name into process table */
  /* MRC: This is a bit tricky ..  not sure if we are covering all
     possible cases */

  for (pp = al_top(app), i = 0; ( (i < MPIR_proctable_size) && pp );
       ++i, pp = al_next(app, pp)) {

    argv0 = pp->asc_args->apa_argv[0];

    /* If user specified absolute path names then use them directly */
    if (argv0[0] == '/') {
      MPIR_proctable[i].executable_name = tv_copy_string(argv0);
      continue;
    }

    /* User specified just the executable name (no '/' anywhere): ask
       for it to be located.  The leading character cannot be '/'
       here, so searching the whole string is equivalent to searching
       from the second character, and it is also safe for an empty
       name. */
    if (strchr(argv0, '/') == NULL) {
      found = lam_rfpathfind(argv0, pp->asc_env->ape_wrkdir, pp->asc_node);
      if (found != NULL) {
        /* executable was found; use the path that was returned.
           NOTE(review): whether the string returned by
           lam_rfpathfind() must be released by the caller is not
           visible from this file -- TODO confirm */
        MPIR_proctable[i].executable_name = tv_copy_string(found);
        continue;
      }
      /* executable was not found in the path, so we assume it is in
         the current directory on the remote node .. trying to be good
         to users; fall through to the relative-path handling */
    }

    /* Relative path name, or a bare name that was not found: prefix
       it with the working directory.  The directory is only looked
       up on this path because it is the only one that uses it.
       NOTE(review): ownership of the string returned by getworkdir()
       is not visible from this file -- TODO confirm */
    cwd = pp->asc_env->ape_wrkdir;
    if (cwd == NULL)
      cwd = getworkdir();
    if (cwd == NULL) {
      show_help(NULL, "lib-call-fail", "getworkdir", NULL);
      exit(1);
    }
    MPIR_proctable[i].executable_name = tv_join_path(cwd, argv0);
  } /* for (....) */
}


/* 
 * MPIR_Breakpoint
 * 
 * Function: Empty hook whose only purpose is to give the debugger a
 *           known function to set a breakpoint at
 *
 * Returns:  always a null pointer
 * 
 */
void *
MPIR_Breakpoint(void)
{
  void *nothing = (void *) 0;

  /* Nothing is really needed in here; the check below mostly keeps
     some compilers from complaining, and reports a missing process
     table on the way */
  if (NULL == MPIR_proctable) {
    show_help("mpirun", "totalview-invalid-process_table", NULL);
  }

  return nothing;
}


/* 
 * nodename
 * 
 * Function: Extracts node name/IP from node ID and stores it, always
 *           NUL-terminated, in the file-scope buffer "name"
 *
 * Accepts:  node      - node ID, used as an index into the link table
 *           want_name - nonzero: prefer the host name from the
 *                       reverse lookup; zero: always use the dotted
 *                       IP address
 *
 * Result:   "name" holds the host name, the dotted IP address (also
 *           used when the lookup fails), or the literal string
 *           "invalid node" when the ID is out of range or the link
 *           table slot is marked NOTLINKID
 * 
 */
static void
nodename(int node, int want_name)
{
  struct hostent *hent;

  if (node < 0 || node >= nlinks ||
      links[node].dol_link == NOTLINKID) {
    strcpy(name, "invalid node");
    return;
  }

  hent = lam_gethostbyaddr((char *)&links[node].dol_addr.sin_addr,
                           sizeof(struct in_addr), AF_INET);
  if (hent && want_name) {
    lam_strncpy(name, hent->h_name, sizeof(name));
    /* Callers run strlen()/strcmp() on "name".  Whether lam_strncpy()
       terminates the destination when the source fills the whole
       buffer is not visible from this file, so terminate explicitly;
       this changes nothing for names that fit. */
    name[sizeof(name) - 1] = '\0';
  } else {
    /* dotted-quad text is far shorter than the buffer */
    sprintf(name, "%s", inet_ntoa(links[node].dol_addr.sin_addr));
  }
}