File: ras_gridengine_module.c

package info (click to toggle)
openmpi 1.6.5-9.1%2Bdeb8u1
  • links: PTS, VCS
  • area: main
  • in suites: jessie
  • size: 91,628 kB
  • ctags: 44,305
  • sloc: ansic: 408,966; cpp: 44,454; sh: 27,828; makefile: 10,486; asm: 3,882; python: 1,239; lex: 805; perl: 549; csh: 253; fortran: 232; f90: 126; tcl: 12
file content (200 lines) | stat: -rw-r--r-- 6,515 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * Copyright (c) 2006-2009 Sun Microsystems, Inc.  All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */
/**
 * @file:
 * Resource Allocation for Grid Engine
 */
#include "orte_config.h"
#include "orte/constants.h"

#include <errno.h>
#include <unistd.h>
#include <string.h>

#include "opal/util/output.h"
#include "orte/util/show_help.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/runtime/orte_globals.h"
#include "orte/mca/ras/base/ras_private.h"
#include "orte/mca/ras/gridengine/ras_gridengine.h"

/*
 * Local functions
 */
static int orte_ras_gridengine_allocate(opal_list_t *nodes);
static int orte_ras_gridengine_finalize(void);
#if 0
static int get_slot_count(char* node_name, int* slot_cnt);
#endif

/*
 * Global variable
 */
orte_ras_base_module_t orte_ras_gridengine_module = {
    orte_ras_gridengine_allocate,
    orte_ras_gridengine_finalize
};

/**
 *  Discover available (pre-allocated) nodes. Allocate the
 *  requested number of nodes/process slots to the job.
 *  
 */
static int orte_ras_gridengine_allocate(opal_list_t *nodelist)
{
    char *pe_hostfile = getenv("PE_HOSTFILE");
    char *job_id = getenv("JOB_ID");
    char buf[1024], *tok, *num, *queue, *arch, *ptr;
    int rc;
    FILE *fp;
    orte_node_t *node;
    opal_list_item_t *item;
    bool found;

    /* show the Grid Engine's JOB_ID */
    if (mca_ras_gridengine_component.show_jobid ||
        mca_ras_gridengine_component.verbose != -1) {
        opal_output(0, "ras:gridengine: JOB_ID: %s", job_id);
    }
   
    /* check the PE_HOSTFILE before continuing on */
    if (!(fp = fopen(pe_hostfile, "r"))) {
        orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
            true, pe_hostfile, strerror(errno));
        rc = ORTE_ERROR;
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    /* parse the pe_hostfile for hostname, slots, etc, then compare the
     * current node with a list of hosts in the nodelist, if the current
     * node is not found in nodelist, add it in */
    opal_output(mca_ras_gridengine_component.verbose, 
		"ras:gridengine: PE_HOSTFILE: %s", pe_hostfile);

    while (fgets(buf, sizeof(buf), fp)) {
        ptr = strtok_r(buf, " \n", &tok);
        num = strtok_r(NULL, " \n", &tok);
        queue = strtok_r(NULL, " \n", &tok);
        arch = strtok_r(NULL, " \n", &tok);

        /* see if we already have this node */
        found = false;
        for (item = opal_list_get_first(nodelist);
             item != opal_list_get_end(nodelist);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            if (0 == strcmp(ptr, node->name)) {
                /* just add the slots */
                node->slots += (int)strtol(num, (char **)NULL, 10);
                found = true;
                opal_output(mca_ras_gridengine_component.verbose,
                            "ras:gridengine: %s: PE_HOSTFILE increased to slots=%d",
                            node->name, node->slots);
                break;
            }
        }
        if (!found) {
            /* create a new node entry */
            node = OBJ_NEW(orte_node_t);
            if (NULL == node) {
                fclose(fp);
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            node->name = strdup(ptr);
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = (int)strtol(num, (char **)NULL, 10);
            opal_output(mca_ras_gridengine_component.verbose,
                        "ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
                        node->name, node->slots);
            opal_list_append(nodelist, &node->super);
        }
    } /* finished reading the $PE_HOSTFILE */

cleanup:
    fclose(fp);

    /* in gridengine, if we didn't find anything, then something
     * is wrong. The user may not have indicated this was a parallel
     * job, or may not have an allocation at all. In any case, this
     * is considered an unrecoverable error and we need to report it
     */
    if (opal_list_is_empty(nodelist)) {
        orte_show_help("help-ras-gridengine.txt", "no-nodes-found", true);
        return ORTE_ERR_NOT_FOUND;
    }

    return ORTE_SUCCESS;
    
}

#if 0
/**
 * This function is not used currently, but may be used eventually.
 * Parse the PE_HOSTFILE to determine the number of process
 * slots/processors available on the node.
 */
static int get_slot_count(char* node_name, int* slot_cnt)
{   
    char buf[1024], *tok, *name, *num, *queue, *arch;
    char *pe_hostfile = getenv("PE_HOSTFILE");
    FILE *fp;
    
    /* check the PE_HOSTFILE before continuing on */
    if (!(fp = fopen(pe_hostfile, "r"))) {
        orte_show_help("help-ras-gridengine.txt", "cannot-read-pe-hostfile",
            true, pe_hostfile, strerror(errno));
        ORTE_ERROR_LOG(ORTE_ERROR);
        return(ORTE_ERROR);
    }
        
    while (fgets(buf, sizeof(buf), fp)) {
        name = strtok_r(buf, " \n", &tok);
        num = strtok_r(NULL, " \n", &tok);
        queue = strtok_r(NULL, " \n", &tok);
        arch = strtok_r(NULL, " \n", &tok);
        
        if(strcmp(node_name,name) == 0) {
            *slot_cnt = (int) strtol(num, (char **)NULL, 10);
            opal_output(mca_ras_gridengine_component.verbose,
                "ras:gridengine: %s: PE_HOSTFILE shows slots=%d",
                node_name, *slot_cnt);
            fclose(fp);
            return ORTE_SUCCESS;
        }
    }

    /* when there is no match */
    fclose(fp);
    return ORTE_ERROR;
}
#endif

/**
 * finalize
 */
static int orte_ras_gridengine_finalize(void)
{
    /* Nothing to do */
    opal_output(mca_ras_gridengine_component.verbose,
        "ras:gridengine:finalize: success (nothing to do)");
    return ORTE_SUCCESS;
}