File: btl_mx_proc.c

package info (click to toggle)
openmpi 1.1-2.3
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 39,124 kB
  • ctags: 22,534
  • sloc: ansic: 216,698; sh: 22,541; makefile: 6,921; cpp: 5,562; asm: 3,160; lex: 375; objc: 365; perl: 347; csh: 89; tcl: 12; f90: 5
file content (221 lines) | stat: -rw-r--r-- 7,714 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/*
 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
 *                         University Research and Technology
 *                         Corporation.  All rights reserved.
 * Copyright (c) 2004-2005 The University of Tennessee and The University
 *                         of Tennessee Research Foundation.  All rights
 *                         reserved.
 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, 
 *                         University of Stuttgart.  All rights reserved.
 * Copyright (c) 2004-2005 The Regents of the University of California.
 *                         All rights reserved.
 * $COPYRIGHT$
 * 
 * Additional copyrights may follow
 * 
 * $HEADER$
 */

#include "ompi_config.h"

#include "opal/class/opal_hash_table.h"
#include "ompi/mca/pml/base/pml_base_module_exchange.h"

#include "btl_mx.h"
#include "btl_mx_proc.h"

/* Constructor and destructor for mca_btl_mx_proc_t; both are defined
 * further down in this file and are only referenced through the class
 * descriptor instantiated below. */
static void mca_btl_mx_proc_construct(mca_btl_mx_proc_t* proc);
static void mca_btl_mx_proc_destruct(mca_btl_mx_proc_t* proc);

/* Instantiate the class descriptor for mca_btl_mx_proc_t, naming
 * opal_list_item_t as its parent class and the two routines above as
 * its constructor and destructor. */
OBJ_CLASS_INSTANCE(mca_btl_mx_proc_t, 
        opal_list_item_t, mca_btl_mx_proc_construct, 
        mca_btl_mx_proc_destruct);

/*
 * Initialize an MX proc instance: clear every cached field, set up
 * the per-proc lock and register the instance on the component-wide
 * list of MX procs (under the component lock).
 */
void mca_btl_mx_proc_construct(mca_btl_mx_proc_t* proc)
{
    /* not yet bound to an ompi_proc_t */
    proc->proc_ompi           = NULL;

    /* no peer addresses known yet */
    proc->mx_peers            = NULL;
    proc->mx_peers_count      = 0;
    proc->proc_addr_index     = 0;

    /* no endpoints attached yet */
    proc->proc_endpoints      = NULL;
    proc->proc_endpoint_count = 0;

    OBJ_CONSTRUCT(&proc->proc_lock, opal_mutex_t);

    /* make the new instance visible on the component's proc list */
    OPAL_THREAD_LOCK(&mca_btl_mx_component.mx_lock);
    opal_list_append(&mca_btl_mx_component.mx_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_mx_component.mx_lock);
}

/*
 * Cleanup MX proc instance.
 *
 * Unlinks the instance from the component-wide proc list (under the
 * component lock), frees the endpoint pointer array and the peer
 * address table, and tears down the per-proc lock that the
 * constructor set up with OBJ_CONSTRUCT.
 *
 * Only the array of endpoint pointers is freed here; the endpoints it
 * points to are not released by this routine.
 */

void mca_btl_mx_proc_destruct(mca_btl_mx_proc_t* proc)
{
    /* remove from list of all proc instances */
    OPAL_THREAD_LOCK(&mca_btl_mx_component.mx_lock);
    opal_list_remove_item(&mca_btl_mx_component.mx_procs, &proc->super);
    OPAL_THREAD_UNLOCK(&mca_btl_mx_component.mx_lock);

    /* release resources; free(NULL) is a no-op so no guards are needed */
    free(proc->proc_endpoints);
    proc->proc_endpoints = NULL;

    free(proc->mx_peers);
    proc->mx_peers = NULL;

    /* the lock was built with OBJ_CONSTRUCT in the constructor and must
     * be destructed symmetrically, otherwise whatever the mutex holds
     * is never given back */
    OBJ_DESTRUCT(&proc->proc_lock);
}


/*
 * Look for an existing MX proc instance associated with the given
 * ompi_proc_t. The component-wide proc list is walked while holding
 * the component lock.
 *
 * Returns the matching instance, or NULL when none has been created
 * for this ompi_proc_t.
 */
static mca_btl_mx_proc_t* mca_btl_mx_proc_lookup_ompi(ompi_proc_t* ompi_proc)
{
    mca_btl_mx_proc_t* found = NULL;
    mca_btl_mx_proc_t* cursor;

    OPAL_THREAD_LOCK(&mca_btl_mx_component.mx_lock);

    cursor = (mca_btl_mx_proc_t*)opal_list_get_first(&mca_btl_mx_component.mx_procs);
    while( cursor != (mca_btl_mx_proc_t*)opal_list_get_end(&mca_btl_mx_component.mx_procs) ) {
        if( ompi_proc == cursor->proc_ompi ) {
            found = cursor;
            break;
        }
        cursor = (mca_btl_mx_proc_t*)opal_list_get_next(cursor);
    }

    /* single unlock point for both the hit and the miss case */
    OPAL_THREAD_UNLOCK(&mca_btl_mx_component.mx_lock);

    return found;
}

/*
 * Create an MX process structure. There is a one-to-one correspondence
 * between an ompi_proc_t and a mca_btl_mx_proc_t instance. We cache
 * additional data (specifically the list of mca_btl_mx_endpoint_t
 * instances, and published addresses) associated with a given
 * destination on this data structure.
 *
 * If an instance already exists for ompi_proc it is returned as is;
 * otherwise a new one is allocated and bound to ompi_proc.
 *
 * Returns the (existing or new) instance, or NULL if the allocation
 * of a new instance failed.
 *
 * NOTE(review): the lookup and the creation are two separate steps,
 * each taking the component lock on its own; two threads racing on
 * the same ompi_proc could both create an instance - confirm callers
 * serialize.
 */

mca_btl_mx_proc_t* mca_btl_mx_proc_create(ompi_proc_t* ompi_proc)
{
    /* Check if we have already created a MX proc
     * structure for this ompi process */
    mca_btl_mx_proc_t* module_proc = mca_btl_mx_proc_lookup_ompi(ompi_proc);
    if( NULL != module_proc ) {
        return module_proc;
    }

    /* First time: build a new MX proc out of the ompi_proc. The
     * constructor already links it on the component's proc list. */
    module_proc = OBJ_NEW(mca_btl_mx_proc_t);
    if( NULL == module_proc ) {
        /* allocation failed: report it instead of dereferencing NULL */
        return NULL;
    }

    module_proc->proc_ompi = ompi_proc;

    return module_proc;
}


/*
 * Note that this routine must be called with the lock on the process
 * already held.  Fetch the addresses the peer published, cache them
 * on the proc, make sure the endpoint array exists and bind the
 * endpoint to the proc.
 *
 * module_proc      MX proc to update.
 * module_endpoint  endpoint whose endpoint_proc is set to module_proc
 *                  on success.
 *
 * Returns OMPI_SUCCESS on success. Failures:
 *  - OMPI_ERR_OUT_OF_RESOURCE if the address query fails or the
 *    endpoint array cannot be allocated;
 *  - OMPI_ERROR if the published data is not a whole number of
 *    mca_btl_mx_addr_t entries, or is empty.
 *
 * NOTE(review): on the first two failures (query failed, malformed
 * size) one reference on module_proc is dropped via OBJ_RELEASE; on
 * the other failures it is not. Callers need to account for that
 * asymmetry.
 *
 * The address buffer handed back by the query is owned by the proc
 * (the destructor frees mx_peers), so every path that does not store
 * it must free it, and a table stored by a previous insert on the
 * same proc must be freed before being replaced.
 */
int mca_btl_mx_proc_insert( mca_btl_mx_proc_t* module_proc, 
                            mca_btl_mx_endpoint_t* module_endpoint )
{
    mca_btl_mx_addr_t  *mx_peers = NULL;
    int rc;
    size_t size = 0;

    /* query for the peer address info */
    rc = mca_pml_base_modex_recv( &mca_btl_mx_component.super.btl_version,
                                  module_proc->proc_ompi, (void*)&mx_peers, &size );
    if( OMPI_SUCCESS != rc ) {
        opal_output( 0, "mca_pml_base_modex_recv failed for peer [%d,%d,%d]",
                     ORTE_NAME_ARGS(&module_proc->proc_ompi->proc_name) );
        OBJ_RELEASE(module_proc);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if( (size % sizeof(mca_btl_mx_addr_t)) != 0 ) {
        opal_output( 0, "invalid mx address for peer [%d,%d,%d]",
                     ORTE_NAME_ARGS(&module_proc->proc_ompi->proc_name) );
        /* the malformed buffer is never stored: give it back */
        free(mx_peers);
        OBJ_RELEASE(module_proc);
        return OMPI_ERROR;
    }
    module_proc->mx_peers_count = size / sizeof(mca_btl_mx_addr_t);
    if( 0 == module_proc->mx_peers_count ) {  /* no available connection */
        free(mx_peers);
        return OMPI_ERROR;
    }

    module_proc->status = MCA_BTL_MX_NOT_CONNECTED;

    /* A proc can be inserted once per endpoint; drop the table cached by
     * an earlier call before adopting the fresh copy.
     * NOTE(review): within this file the entries are only read by value,
     * so nothing should still point into the old table - confirm no other
     * file keeps a pointer to it. */
    if( module_proc->mx_peers != mx_peers ) {
        free(module_proc->mx_peers);
    }
    module_proc->mx_peers = mx_peers;

    if( NULL == module_proc->proc_endpoints ) {
        /* one slot per published peer address */
        module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc(module_proc->mx_peers_count * sizeof(mca_btl_base_endpoint_t*));
        if( NULL == module_proc->proc_endpoints ) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
    }

    /* bind the endpoint to its proc; the endpoint is only stored in the
     * endpoint array once it connects */
    module_endpoint->endpoint_proc = module_proc;

    return OMPI_SUCCESS;
}

/*
 * Connect an endpoint to its peer process.
 *
 * Walks the peer's published addresses, starting at the index cached
 * in proc_addr_index, and calls mx_connect() on each one until a
 * connection succeeds. A timed-out attempt is retried up to
 * mx_connection_retries times; the retry budget is per address, so a
 * slow first address does not use up the retries of the following
 * ones. Every address that cannot be connected is logged.
 *
 * On success the endpoint records the peer's NIC id, endpoint id and
 * MX address, the proc remembers the index that worked and is marked
 * MCA_BTL_MX_CONNECTED, and the endpoint is appended to the proc's
 * endpoint array. That array is initially sized to the number of peer
 * addresses; it is grown here when more endpoints than that connect
 * to the same proc, instead of writing past its end.
 *
 * module_endpoint  endpoint to connect; its endpoint_proc and
 *                  endpoint_btl must already be set.
 *
 * Returns OMPI_SUCCESS on success. Returns OMPI_ERROR when no address
 * could be connected (the proc is then marked
 * MCA_BTL_MX_NOT_REACHEABLE) or when the endpoint array could not be
 * grown (proc state is left untouched in that case).
 */
int mca_btl_mx_proc_connect( mca_btl_mx_endpoint_t* module_endpoint )
{
    int num_retry, i;
    mx_return_t mx_status;
    mx_endpoint_addr_t mx_remote_addr;
    mca_btl_mx_proc_t* module_proc = module_endpoint->endpoint_proc;

    for( i = module_proc->proc_addr_index; i < module_proc->mx_peers_count; i++ ) {
        char peer_name[MX_MAX_HOSTNAME_LEN];

        /* fresh retry budget for every address */
        num_retry = 0;
        do {
            mx_status = mx_connect( module_endpoint->endpoint_btl->mx_endpoint,
                                    module_proc->mx_peers[i].nic_id, module_proc->mx_peers[i].endpoint_id,
                                    mca_btl_mx_component.mx_filter, mca_btl_mx_component.mx_timeout, &mx_remote_addr );
        } while( (MX_TIMEOUT == mx_status) &&
                 (num_retry++ < mca_btl_mx_component.mx_connection_retries) );

        if( MX_SUCCESS == mx_status ) {
            break;
        }

        /* report the failure and move on to the next address */
        if( MX_SUCCESS != mx_nic_id_to_hostname( module_proc->mx_peers[i].nic_id, peer_name ) ) {
            /* bounded write, and print the whole id even where long is 32 bits */
            snprintf( peer_name, sizeof(peer_name), "unknown %llx nic_id",
                      (unsigned long long)module_proc->mx_peers[i].nic_id );
        }
        opal_output( 0, "mx_connect fail for %s(%dth remote address) with key %x (error %s)\n", 
                     peer_name, i, mca_btl_mx_component.mx_filter, mx_strerror(mx_status) );
    }

    /* the loop only ends with i below the count when it hit the break,
     * i.e. when a connection was established */
    if( !(i < module_proc->mx_peers_count) ) {  /* no available connection */
        module_proc->status = MCA_BTL_MX_NOT_REACHEABLE;
        return OMPI_ERROR;
    }

    /* make sure there is a free slot for this endpoint before
     * committing any state */
    {
        size_t used  = (size_t)module_proc->proc_endpoint_count;
        size_t peers = (size_t)module_proc->mx_peers_count;

        if( (NULL == module_proc->proc_endpoints) || (used >= peers) ) {
            size_t wanted = (used + 1 > peers) ? (used + 1) : peers;
            void*  grown  = realloc( module_proc->proc_endpoints,
                                     wanted * sizeof(*module_proc->proc_endpoints) );
            if( NULL == grown ) {
                /* the original array is still valid and still owned by the proc */
                return OMPI_ERROR;
            }
            module_proc->proc_endpoints = grown;
        }
    }

    module_endpoint->mx_peer.nic_id      = module_proc->mx_peers[i].nic_id;
    module_endpoint->mx_peer.endpoint_id = module_proc->mx_peers[i].endpoint_id;
    module_endpoint->mx_peer_addr        = mx_remote_addr;
    module_proc->proc_addr_index         = i;
    module_proc->status                  = MCA_BTL_MX_CONNECTED;

    module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint;
    return OMPI_SUCCESS;
}