File: gtranksperf.c

Package: mpich 4.0.2-3
/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include "mpitest.h"

#include <math.h>       /* for fabs(3) */

/* Measure and compare the relative performance of MPI_Group_translate_ranks
 * with small and large group2 sizes but a constant number of ranks.  This
 * serves as a performance sanity check for the Scalasca use case where we
 * translate to MPI_COMM_WORLD ranks.  The performance should only depend on the
 * number of ranks passed, not the size of either group (especially group2).
 *
 * This test is probably only meaningful for large-ish process counts, so we may
 * not be able to run this test by default in the nightlies. */
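
/* For context, the Scalasca-style translation of a single rank "r" in some
 * communicator "comm" to its MPI_COMM_WORLD rank looks roughly like the sketch
 * below (illustrative only: the names are placeholders and this snippet is not
 * part of the test itself):
 *
 *     MPI_Group gcomm, gworld;
 *     int wrank;
 *     MPI_Comm_group(comm, &gcomm);
 *     MPI_Comm_group(MPI_COMM_WORLD, &gworld);
 *     MPI_Group_translate_ranks(gcomm, 1, &r, gworld, &wrank);
 *     MPI_Group_free(&gcomm);
 *     MPI_Group_free(&gworld);
 */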

/* number of iterations used for timing */
#define NUM_LOOPS (1000000)

int main(int argc, char *argv[])
{
    int errs = 0;
    int *ranks;
    int *ranksout;
    MPI_Group gworld, grev, gself;
    MPI_Comm comm;
    MPI_Comm commrev;
    int rank, size, i;
    double start, end, time1, time2;

    MTest_Init(&argc, &argv);

    comm = MPI_COMM_WORLD;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    ranks = malloc(size * sizeof(int));
    ranksout = malloc(size * sizeof(int));
    if (!ranks || !ranksout) {
        fprintf(stderr, "out of memory\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    /* generate a comm with the rank order reversed */
    MPI_Comm_split(comm, 0, (size - rank - 1), &commrev);
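    /* (every process passes the same color, so commrev contains all processes;
     * only the key (size - rank - 1) differs, which reverses the rank order) */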
    MPI_Comm_group(commrev, &grev);
    MPI_Comm_group(MPI_COMM_SELF, &gself);
    MPI_Comm_group(comm, &gworld);

    /* sanity check correctness first */
    for (i = 0; i < size; i++) {
        ranks[i] = i;
        ranksout[i] = -1;
    }
    MPI_Group_translate_ranks(grev, size, ranks, gworld, ranksout);
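    /* because commrev reversed the key order, grev rank i should map to world
     * rank (size - i - 1) */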
    for (i = 0; i < size; i++) {
        if (ranksout[i] != (size - i - 1)) {
            if (rank == 0)
                printf("%d: (gworld) expected ranksout[%d]=%d, got %d\n", rank, i,
                       (size - rank - 1), ranksout[i]);
            ++errs;
        }
    }
    MPI_Group_translate_ranks(grev, size, ranks, gself, ranksout);
    for (i = 0; i < size; i++) {
        int expected = (i == (size - rank - 1) ? 0 : MPI_UNDEFINED);
        if (ranksout[i] != expected) {
            if (rank == 0)
                printf("%d: (gself) expected ranksout[%d]=%d, got %d\n", rank, i, expected,
                       ranksout[i]);
            ++errs;
        }
    }

    /* now compare relative performance */

    /* we need lots of procs to get a group large enough to have meaningful
     * numbers.  On most testing machines this means that we're oversubscribing
     * cores in a big way, which might perturb the timing results.  So we make
     * sure everyone has started up, and then everyone but rank 0 goes to sleep
     * to let rank 0 do all the timings. */
    MPI_Barrier(comm);

    if (rank != 0) {
        MTestSleep(10);
    } else {    /* rank==0 */

        MTestSleep(1);  /* try to avoid timing while everyone else is making syscalls */

        MPI_Group_translate_ranks(grev, size, ranks, gworld, ranksout); /*throwaway iter */
        start = MPI_Wtime();
        for (i = 0; i < NUM_LOOPS; ++i) {
            MPI_Group_translate_ranks(grev, size, ranks, gworld, ranksout);
        }
        end = MPI_Wtime();
        time1 = end - start;

        MPI_Group_translate_ranks(grev, size, ranks, gself, ranksout);  /*throwaway iter */
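        /* here group2 (gself) has only one member, versus "size" members in the
         * gworld case above; per the comment at the top of the file, the cost
         * should be roughly the same in both cases */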
        start = MPI_Wtime();
        for (i = 0; i < NUM_LOOPS; ++i) {
            MPI_Group_translate_ranks(grev, size, ranks, gself, ranksout);
        }
        end = MPI_Wtime();
        time2 = end - start;

        /* complain if the "gworld" time differs from the "gself" time by more
         * than 2x the "gself" time (i.e., time1 exceeds roughly 3x time2) */
        if (fabs(time1 - time2) > (2.00 * time2)) {
            printf("too much difference in MPI_Group_translate_ranks performance:\n");
            printf("time1=%f time2=%f\n", time1, time2);
            printf("(fabs(time1-time2)/time2)=%f\n", (fabs(time1 - time2) / time2));
            if (time1 < time2) {
                printf("also, (time1<time2) is surprising...\n");
            }
            ++errs;
        }
    }

    free(ranks);
    free(ranksout);

    MPI_Group_free(&grev);
    MPI_Group_free(&gself);
    MPI_Group_free(&gworld);

    MPI_Comm_free(&commrev);

    MTest_Finalize(errs);

    return MTestReturnValue(errs);
}