File: collapse_test.inc

package info (click to toggle)
llvm-toolchain-19 1%3A19.1.7-3
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 1,998,520 kB
  • sloc: cpp: 6,951,680; ansic: 1,486,157; asm: 913,598; python: 232,024; f90: 80,126; objc: 75,281; lisp: 37,276; pascal: 16,990; sh: 10,009; ml: 5,058; perl: 4,724; awk: 3,523; makefile: 3,167; javascript: 2,504; xml: 892; fortran: 664; cs: 573
file content (211 lines) | stat: -rw-r--r-- 5,361 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#include <malloc.h>
#include <memory.h>
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

// Per-loop-level type aliases. Level 0, 1 and 2 each get an induction
// variable type (LOOP_IV_TYPEn), a bound type (LOOP_TYPEn) and a step type
// (LOOP_STYPEn); here all of them map to the single LOOP_TYPES macro.
// NOTE(review): LOOP_TYPES is not defined in this file; it is presumably
// provided by the file that includes this one - confirm.
#define LOOP_IV_TYPE0 LOOP_TYPES
#define LOOP_TYPE0 LOOP_TYPES
#define LOOP_STYPE0 LOOP_TYPES

#define LOOP_IV_TYPE1 LOOP_TYPES
#define LOOP_TYPE1 LOOP_TYPES
#define LOOP_STYPE1 LOOP_TYPES

#define LOOP_IV_TYPE2 LOOP_TYPES
#define LOOP_TYPE2 LOOP_TYPES
#define LOOP_STYPE2 LOOP_TYPES

// Upper limit applied to the number of threads requested by test().
#define MAX_THREADS 256

// Diagnostic output: PRINTF forwards to printf only when VERBOSE is defined;
// otherwise it expands to nothing and its arguments are not evaluated.
#if defined VERBOSE
#define PRINTF(...) printf(__VA_ARGS__)
#else
#define PRINTF(...)
#endif

// Global loop parameters for the three loop levels (i, j, k).
// They are not referenced by the code in this file.
// NOTE(review): presumably they are assigned by the including test and read
// by its LOOP macro (bounds / bound coefficients) - confirm against callers.
LOOP_TYPE0 iLB, iUB;
LOOP_TYPE1 jA0, jB0;
LOOP_TYPE2 kA0, kB0;

// Step-typed parameters for each loop level (same caveat as above).
LOOP_STYPE0 iStep;
LOOP_STYPE1 jA1, jB1, jStep;
LOOP_STYPE2 kA1, kB1, kStep;

// The supported loop comparison operators are <=, <, >= and >
// (!= follows a different pattern and is not handled here).
// Defining one of LOOP_LEn, LOOP_LTn, LOOP_GEn or LOOP_GTn for loop level n
// selects the operator that COMPAREn expands to, which makes it easy to build
// calls of the test from main.
// If none of the four macros is defined for a level, COMPAREn stays undefined.

// Comparison operator for loop level 0.
#if defined LOOP_LE0
#define COMPARE0 <=
#elif defined LOOP_LT0
#define COMPARE0 <
#elif defined LOOP_GE0
#define COMPARE0 >=
#elif defined LOOP_GT0
#define COMPARE0 >
#endif

// Comparison operator for loop level 1.
#if defined LOOP_LE1
#define COMPARE1 <=
#elif defined LOOP_LT1
#define COMPARE1 <
#elif defined LOOP_GE1
#define COMPARE1 >=
#elif defined LOOP_GT1
#define COMPARE1 >
#endif

// Comparison operator for loop level 2.
#if defined LOOP_LE2
#define COMPARE2 <=
#elif defined LOOP_LT2
#define COMPARE2 <
#elif defined LOOP_GE2
#define COMPARE2 >=
#elif defined LOOP_GT2
#define COMPARE2 >
#endif

// One recorded loop iteration: the values of the three induction variables
// at the time the loop body ran.
typedef struct {
  LOOP_IV_TYPE0 i; // induction variable of loop level 0
  LOOP_IV_TYPE1 j; // induction variable of loop level 1
  LOOP_IV_TYPE2 k; // induction variable of loop level 2
} spaceType;

// Allocate a zero-initialized array of `size` iteration records.
//
// Returns a pointer to the array, or NULL when the allocation fails (a
// request for zero records may also yield NULL, depending on the C library).
// The caller owns the result and releases it with FreeSpace().
spaceType *AllocSpace(unsigned size) {
  // calloc zero-fills the records, guards the size multiplication against
  // overflow, and avoids handing a NULL pointer to memset on failure.
  return (spaceType *)calloc(size, sizeof(spaceType));
}

// Release an array of iteration records obtained from AllocSpace().
void FreeSpace(spaceType *space) {
  free(space);
}

// record an iteration
void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i,
         LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) {
  if (count > trueCount) {
    // number of iterations exceeded
    // will be reported with checks
    return;
  }
  space[count - 1].i = i;
  space[count - 1].j = j;
  space[count - 1].k = k;
}
int test() {
  int pass = 1;
  LOOP_IV_TYPE0 i;
  LOOP_IV_TYPE1 j;
  LOOP_IV_TYPE2 k;

  spaceType *openmpSpace;
  spaceType *scalarSpace;

  unsigned trueCount = 0;
  unsigned openmpCount = 0;
  unsigned scalarCount = 0;
  unsigned uselessThreadsOpenMP = 0;
  unsigned usefulThreadsOpenMP = 0;

  // Use half of the available threads/logical processors.
  unsigned num_threads = omp_get_max_threads() / 2;

  // Make sure num_threads is not 0 after the division in case
  // omp_get_max_threads() returns 1.
  if (num_threads == 0)
    num_threads = 1;

  if (num_threads > MAX_THREADS)
    num_threads = MAX_THREADS;

  unsigned long *chunkSizesOpenmp =
      (unsigned long *)malloc(sizeof(unsigned long) * num_threads);
  memset(chunkSizesOpenmp, 0, sizeof(unsigned long) * num_threads);

  // count iterations and allocate space
  LOOP { ++trueCount; }

  openmpSpace = AllocSpace(trueCount);
  scalarSpace = AllocSpace(trueCount);

  // fill the scalar (compare) space
  LOOP {
    ++scalarCount;
    Set(scalarSpace, scalarCount, trueCount, i, j, k);
  }

  // test run body:
  // perform and record OpenMP iterations and thread use
#pragma omp parallel num_threads(num_threads)
  {
    unsigned gtid = omp_get_thread_num();
#pragma omp for collapse(3) private(i, j, k)
    LOOP {
      unsigned count;
#pragma omp atomic update
      ++chunkSizesOpenmp[gtid];
#pragma omp atomic capture
      count = ++openmpCount;
      Set(openmpSpace, count, trueCount, i, j, k);
    }
  }

  // check for the right number of iterations processed
  // (only need to check for less, greater is checked when recording)
  if (openmpCount < trueCount) {
    PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n",
           openmpCount, trueCount);
    pass = 0;
  } else if (openmpCount > trueCount) {
    PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n",
           openmpCount, trueCount);
    pass = 0;
  }

  // check openMP for iteration correctnes against scalar
  for (unsigned i = 0; i < trueCount; i++) {
    unsigned j;
    for (j = 0; j < openmpCount; j++) {
      if ((scalarSpace[i].i == openmpSpace[j].i) &&
          (scalarSpace[i].j == openmpSpace[j].j) &&
          (scalarSpace[i].k == openmpSpace[j].k)) {
        break;
      }
    }
    if (j == openmpCount) {
      PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i,
             scalarSpace[i].j, scalarSpace[i].k);
      pass = 0;
    }
  }

  // check for efficient thread use
  for (unsigned i = 0; i < num_threads; ++i) {
    if (chunkSizesOpenmp[i] == 0) {
      ++uselessThreadsOpenMP;
    }
  }

  // a check to see if at least more than one thread was used (weakish)
  if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
    PRINTF("OpenMP FAILURE: threads are not used\n");
    pass = 0;
  }

#if 0
    // a check to see if the load was spread more or less evenly so that
    // when there was more work than threads each one got at least something 
    // (stronger, but may currently fail for a general collapse case)
    if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
       PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n", 
           uselessThreadsOpenMP, openmpCount);
       pass = 0;
    }
#endif

  // clean up space
  FreeSpace(openmpSpace);
  FreeSpace(scalarSpace);
  free(chunkSizesOpenmp);
  return pass;
}