// Shared OpenMP collapse(3) test body: test() runs the LOOP nest serially and
// under "omp for collapse(3)" and compares the recorded iterations.
#include <omp.h>
#include <malloc.h>
#include <stdio.h>
#include <memory.h>
// Per-level type aliases for the three nested loops (0, 1, 2). Every alias
// expands to LOOP_TYPES, which is not defined in this file and therefore has
// to be defined before this point (presumably by the file that includes this
// one -- confirm).
//   LOOP_IV_TYPEn - type of the induction variable of level n (i, j, k)
//   LOOP_TYPEn    - type of iLB/iUB, jA0/jB0, kA0/kB0 below
//   LOOP_STYPEn   - type of iStep, jA1/jB1/jStep, kA1/kB1/kStep below
#define LOOP_IV_TYPE0 LOOP_TYPES
#define LOOP_TYPE0 LOOP_TYPES
#define LOOP_STYPE0 LOOP_TYPES
#define LOOP_IV_TYPE1 LOOP_TYPES
#define LOOP_TYPE1 LOOP_TYPES
#define LOOP_STYPE1 LOOP_TYPES
#define LOOP_IV_TYPE2 LOOP_TYPES
#define LOOP_TYPE2 LOOP_TYPES
#define LOOP_STYPE2 LOOP_TYPES
// Upper limit on the number of threads test() requests for its parallel
// region.
#define MAX_THREADS 256
// Diagnostic output: PRINTF forwards to printf when VERBOSE is defined and
// expands to nothing otherwise (its arguments are then not evaluated).
#if defined VERBOSE
#define PRINTF(...) printf(__VA_ARGS__)
#else
#define PRINTF(...)
#endif
// File-scope loop parameters. Nothing in this file reads or writes them
// directly; presumably the LOOP macro (defined elsewhere) refers to them and
// the test driver assigns them before calling test() -- confirm.
LOOP_TYPE0 iLB, iUB;
LOOP_TYPE1 jA0, jB0;
LOOP_TYPE2 kA0, kB0;
LOOP_STYPE0 iStep;
LOOP_STYPE1 jA1, jB1, jStep;
LOOP_STYPE2 kA1, kB1, kStep;
// Select the relational operator for each loop level n (0, 1, 2).
// We can check <=, <, >=, > (!= has a different pattern).
// Defining exactly one of LOOP_LEn, LOOP_LTn, LOOP_GEn, LOOP_GTn makes
// COMPAREn expand to <=, <, >=, > respectively; if none is defined, COMPAREn
// stays undefined. COMPAREn is not used in this file; presumably the LOOP
// macro defined elsewhere uses it -- confirm.
// The additional definition of LOOP_LEi, LOOP_LTi, etc. is helpful to build
// calls of the test from main.
#if defined LOOP_LE0
#define COMPARE0 <=
#elif defined LOOP_LT0
#define COMPARE0 <
#elif defined LOOP_GE0
#define COMPARE0 >=
#elif defined LOOP_GT0
#define COMPARE0 >
#endif
#if defined LOOP_LE1
#define COMPARE1 <=
#elif defined LOOP_LT1
#define COMPARE1 <
#elif defined LOOP_GE1
#define COMPARE1 >=
#elif defined LOOP_GT1
#define COMPARE1 >
#endif
#if defined LOOP_LE2
#define COMPARE2 <=
#elif defined LOOP_LT2
#define COMPARE2 <
#elif defined LOOP_GE2
#define COMPARE2 >=
#elif defined LOOP_GT2
#define COMPARE2 >
#endif
// One recorded loop iteration: the values of the three induction variables
// handed to Set() for that iteration.
typedef struct {
LOOP_IV_TYPE0 i; // induction variable value of loop level 0
LOOP_IV_TYPE1 j; // induction variable value of loop level 1
LOOP_IV_TYPE2 k; // induction variable value of loop level 2
} spaceType;
// Allocates a zero-initialized array able to hold `size` iteration records.
//
// size - number of spaceType records the caller needs (may be 0).
//
// Returns a pointer to be released with FreeSpace(), or NULL if the
// allocation fails.
spaceType *AllocSpace(unsigned size) {
  // calloc zero-fills the block and checks the element-count multiplication
  // for overflow. Ask for at least one element so that a NULL result always
  // means failure (a zero-sized request may legitimately yield NULL).
  size_t records = (size != 0) ? size : 1;
  spaceType *p = calloc(records, sizeof *p);
  return p;
}
// Releases a record array obtained from AllocSpace() by handing it to free().
void FreeSpace(spaceType *space) {
  free(space);
}
// record an iteration
void Set(spaceType *space, unsigned count, unsigned trueCount, LOOP_IV_TYPE0 i,
LOOP_IV_TYPE1 j, LOOP_IV_TYPE0 k) {
if (count > trueCount) {
// number of iterations exceeded
// will be reported with checks
return;
}
space[count - 1].i = i;
space[count - 1].j = j;
space[count - 1].k = k;
}
int test() {
int pass = 1;
LOOP_IV_TYPE0 i;
LOOP_IV_TYPE1 j;
LOOP_IV_TYPE2 k;
spaceType *openmpSpace;
spaceType *scalarSpace;
unsigned trueCount = 0;
unsigned openmpCount = 0;
unsigned scalarCount = 0;
unsigned uselessThreadsOpenMP = 0;
unsigned usefulThreadsOpenMP = 0;
// Use half of the available threads/logical processors.
unsigned num_threads = omp_get_max_threads() / 2;
// Make sure num_threads is not 0 after the division in case
// omp_get_max_threads() returns 1.
if (num_threads == 0)
num_threads = 1;
if (num_threads > MAX_THREADS)
num_threads = MAX_THREADS;
unsigned long *chunkSizesOpenmp =
(unsigned long *)malloc(sizeof(unsigned long) * num_threads);
memset(chunkSizesOpenmp, 0, sizeof(unsigned long) * num_threads);
// count iterations and allocate space
LOOP { ++trueCount; }
openmpSpace = AllocSpace(trueCount);
scalarSpace = AllocSpace(trueCount);
// fill the scalar (compare) space
LOOP {
++scalarCount;
Set(scalarSpace, scalarCount, trueCount, i, j, k);
}
// test run body:
// perform and record OpenMP iterations and thread use
#pragma omp parallel num_threads(num_threads)
{
unsigned gtid = omp_get_thread_num();
#pragma omp for collapse(3) private(i, j, k)
LOOP {
unsigned count;
#pragma omp atomic update
++chunkSizesOpenmp[gtid];
#pragma omp atomic capture
count = ++openmpCount;
Set(openmpSpace, count, trueCount, i, j, k);
}
}
// check for the right number of iterations processed
// (only need to check for less, greater is checked when recording)
if (openmpCount < trueCount) {
PRINTF("OpenMP FAILURE: Openmp processed fewer iterations: %d vs %d\n",
openmpCount, trueCount);
pass = 0;
} else if (openmpCount > trueCount) {
PRINTF("OpenMP FAILURE: Openmp processed more iterations: %d vs %d\n",
openmpCount, trueCount);
pass = 0;
}
// check openMP for iteration correctnes against scalar
for (unsigned i = 0; i < trueCount; i++) {
unsigned j;
for (j = 0; j < openmpCount; j++) {
if ((scalarSpace[i].i == openmpSpace[j].i) &&
(scalarSpace[i].j == openmpSpace[j].j) &&
(scalarSpace[i].k == openmpSpace[j].k)) {
break;
}
}
if (j == openmpCount) {
PRINTF("OpenMP FAILURE: (%d %d %d) not processed\n", scalarSpace[i].i,
scalarSpace[i].j, scalarSpace[i].k);
pass = 0;
}
}
// check for efficient thread use
for (unsigned i = 0; i < num_threads; ++i) {
if (chunkSizesOpenmp[i] == 0) {
++uselessThreadsOpenMP;
}
}
// a check to see if at least more than one thread was used (weakish)
if ((uselessThreadsOpenMP == num_threads - 1) && (trueCount > 1)) {
PRINTF("OpenMP FAILURE: threads are not used\n");
pass = 0;
}
#if 0
// a check to see if the load was spread more or less evenly so that
// when there was more work than threads each one got at least something
// (stronger, but may currently fail for a general collapse case)
if ((trueCount >= num_threads) && (uselessThreadsOpenMP > 0)) {
PRINTF("OpenMP FAILURE: %d threads not used with %d iterations\n",
uselessThreadsOpenMP, openmpCount);
pass = 0;
}
#endif
// clean up space
FreeSpace(openmpSpace);
FreeSpace(scalarSpace);
free(chunkSizesOpenmp);
return pass;
}
|