1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
|
/* SPDX-License-Identifier: BSD-3-Clause
* Copyright(c) 2010-2018 Intel Corporation
*/
/*
* This is a simple functional test for the rte_smp_mb() implementation,
* i.e. it makes sure that LOAD and STORE operations that precede the
* rte_smp_mb() call are globally visible across the lcores
* before the LOAD and STORE operations that follow it.
* The test uses a simple implementation of Peterson's lock algorithm
* (https://en.wikipedia.org/wiki/Peterson%27s_algorithm)
* for two execution units to make sure that rte_smp_mb() prevents
* store-load reordering from happening.
* Also, when executed on a single lcore, it can be used as an approximate
* estimation of the number of cycles a particular implementation of
* rte_smp_mb() will take.
*/
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <rte_memory.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_pause.h>
#include <rte_random.h>
#include <rte_cycles.h>
#include <rte_vect.h>
#include <rte_debug.h>
#include "test.h"
/* Exclusive upper bound for the increment added to the counters per iteration. */
#define ADD_MAX 8
/* Iteration count that test_barrier() passes to plock_test(). */
#define ITER_MAX 0x1000000
/*
 * Selects which barrier store_load_barrier() issues inside the lock.
 */
enum plock_use_type {
	USE_MB = 0,     /* use rte_mb() */
	USE_SMP_MB = 1, /* use rte_smp_mb() */
	USE_NUM = 2     /* number of barrier kinds, sizes per-kind arrays */
};
/*
* State of a two-party Peterson lock.
*/
struct plock {
/* flag[i] is set to 1 by party i in plock_lock() and cleared in plock_unlock() */
volatile uint32_t flag[2];
/* index of the party that most recently asked for the lock; it is the one that waits */
volatile uint32_t victim;
/* barrier kind handed to store_load_barrier() while taking the lock */
enum plock_use_type utype;
};
/*
* A lock together with the two counters it protects.
*/
struct plock_test {
struct plock lock;
/* running total of the increments added by plock_add() */
uint64_t val;
/* number of plock_add() calls performed on this structure */
uint64_t iter;
};
/*
* Each active lcore shares a plock_test struct with its left and right
* neighbours.
*/
struct lcore_plock_test {
struct plock_test *pt[2]; /* shared, lock-protected data */
uint64_t sum[2]; /* local totals of what this lcore added to pt[0] and pt[1] */
uint64_t iter; /* number of iterations to perform */
uint32_t lc; /* given lcore id */
};
/*
 * Issue the barrier selected by utype:
 * USE_MB maps to rte_mb(), USE_SMP_MB maps to rte_smp_mb().
 * Any other value trips the RTE_VERIFY() check.
 */
static inline void
store_load_barrier(uint32_t utype)
{
	switch (utype) {
	case USE_MB:
		rte_mb();
		break;
	case USE_SMP_MB:
		rte_smp_mb();
		break;
	default:
		RTE_VERIFY(0);
		break;
	}
}
/*
* Peterson lock implementation.
* Acquire lock l on behalf of party self (0 or 1); busy-waits until the
* lock is obtained.
*/
static void
plock_lock(struct plock *l, uint32_t self)
{
uint32_t other;
/* the two parties are numbered 0 and 1, so the peer is self with bit 0 flipped */
other = self ^ 1;
/* announce the intent to enter the critical section */
l->flag[self] = 1;
/* write barrier between the flag store and the victim store */
rte_smp_wmb();
/* volunteer to be the party that waits if both contend */
l->victim = self;
/*
* The barrier under test: it sits between the stores above and the
* loads performed by the polling loop below.
*/
store_load_barrier(l->utype);
/* spin while the peer has its flag raised and we are still the victim */
while (l->flag[other] == 1 && l->victim == self)
rte_pause();
/* read barrier between the polling loads and whatever the caller reads next */
rte_smp_rmb();
}
/*
* Release lock l on behalf of party self by clearing its flag.
*/
static void
plock_unlock(struct plock *l, uint32_t self)
{
/* write barrier issued before the flag is cleared */
rte_smp_wmb();
l->flag[self] = 0;
}
static void
plock_reset(struct plock *l, enum plock_use_type utype)
{
memset(l, 0, sizeof(*l));
l->utype = utype;
}
/*
 * Take the lock as party self, bump the update counter and add n to the
 * running total, then drop the lock again.
 */
static void
plock_add(struct plock_test *pt, uint32_t self, uint32_t n)
{
	struct plock *const lk = &pt->lock;

	plock_lock(lk, self);

	/* both counters are only ever touched inside the critical section */
	pt->val += n;
	pt->iter += 1;

	plock_unlock(lk, self);
}
/*
 * Per-lcore worker.
 * data points to an array of lcore_plock_test entries; the entry whose lc
 * field equals the id of the calling lcore is looked up first. Then, for
 * the number of iterations stored in that entry, the same increment is
 * added to both shared plock_test structs (under their locks) and to the
 * entry's local sums. Timing statistics are printed at the end.
 * Returns 0 on success, -1 if no entry matches the calling lcore.
 */
static int
plock_test1_lcore(void *data)
{
	struct lcore_plock_test *slot = data;
	const uint32_t self_id = rte_lcore_id();
	uint32_t remaining = rte_lcore_count();
	uint64_t step, done, cycles;

	/* linear scan for the entry that belongs to this lcore */
	while (remaining != 0 && slot->lc != self_id) {
		slot++;
		remaining--;
	}

	if (remaining == 0) {
		printf("%s(%u) error at init\n", __func__, self_id);
		return -1;
	}

	/* random starting increment in the range [0, ADD_MAX) */
	step = rte_rand() % ADD_MAX;
	cycles = rte_get_timer_cycles();

	/*
	 * Every pass:
	 * - adds the increment to both shared structs under their locks
	 *   (as party 0 of the first one and party 1 of the second one)
	 * - mirrors the same increment in the local sums
	 * - advances the increment, wrapping at ADD_MAX
	 */
	for (done = 0; done != slot->iter; done++) {
		plock_add(slot->pt[0], 0, step);
		plock_add(slot->pt[1], 1, step);

		slot->sum[0] += step;
		slot->sum[1] += step;

		step = (step + 1) % ADD_MAX;
	}

	cycles = rte_get_timer_cycles() - cycles;

	printf("%s(%u): %" PRIu64 " iterations finished, in %" PRIu64
		" cycles, %#Lf cycles/iteration, "
		"local sum={%" PRIu64 ", %" PRIu64 "}\n",
		__func__, self_id, done, cycles, (long double)cycles / done,
		slot->sum[0], slot->sum[1]);

	return 0;
}
/*
 * For N active lcores we allocate N+1 plock_test structures and N
 * lcore_plock_test structures.
 * Each active lcore shares one plock_test structure with its left lcore
 * neighbor and one plock_test structure with its right lcore neighbor;
 * the last lcore wraps around and shares the first structure with the
 * first lcore.
 * During the test each lcore updates data in both shared structures and
 * its local copies. Then at validation phase we check that our shared
 * and local data are the same.
 *
 * iter  - number of iterations each lcore performs.
 * utype - barrier type used by the locks.
 * Returns 0 on success, -ENOMEM on allocation failure, the (non-zero)
 * rte_eal_mp_remote_launch() result if the launch fails, or -1 if shared
 * and local data do not match.
 */
static int
plock_test(uint64_t iter, enum plock_use_type utype)
{
	int32_t rc;
	uint32_t i, lc, n;
	uint64_t *sum;
	struct plock_test *pt;
	struct lcore_plock_test *lpt;

	/* init phase, allocate and initialize shared data */

	n = rte_lcore_count();
	pt = calloc(n + 1, sizeof(*pt));
	lpt = calloc(n, sizeof(*lpt));
	sum = calloc(n + 1, sizeof(*sum));

	printf("%s(iter=%" PRIu64 ", utype=%u) started on %u lcores\n",
		__func__, iter, utype, n);

	if (pt == NULL || lpt == NULL || sum == NULL) {
		printf("%s: failed to allocate memory for %u lcores\n",
			__func__, n);
		free(pt);
		free(lpt);
		free(sum);
		return -ENOMEM;
	}

	for (i = 0; i != n + 1; i++)
		plock_reset(&pt[i].lock, utype);

	i = 0;
	RTE_LCORE_FOREACH(lc) {
		lpt[i].lc = lc;
		lpt[i].iter = iter;
		lpt[i].pt[0] = pt + i;
		lpt[i].pt[1] = pt + i + 1;
		i++;
	}

	/* close the ring: the last lcore shares pt[0] with the first one */
	lpt[i - 1].pt[1] = pt;

	/* %p requires a pointer to void, hence the explicit casts */
	for (i = 0; i != n; i++)
		printf("lpt[%u]={lc=%u, pt={%p, %p},};\n",
			i, lpt[i].lc,
			(void *)lpt[i].pt[0], (void *)lpt[i].pt[1]);

	/* test phase - start and wait for completion on each active lcore */

	rc = rte_eal_mp_remote_launch(plock_test1_lcore, lpt, CALL_MAIN);
	if (rc != 0) {
		/* nothing was run, so there is nothing to validate */
		printf("%s: failed to launch test on lcores, rc=%d\n",
			__func__, rc);
		goto out;
	}
	rte_eal_mp_wait_lcore();

	/* validation phase - make sure that shared and local data match */

	for (i = 0; i != n; i++) {
		sum[i] += lpt[i].sum[0];
		sum[i + 1] += lpt[i].sum[1];
	}

	/* the last lcore's second total belongs to pt[0] because of the wrap */
	sum[0] += sum[i];

	rc = 0;
	for (i = 0; i != n; i++) {
		printf("%s: sum[%u]=%" PRIu64 ", pt[%u].val=%" PRIu64 ", pt[%u].iter=%" PRIu64 ";\n",
			__func__, i, sum[i], i, pt[i].val, i, pt[i].iter);

		/*
		 * race condition occurred, lock doesn't work properly;
		 * every shared struct is updated iter times by each of
		 * its two users, hence the expected 2 * iter
		 */
		if (sum[i] != pt[i].val || 2 * iter != pt[i].iter) {
			printf("error: local and shared sums don't match\n");
			rc = -1;
		}
	}

out:
	free(pt);
	free(lpt);
	free(sum);

	printf("%s(utype=%u) returns %d\n", __func__, utype, rc);
	return rc;
}
/*
 * Run plock_test() with ITER_MAX iterations once for every barrier type,
 * then print a passed/failed line per type.
 * Returns the bitwise OR of all plock_test() results, i.e. 0 only when
 * every run returned 0.
 */
static int
test_barrier(void)
{
	int32_t results[USE_NUM];
	int32_t idx, status;

	/* run every variant first; the summary is printed afterwards */
	idx = 0;
	while (idx != RTE_DIM(results)) {
		results[idx] = plock_test(ITER_MAX, idx);
		idx++;
	}

	status = 0;
	for (idx = 0; idx != RTE_DIM(results); idx++) {
		const char *verdict = "failed";

		if (results[idx] == 0)
			verdict = "passed";

		printf("%s for utype=%d %s\n",
			__func__, idx, verdict);
		status |= results[idx];
	}

	return status;
}
/*
* Associates test_barrier() with the name barrier_autotest.
* NOTE(review): REGISTER_PERF_TEST is not defined in this file; presumably
* it comes from test.h - confirm how the registered name is used there.
*/
REGISTER_PERF_TEST(barrier_autotest, test_barrier);
|