File: strided-bench.c

package info (click to toggle)
armci-mpi 0.0~git20160222-2
  • links: PTS
  • area: main
  • in suites: stretch
  • size: 1,756 kB
  • sloc: ansic: 12,698; sh: 229; makefile: 53; fortran: 44
file content (126 lines) | stat: -rw-r--r-- 3,614 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/*
 * Copyright (C) 2010. See COPYRIGHT in top-level directory.
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>

#include <mpi.h>
#include <armci.h>
#ifdef MODE_SET
#include <armcix.h>
#endif

/* Largest patch extent along each of the two dimensions. */
#define MAX_XDIM 1024
#define MAX_YDIM 1024

/* Size in bytes of a full MAX_XDIM x MAX_YDIM patch of doubles. */
#define MAX_DATA_SIZE (sizeof(double) * MAX_XDIM * MAX_YDIM)

/*
 * Number of timed repetitions per patch size: 64 for patches of at most
 * 1024 elements, 16 for anything larger.  Expands to an expression over the
 * variables `xdim` and `ydim`, which must be in scope where it is used.
 */
#define NUM_ITERATIONS ((xdim * ydim > 1024) ? 16 : 64)

/* Number of leading repetitions issued before the timer is started. */
#define NUM_WARMUP_ITER 1

/*
 * Strided one-sided benchmark driver.
 *
 * Rank 0 issues ARMCI_PutS, ARMCI_AccS and ARMCI_GetS calls against every
 * other rank, for each power-of-two (xdim, ydim) patch size up to
 * MAX_XDIM x MAX_YDIM, and prints the average time per call together with
 * the corresponding bandwidth.  All other ranks skip the measurement loops
 * and proceed directly to the closing barrier.
 *
 * Returns 0 on completion.  The process is aborted if any of the buffer
 * allocations fails.
 */
int main(int argc, char ** argv) {
  int     rank, nproc;
#ifdef MULTIPLE
  int     thread_level;
#endif
  int     target_rank, xdim, ydim, test_iter;
  int     stride[1], count[2], levels;
  double  scale;
  double *buf;        /* local buffer; holds doubles (see ARMCI_ACC_DBL below) */
  void  **base_ptrs;  /* one base address per rank, filled in by ARMCI_Malloc */
#ifdef MODE_SET
  ARMCI_Group grp_world;
#endif

#ifdef MULTIPLE
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_level);
#else
  MPI_Init(&argc, &argv);
#endif
  ARMCI_Init();

  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nproc);

#ifdef MULTIPLE
  /* Do not silently run with less thread support than was requested. */
  if (rank == 0 && thread_level < MPI_THREAD_MULTIPLE)
    printf("Warning: MPI_THREAD_MULTIPLE requested but not provided\n");
#endif

  if (rank == 0) printf("Starting one-sided strided performance test with %d processes\n", nproc);

  buf = ARMCI_Malloc_local(MAX_DATA_SIZE);
  base_ptrs = malloc(sizeof *base_ptrs * (size_t) nproc);
  if (buf == NULL || base_ptrs == NULL) {
    fprintf(stderr, "%d: local buffer allocation failed\n", rank);
    abort();
  }

  /* ARMCI_Malloc reports failure through a non-zero return value. */
  if (ARMCI_Malloc(base_ptrs, MAX_DATA_SIZE) != 0) {
    fprintf(stderr, "%d: ARMCI_Malloc failed\n", rank);
    abort();
  }

  /* Fill every byte of the local buffer with rank+1. */
  memset(buf, rank+1, MAX_DATA_SIZE);

#ifdef MODE_SET
  ARMCI_Group_get_default(&grp_world);

  if (getenv("ARMCIX_MODE_SET"))
    ARMCIX_Mode_set(ARMCIX_MODE_CONFLICT_FREE | ARMCIX_MODE_NO_LOAD_STORE, base_ptrs[rank], &grp_world);
  else if (rank == 0)
    printf("Warning: ARMCIX_MODE_SET not enabled\n");
#endif

  if (rank == 0)
    printf("%12s %12s %12s %12s %12s %12s %12s %12s\n", "Trg. Rank", "Xdim Ydim",
        "Get (usec)", "Put (usec)", "Acc (usec)",
        "Get (MiB/s)", "Put (MiB/s)", "Acc (MiB/s)");

  /* Rows are always MAX_XDIM doubles apart, whatever the patch width. */
  stride[0] = MAX_XDIM*sizeof(double);
  levels    = 1;
  scale     = 1.0;

  /* Only rank 0 drives the measurements; it targets each other rank in turn. */
  for (target_rank = 1; rank == 0 && target_rank < nproc; target_rank++) {

    for (xdim = 1; xdim <= MAX_XDIM; xdim *= 2) {
      count[0] = xdim*sizeof(double);   /* bytes per contiguous row */

      for (ydim = 1; ydim <= MAX_YDIM; ydim *= 2) {
        const int    data_size = xdim*ydim*sizeof(double);
        const int    iters     = NUM_ITERATIONS;   /* depends on xdim and ydim */
        const double mib       = data_size/(1024.0*1024.0);
        double       t_get=0.0, t_put=0.0, t_acc=0.0;

        count[1] = ydim;                /* number of rows */

        /* Put: the clock starts once the warm-up repetitions have been issued. */
        for (test_iter = 0; test_iter < iters + NUM_WARMUP_ITER; test_iter++) {
          if (test_iter == NUM_WARMUP_ITER)
            t_put = MPI_Wtime();

          ARMCI_PutS(buf, stride, base_ptrs[target_rank], stride, count, levels, target_rank);
        }
        /* The fence is called before the clock is stopped, so it is part of the measured time. */
        ARMCI_Fence(target_rank);
        t_put = (MPI_Wtime() - t_put)/iters;

        /* Accumulate (double precision, scale factor 1.0): same timing scheme as put. */
        for (test_iter = 0; test_iter < iters + NUM_WARMUP_ITER; test_iter++) {
          if (test_iter == NUM_WARMUP_ITER)
            t_acc = MPI_Wtime();

          ARMCI_AccS(ARMCI_ACC_DBL, (void*) &scale, buf, stride, base_ptrs[target_rank], stride, count, levels, target_rank);
        }
        ARMCI_Fence(target_rank);
        t_acc = (MPI_Wtime() - t_acc)/iters;

        /* Get: remote patch into the local buffer; no fence is issued here. */
        for (test_iter = 0; test_iter < iters + NUM_WARMUP_ITER; test_iter++) {
          if (test_iter == NUM_WARMUP_ITER)
            t_get = MPI_Wtime();

          ARMCI_GetS(base_ptrs[target_rank], stride, buf, stride, count, levels, target_rank);
        }
        t_get = (MPI_Wtime() - t_get)/iters;

        printf("%12d %6d%6d %12.3f %12.3f %12.3f %12.3f %12.3f %12.3f\n", target_rank, xdim, ydim,
            t_get*1.0e6, t_put*1.0e6, t_acc*1.0e6, mib/t_get, mib/t_put, mib/t_acc);
      }
    }
  }

  ARMCI_Barrier();

  ARMCI_Free(base_ptrs[rank]);
  ARMCI_Free_local(buf);
  free(base_ptrs);

  ARMCI_Finalize();
  MPI_Finalize();

  return 0;
}