File: cudapi.cu

package info (click to toggle)
mpich 5.0.0-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental
  • size: 251,828 kB
  • sloc: ansic: 1,323,147; cpp: 82,869; f90: 72,420; javascript: 40,763; perl: 28,296; sh: 19,399; python: 16,191; xml: 14,418; makefile: 9,468; fortran: 8,046; java: 4,635; pascal: 352; asm: 324; ruby: 176; awk: 27; lisp: 19; php: 8; sed: 4
file content (86 lines) | stat: -rw-r--r-- 2,157 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*
 * Copyright (C) by Argonne National Laboratory
 *     See COPYRIGHT in top-level directory
 */

#include "mpi.h"
#include <stdio.h>
#include <math.h>

#define THREADS_PER_BLOCK 256

/* Integrand for the quadrature: f(a) = 4 / (1 + a^2).
 * Integrating f over [0, 1] yields pi. */
__device__ double f(double a)
{
    double denom = 1.0 + a * a;
    return 4.0 / denom;
}

/* Accumulate this block's share of the midpoint-rule sum for pi into *sum.
 *
 * Launch layout: 1-D grid of 1-D blocks with blockDim.x == THREADS_PER_BLOCK
 * (must be a power of two for the tree reduction below).  Each thread owns
 * rectangle index idx in [1, n]; `stride` offsets this MPI rank's slice.
 *
 * Requires SM60+ for atomicAdd on double.
 *
 * Note: the original version had every thread atomicAdd into a single
 * shared-memory scalar, which serializes all threads of the block on one
 * address; a shared-memory tree reduction needs only one global atomic
 * per block.  (Floating-point summation order differs slightly.) */
__global__ void do_sum(int n, double h, int stride, double *sum) {
    __shared__ double partial[THREADS_PER_BLOCK];
    int idx = 1 + (blockDim.x * blockIdx.x + threadIdx.x) + stride;

    /* Each thread evaluates its rectangle; threads past the tail contribute 0
     * so the reduction below reads no uninitialized shared memory. */
    double v = 0.0;
    if (idx <= n) {
        double x = h * ((double) idx - 0.5);
        v = f(x);
    }
    partial[threadIdx.x] = v;
    __syncthreads();

    /* Tree reduction in shared memory (blockDim.x is a power of two). */
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            partial[threadIdx.x] += partial[threadIdx.x + s];
        __syncthreads();
    }

    /* One atomic per block into the global total. */
    if (threadIdx.x == 0)
        atomicAdd(sum, partial[0] * h);
}

/* Abort the whole MPI job on any CUDA runtime failure; kernels and
 * uninitialized device memory otherwise fail silently. */
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                \
            MPI_Abort(MPI_COMM_WORLD, 1);                               \
        }                                                               \
    } while (0)

/* Compute pi with the midpoint rule: rank i sums its contiguous slice of
 * the n rectangles on its GPU, then MPI_Reduce combines the partial sums
 * on rank 0, which reports the result and the wall-clock time. */
int main(int argc, char *argv[])
{
    int n, myid, numprocs;
    double PI25DT = 3.141592653589793238462643;
    double pi, h;
    double *sum;               /* device-side accumulator for this rank */
    double startwtime = 0.0, endwtime;
    int namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Get_processor_name(processor_name, &namelen);

    fprintf(stdout, "Process %d of %d is on %s\n", myid, numprocs, processor_name);
    fflush(stdout);

    CUDA_CHECK(cudaMalloc((void **) &sum, sizeof(double)));
    /* cudaMalloc does not zero memory; the kernel accumulates with
     * atomicAdd, so *sum must start at 0.0 or the result is garbage. */
    CUDA_CHECK(cudaMemset(sum, 0, sizeof(double)));

    n = 10000;
    if (myid == 0)
        startwtime = MPI_Wtime();

    MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

    h = 1.0 / (double) n;
    /* Ceil-divide the n rectangles across all ranks' blocks; each rank
     * launches `blocks` blocks and skips the slices owned by lower ranks. */
    int blocks = (n + (THREADS_PER_BLOCK * numprocs - 1)) / (THREADS_PER_BLOCK * numprocs);
    int stride = blocks * THREADS_PER_BLOCK * myid;

    /* compute partial sum using the GPU */
    do_sum<<<blocks, THREADS_PER_BLOCK>>>(n, h, stride, sum);
    CUDA_CHECK(cudaGetLastError());       /* catch bad launch configuration */
    CUDA_CHECK(cudaDeviceSynchronize());  /* catch async kernel failures */

    /* `sum` is a DEVICE pointer: this requires a CUDA-aware MPI build
     * (which this MPICH example is exercising). */
    MPI_Reduce(sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (myid == 0) {
        endwtime = MPI_Wtime();
        printf("pi is approximately %.16f, Error is %.16f\n", pi, fabs(pi - PI25DT));
        printf("wall clock time = %f\n", endwtime - startwtime);
        fflush(stdout);
    }

    CUDA_CHECK(cudaFree(sum));

    MPI_Finalize();
    return 0;
}