File: mutex_array.cu

/*
 *  Copyright 2019 Patrick Stotko
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#include <iostream>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include <stdgpu/atomic.cuh> // stdgpu::atomic
#include <stdgpu/iterator.h> // device_begin, device_end
#include <stdgpu/memory.h>   // createDeviceArray, destroyDeviceArray
#include <stdgpu/mutex.cuh>  // stdgpu::mutex_array
#include <stdgpu/platform.h> // STDGPU_HOST_DEVICE
#include <stdgpu/vector.cuh> // stdgpu::vector

struct is_odd
{
    STDGPU_HOST_DEVICE bool
    operator()(const int x) const
    {
        return x % 2 == 1;
    }
};

__global__ void
try_partial_sum(const int* d_input, const stdgpu::index_t n, stdgpu::mutex_array<> locks, int* d_result)
{
    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);

    if (i >= n)
        return;

    stdgpu::index_t j = i % locks.size();

    // While loops might hang due to internal driver scheduling, so use a fixed number of trials.
    // Do not loop over try_lock(). Instead, loop over the whole sequential part to avoid deadlocks.
    bool finished = false;
    const stdgpu::index_t number_trials = 5;
    for (stdgpu::index_t k = 0; k < number_trials; ++k)
    {
        // --- SEQUENTIAL PART ---
        if (!finished && locks[j].try_lock())
        {
            // START --- critical section --- START

            d_result[j] += d_input[i];

            //  END  --- critical section ---  END
            locks[j].unlock();
            finished = true;
        }
        // --- SEQUENTIAL PART ---
    }
}
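
// A hedged sketch for comparison, not part of the upstream example: the same partial sums
// can be accumulated with plain CUDA atomicAdd(), in which case every contribution is
// applied exactly once and the final sum always matches the closed-form value. The kernel
// name and signature below are illustrative only; main() does not launch it.
__global__ void
atomic_partial_sum(const int* d_input, const stdgpu::index_t n, const stdgpu::index_t m, int* d_result)
{
    stdgpu::index_t i = static_cast<stdgpu::index_t>(blockIdx.x * blockDim.x + threadIdx.x);

    if (i >= n)
        return;

    stdgpu::index_t j = i % m;

    // atomicAdd serializes conflicting updates in hardware, so no lock and no retry loop are needed.
    atomicAdd(&d_result[j], d_input[i]);
}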

int
main()
{
    //
    // EXAMPLE DESCRIPTION
    // -------------------
    // This example demonstrates how stdgpu::mutex_array can be used to implement spin locks on the GPU.
    // Since using the locks correctly involves many subtleties, this example is intentionally oversimplified and
    // only shows the deadlock-free retry loop.
    //

    const stdgpu::index_t n = 100;
    const stdgpu::index_t m = 10;

    int* d_input = createDeviceArray<int>(n);
    int* d_result = createDeviceArray<int>(m);
    stdgpu::mutex_array<> locks = stdgpu::mutex_array<>::createDeviceObject(m);

    thrust::sequence(stdgpu::device_begin(d_input), stdgpu::device_end(d_input), 1);

    // d_input : 1, 2, 3, ..., 100

    stdgpu::index_t threads = 32;
    stdgpu::index_t blocks = (n + threads - 1) / threads;
    try_partial_sum<<<static_cast<unsigned int>(blocks), static_cast<unsigned int>(threads)>>>(d_input,
                                                                                               n,
                                                                                               locks,
                                                                                               d_result);
    cudaDeviceSynchronize();

    int sum = thrust::reduce(stdgpu::device_cbegin(d_result), stdgpu::device_cend(d_result), 0, thrust::plus<int>());

    const int sum_closed_form = n * (n + 1) / 2;

    std::cout << "The sum of all partially computed sums (via mutex locks) is " << sum
              << " which intentionally might not match the expected value of " << sum_closed_form << std::endl;

    destroyDeviceArray<int>(d_input);
    destroyDeviceArray<int>(d_result);
    stdgpu::mutex_array<>::destroyDeviceObject(locks);
}
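
// Build sketch (an assumption, not part of this file): with stdgpu 1.3.0 and its headers and
// library discoverable by the compiler, something along the lines of
//     nvcc -std=c++17 mutex_array.cu -lstdgpu -o mutex_array
// is expected to work; the upstream repository builds its examples through CMake instead.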