File: mth_combinations.cu

package info (click to toggle)
python-awkward 2.8.10-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,140 kB
  • sloc: python: 182,845; cpp: 33,828; sh: 432; makefile: 21; javascript: 8
file content (100 lines) | stat: -rw-r--r-- 2,369 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include <iostream>
#include <cuda_runtime.h>
#include <vector>

__device__ int64_t choose(int n, int k) {
    if (n < k) return 0;
    if (n == k) return 1;

    int delta, imax;
    if (k < n - k) {
        delta = n - k;
        imax = k;
    } else {
        delta = k;
        imax = n - k;
    }

    int64_t ans = delta + 1;
    for (int i = 2; i <= imax; ++i) {
        ans = (ans * (delta + i)) / i;
    }

    return ans;
}

__device__ int64_t largestV(int a, int b, int64_t x) {
    int64_t v = a - 1;
    while (choose(v, b) > x) {
        --v;
    }
    return v;
}

__global__ void cuda_calculateMth(int n, int k, int** d_result, int totalcount, int start, int end) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= totalcount) return;

    int a = end - start;
    int b = k;
    int64_t x = choose(a, b) - 1 - pos;

    for (int i = 0; i < k; ++i) {
        int v = largestV(a, b, x);
        d_result[i][pos] = (end - 1) - v;
        x -= choose(v, b);
        a = v;
        b--;
    }
}

int main() {
    int n = 10;
    int k = 2;
    int start = 2;
    int end = 7;

    int totalcount = 1;
    int factorial = 1;
    for (int i = 0; i < k; ++i) {
        totalcount *= (end - start - i);
        factorial *= (i + 1);
    }
    totalcount /= factorial;

    std::vector<int*> result_ptrs(k);
    std::vector<int*> d_result_ptrs(k);

    for (int i = 0; i < k; ++i) {
        result_ptrs[i] = new int[totalcount];
        cudaMalloc(&d_result_ptrs[i], totalcount * sizeof(int));
    }

    int** d_result;
    cudaMalloc(&d_result, k * sizeof(int*));
    cudaMemcpy(d_result, d_result_ptrs.data(), k * sizeof(int*), cudaMemcpyHostToDevice);

    int threadsperblock = 512;
    int blockspergrid = (totalcount + threadsperblock - 1) / threadsperblock;

    cuda_calculateMth<<<blockspergrid, threadsperblock>>>(n, k, d_result, totalcount, start, end);

    for (int i = 0; i < k; ++i) {
        cudaMemcpy(result_ptrs[i], d_result_ptrs[i], totalcount * sizeof(int), cudaMemcpyDeviceToHost);
    }

    cudaFree(d_result);
    for (int i = 0; i < k; ++i) {
        cudaFree(d_result_ptrs[i]);
    }

    for (int i = 0; i < k; ++i) {
        for (int j = 0; j < totalcount; ++j) {
            std::cout << result_ptrs[i][j] << " ";
        }
        std::cout << std::endl;
        delete[] result_ptrs[i];
    }

    return 0;
}