/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* Perform the second step of the bisection algorithm for large matrices, for
 * intervals that after the first step contained more than one eigenvalue
 */
#ifndef _BISECT_KERNEL_LARGE_MULTI_H_
#define _BISECT_KERNEL_LARGE_MULTI_H_
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// includes, project
#include "config.h"
#include "util.h"
// additional kernel
#include "bisect_util.cu"
////////////////////////////////////////////////////////////////////////////////
//! Perform the second step of the bisection algorithm for large matrices, for
//! intervals that after the first step contained more than one eigenvalue
//! @param g_d diagonal elements of symmetric, tridiagonal matrix
//! @param g_s superdiagonal elements of symmetric, tridiagonal matrix
//! @param n matrix size
//! @param blocks_mult start addresses of blocks of intervals that are
//! processed by one block of threads; each of these
//! intervals contains more than one eigenvalue
//! @param blocks_mult_sum number of eigenvalues / singleton intervals
//! accumulated over the preceding blocks of intervals,
//! i.e. the output offset of each block
//! @param g_left left limits of intervals
//! @param g_right right limits of intervals
//! @param g_left_count number of eigenvalues less than left limits
//! @param g_right_count number of eigenvalues less than right limits
//! @param g_lambda final eigenvalues
//! @param g_pos index of eigenvalue (in ascending order)
//! @param precision desired precision of eigenvalues
////////////////////////////////////////////////////////////////////////////////
__global__ void bisectKernelLarge_MultIntervals(
float *g_d, float *g_s, const unsigned int n, unsigned int *blocks_mult,
unsigned int *blocks_mult_sum, float *g_left, float *g_right,
unsigned int *g_left_count, unsigned int *g_right_count, float *g_lambda,
unsigned int *g_pos, float precision) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
const unsigned int tid = threadIdx.x;
// left and right limits of interval
__shared__ float s_left[2 * MAX_THREADS_BLOCK];
__shared__ float s_right[2 * MAX_THREADS_BLOCK];
// number of eigenvalues smaller than interval limits
__shared__ unsigned int s_left_count[2 * MAX_THREADS_BLOCK];
__shared__ unsigned int s_right_count[2 * MAX_THREADS_BLOCK];
  // helper array for compaction of the second chunk of intervals
__shared__ unsigned int s_compaction_list[2 * MAX_THREADS_BLOCK + 1];
// compaction list helper for exclusive scan
unsigned int *s_compaction_list_exc = s_compaction_list + 1;
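  // flags are written at s_compaction_list_exc[tid] == s_compaction_list[tid + 1]
  // while s_compaction_list[0] stays 0, so an inclusive scan of the shifted
  // array reads back as an exclusive prefix sum, and
  // s_compaction_list[num_threads_active] ends up holding the total flag count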
// flag if all threads are converged
__shared__ unsigned int all_threads_converged;
// number of active threads
__shared__ unsigned int num_threads_active;
// number of threads to employ for compaction
__shared__ unsigned int num_threads_compaction;
// flag if second chunk has to be compacted
__shared__ unsigned int compact_second_chunk;
  // parameters of the block of intervals processed by this block of threads
__shared__ unsigned int c_block_start;
__shared__ unsigned int c_block_end;
__shared__ unsigned int c_block_offset_output;
// midpoint of currently active interval of the thread
float mid = 0.0f;
// number of eigenvalues smaller than \a mid
unsigned int mid_count = 0;
  // parameters of the thread's current interval
float left;
float right;
unsigned int left_count;
unsigned int right_count;
  // helper for compaction; keeps track of which threads have a second child
unsigned int is_active_second = 0;
// initialize common start conditions
if (0 == tid) {
c_block_start = blocks_mult[blockIdx.x];
c_block_end = blocks_mult[blockIdx.x + 1];
c_block_offset_output = blocks_mult_sum[blockIdx.x];
num_threads_active = c_block_end - c_block_start;
s_compaction_list[0] = 0;
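    // the scan used to build compaction indices operates on a
    // power-of-two-sized input, so round the thread count up accordingly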
num_threads_compaction = ceilPow2(num_threads_active);
all_threads_converged = 1;
compact_second_chunk = 0;
}
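  // make the shared-memory initialization visible to all threads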
cg::sync(cta);
// read data into shared memory
if (tid < num_threads_active) {
s_left[tid] = g_left[c_block_start + tid];
s_right[tid] = g_right[c_block_start + tid];
s_left_count[tid] = g_left_count[c_block_start + tid];
s_right_count[tid] = g_right_count[c_block_start + tid];
}
cg::sync(cta);
  // iterate until all threads have converged
while (true) {
// subdivide interval if currently active and not already converged
subdivideActiveInterval(tid, s_left, s_right, s_left_count, s_right_count,
num_threads_active, left, right, left_count,
right_count, mid, all_threads_converged);
cg::sync(cta);
// stop if all eigenvalues have been found
if (1 == all_threads_converged) {
break;
}
    // compute the number of eigenvalues smaller than mid for all active and
    // not yet converged intervals; all threads participate in loading the
    // matrix data from gmem, with s_left and s_right used as scratch space
    // for the loaded data; threads whose interval has already converged
    // (left == right) do not need the count
mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, mid, tid,
num_threads_active, s_left,
s_right, (left == right), cta);
cg::sync(cta);
if (tid < num_threads_active) {
// store intervals
if (left != right) {
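        // the interval is split at mid; children that still contain
        // eigenvalues are kept, and a second child is flagged for the
        // compaction step below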
storeNonEmptyIntervals(tid, num_threads_active, s_left, s_right,
s_left_count, s_right_count, left, mid, right,
left_count, mid_count, right_count, precision,
compact_second_chunk, s_compaction_list_exc,
is_active_second);
} else {
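        // left == right: the interval has collapsed onto an eigenvalue;
        // an eigenvalue of multiplicity greater than one is split into
        // separate singleton intervals here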
storeIntervalConverged(
s_left, s_right, s_left_count, s_right_count, left, mid, right,
left_count, mid_count, right_count, s_compaction_list_exc,
compact_second_chunk, num_threads_active, is_active_second);
}
}
cg::sync(cta);
// compact second chunk of intervals if any of the threads generated
// two child intervals
if (1 == compact_second_chunk) {
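      // turn the per-thread flags into write indices via a prefix sum, then
      // append the second children behind the first num_threads_active
      // intervals in shared memory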
createIndicesCompaction(s_compaction_list_exc, num_threads_compaction,
cta);
compactIntervals(s_left, s_right, s_left_count, s_right_count, mid, right,
mid_count, right_count, s_compaction_list,
num_threads_active, is_active_second);
}
cg::sync(cta);
// update state variables
if (0 == tid) {
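      // s_compaction_list[num_threads_active] holds the total number of
      // second children created in this iteration (zero if no compaction
      // took place)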
num_threads_active += s_compaction_list[num_threads_active];
num_threads_compaction = ceilPow2(num_threads_active);
compact_second_chunk = 0;
all_threads_converged = 1;
}
cg::sync(cta);
    // clear the compaction flags for the next iteration
s_compaction_list_exc[threadIdx.x] = 0;
s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
cg::sync(cta);
} // end until all threads converged
// write data back to global memory
if (tid < num_threads_active) {
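    // a converged interval satisfies left == right, so s_left holds the
    // eigenvalue; its right count equals the number of eigenvalues below
    // its right limit, i.e. the eigenvalue's position in ascending order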
unsigned int addr = c_block_offset_output + tid;
g_lambda[addr] = s_left[tid];
g_pos[addr] = s_right_count[tid];
}
}
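
////////////////////////////////////////////////////////////////////////////////
//! Minimal host-side launch sketch (illustrative only; the helper name and
//! the num_blocks_mult parameter are hypothetical, not part of the sample).
//! Assumes the first bisection step produced num_blocks_mult blocks of
//! multi-eigenvalue intervals and filled all device buffers accordingly.
////////////////////////////////////////////////////////////////////////////////
static inline void launchBisectKernelLarge_MultIntervals(
    float *g_d, float *g_s, const unsigned int n, unsigned int *blocks_mult,
    unsigned int *blocks_mult_sum, float *g_left, float *g_right,
    unsigned int *g_left_count, unsigned int *g_right_count, float *g_lambda,
    unsigned int *g_pos, float precision, const unsigned int num_blocks_mult) {
  // one thread block per block of intervals, MAX_THREADS_BLOCK threads each
  dim3 grid(num_blocks_mult, 1, 1);
  dim3 threads(MAX_THREADS_BLOCK, 1, 1);
  bisectKernelLarge_MultIntervals<<<grid, threads>>>(
      g_d, g_s, n, blocks_mult, blocks_mult_sum, g_left, g_right, g_left_count,
      g_right_count, g_lambda, g_pos, precision);
}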
#endif // #ifndef _BISECT_KERNEL_LARGE_MULTI_H_