1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277
|
//----------------------------------------------------------------------
// trim X and Y
//----------------------------------------------------------------------
//try to trim tail of X
while ( (pX_end - pX_start > shared_vector_size)
&& (Yi[pY_end-1] < Xi[pX_end - shared_vector_size -1 ]) )
{
pX_end -= shared_vector_size;
}
//try to trim tail of Y
while ( (pY_end - pY_start > shared_vector_size)
&& (Xi[pX_end-1] < Yi[pY_end - shared_vector_size -1 ]) )
{
pY_end -= shared_vector_size;
}
int64_t Xnz = pX_end - pX_start ;
int shared_steps_X = (Xnz + shared_vector_size -1)/shared_vector_size;
int64_t step_end = (shared_steps_X <= 1? Xnz : shared_vector_size);
while( (shared_steps_X>0) && (Yi[pY_start] > Xi[pX_start+ step_end-1]) )
{ // Fast forward to skip empty intersections
pX_start += step_end;
Xnz = pX_end - pX_start ;
shared_steps_X -= 1;
step_end = (shared_steps_X <= 1? Xnz : shared_vector_size);
}
for ( int64_t kk = tid; kk< step_end; kk+= blockDim.x)
{
Xi_s[kk] = Xi[ kk + pX_start];
}
this_thread_block().sync();
int64_t Ynz = pY_end - pY_start; // Ynz
int shared_steps_Y = (Ynz + shared_vector_size -1)/shared_vector_size;
step_end = (shared_steps_Y <= 1 ? Ynz : shared_vector_size);
while( (shared_steps_Y>0) && (Xi[pX_start] > Yi[pY_start + step_end-1]) )
{ //Fast forward to skip
pY_start+= step_end;
Ynz = pY_end - pY_start;
shared_steps_Y -= 1;
step_end = (shared_steps_Y <= 1 ? Ynz : shared_vector_size);
}
for ( int64_t kk =tid; kk< step_end; kk+= blockDim.x)
{
Yi_s[kk] = Yi[ kk + pY_start];
}
this_thread_block().sync();
//----------------------------------------------------------------------
// compute cij
//----------------------------------------------------------------------
//we want more than one intersection per thread
while ( (shared_steps_X > 0) && (shared_steps_Y > 0) )
{
int64_t Xwork = pX_end - pX_start;
int64_t Ywork = pY_end - pY_start;
if ( shared_steps_X > 1) Xwork = shared_vector_size;
if ( shared_steps_Y > 1) Ywork = shared_vector_size;
int64_t nxy = Xwork + Ywork;
// ceil Divide by 32 = blockDim.x :
int work_per_thread = (nxy + blockDim.x -1)/blockDim.x;
int diag = GB_IMIN( work_per_thread*tid, nxy);
int diag_end = GB_IMIN( diag + work_per_thread, nxy);
// Ywork takes place of Ynz:
int x_min = GB_IMAX( (diag - Ywork) , 0);
//Xwork takes place of Xnz:
int x_max = GB_IMIN( diag, Xwork);
while ( x_min < x_max)
{
//binary search for correct diag break
int pivot = (x_min +x_max) >> 1;
int64_t Xpiv = Xi_s[pivot] ;
int64_t Ypiv = Yi_s[diag -pivot -1] ;
x_min = (pivot + 1)* (Xpiv < Ypiv) + x_min * (1 - (Xpiv < Ypiv));
x_max = pivot * (1 - (Xpiv < Ypiv)) + x_max * (Xpiv < Ypiv);
}
int xcoord = x_min;
int ycoord = diag -x_min -1;
/*
//predictor-corrector search independent on each thread
int xcoord = GB_IMAX(diag-1, 0); //predicted to be uniform distribution
while ( Xi_s[xcoord] < Yi_s[diag-xcoord-1] && (xcoord<x_max) ) xcoord++;
while ( Xi_s[xcoord] > Yi_s[diag-xcoord-1] && (xcoord>x_min) ) xcoord--;
int ycoord = diag -xcoord -1;
*/
int64_t Xtest = Xi_s[xcoord] ;
int64_t Ytest = Yi_s[ycoord] ;
if ( (diag > 0) && (diag < nxy ) && (ycoord >= 0 ) && (Xtest == Ytest))
{
diag--; //adjust for intersection incrementing both pointers
}
// two start points are known now
int tx_start = xcoord; // +pX_start;
int ty_start = diag -xcoord; // +pY_start;
x_min = GB_IMAX( (diag_end - Ywork), 0); //Ywork replace Ynz
x_max = GB_IMIN( diag_end, Xwork); //Xwork replace Xnz
while ( x_min < x_max)
{
int pivot = (x_min +x_max) >> 1;
int64_t Xpiv = Xi_s[pivot] ;
int64_t Ypiv = Yi_s[diag_end -pivot -1] ;
x_min = (pivot + 1)* (Xpiv < Ypiv) + x_min * (1 - (Xpiv < Ypiv));
x_max = pivot * (1 - (Xpiv < Ypiv)) + x_max * (Xpiv < Ypiv);
}
xcoord = x_min;
ycoord = diag_end -x_min -1;
/*
//predictor-corrector search independent on each thread
xcoord = diag_end-1; //predicted to be uniform distribution
while ( Xi_s[xcoord] < Yi_s[diag_end-xcoord-1] && (xcoord<x_max)) xcoord++;
while ( Xi_s[xcoord] > Yi_s[diag_end-xcoord-1] && (xcoord>x_min)) xcoord--;
ycoord = diag_end -xcoord -1;
*/
// two end points are known now
int tx_end = xcoord; // +pX_start;
int ty_end = diag_end - xcoord; // + pY_start;
//merge-path dot product
int64_t pX = tx_start; // pX
int64_t pY = ty_start; // pY
while ( pX < tx_end && pY < ty_end )
{
int64_t Xind = Xi_s[pX] ;
int64_t Yind = Yi_s[pY] ;
#if GB_IS_PLUS_PAIR_REAL_SEMIRING && GB_Z_IGNORE_OVERFLOW
cij += (Xind == Yind) ;
#else
if (Xind == Yind)
{
// cij += aki * bkj
#if MP_FLIP
GB_DOT_MERGE (pY + pY_start, pX + pX_start) ;
#else
GB_DOT_MERGE (pX + pX_start, pY + pY_start) ;
#endif
// TODO check terminal condition, using tile.any
}
#endif
pX += (Xind <= Yind) ;
pY += (Xind >= Yind) ;
}
GB_CIJ_EXIST_POSTCHECK ;
this_thread_block().sync();
if ( (shared_steps_X >= 1)
&& (shared_steps_Y >= 1)
&& ( Xi_s[Xwork-1] == Yi_s[Ywork-1]) )
{
pX_start += shared_vector_size;
shared_steps_X -= 1;
if (shared_steps_X == 0) break;
pY_start += shared_vector_size;
shared_steps_Y -= 1;
if (shared_steps_Y == 0) break;
step_end = ( (pX_end - pX_start) < shared_vector_size ? (pX_end - pX_start) : shared_vector_size);
while( (shared_steps_X>0) && (Yi[pY_start] > Xi[pX_start + step_end-1]) )
{ //fast forward
pX_start += step_end;
shared_steps_X -= 1;
step_end = ( (pX_end - pX_start) < shared_vector_size ? (pX_end - pX_start) : shared_vector_size);
}
if (shared_steps_X == 0) break;
for ( int64_t kk = tid; kk< step_end; kk+= blockDim.x)
{
Xi_s[kk] = Xi[ kk + pX_start];
}
this_thread_block().sync();
step_end = ( (pY_end - pY_start) < shared_vector_size ? (pY_end - pY_start) : shared_vector_size);
while( (shared_steps_Y>0) && (Xi[pX_start] > Yi[pY_start + step_end-1]) )
{ //fast forward
pY_start += step_end;
shared_steps_Y -= 1;
step_end = ( (pY_end - pY_start) < shared_vector_size ? (pY_end - pY_start) : shared_vector_size);
}
if (shared_steps_Y == 0) break;
for ( int64_t kk = tid; kk< step_end; kk+= blockDim.x)
{
Yi_s[kk] = Yi[ kk + pY_start];
}
this_thread_block().sync();
}
else if ( (shared_steps_X >= 1) && (Xi_s[Xwork-1] < Yi_s[Ywork-1] ))
{
pX_start += shared_vector_size;
shared_steps_X -= 1;
if (shared_steps_X == 0) break;
step_end= ( (pX_end - pX_start) < shared_vector_size ? (pX_end - pX_start) : shared_vector_size);
while( (shared_steps_X>0) && (Yi[pY_start] > Xi[pX_start + step_end-1]) )
{ //fast forward
pX_start += step_end;
shared_steps_X -= 1;
step_end= ( (pX_end - pX_start) < shared_vector_size ? (pX_end - pX_start) : shared_vector_size);
}
if (shared_steps_X == 0) break;
for ( int64_t kk = tid; kk< step_end; kk+= blockDim.x)
{
Xi_s[kk] = Xi[ kk + pX_start];
}
this_thread_block().sync();
}
else if ( (shared_steps_Y >= 1) && (Yi_s[Ywork-1] < Xi_s[Xwork-1]) )
{
pY_start += shared_vector_size;
shared_steps_Y -= 1;
if (shared_steps_Y == 0) break;
step_end = ( (pY_end - pY_start) < shared_vector_size ? (pY_end - pY_start) : shared_vector_size);
while( (shared_steps_Y>0) && (Xi[pX_start] > Yi[pY_start + step_end-1]) )
{ //fast forward
pY_start += step_end;
shared_steps_Y -= 1;
step_end = ( (pY_end - pY_start) < shared_vector_size ? (pY_end - pY_start) : shared_vector_size);
}
if (shared_steps_Y == 0) break;
for ( int64_t kk = tid; kk< step_end; kk+= blockDim.x)
{
Yi_s[kk] = Yi[ kk + pY_start];
}
this_thread_block().sync();
}
} // end while shared_steps_X > 0 && shared_steps_Y >0
//tile.sync( ) ;
#undef MP_FLIP
#undef pX
#undef pX_start
#undef pX_end
#undef Xi
#undef pY
#undef pY_start
#undef pY_end
#undef Yi
|