1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416
|
/* cilk-abi-cilk-for.cpp -*-C++-*-
*
*************************************************************************
*
* @copyright
* Copyright (C) 2011, 2013, Intel Corporation
* All rights reserved.
*
* @copyright
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* @copyright
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
**************************************************************************/
/* Implementation of cilk_for ABI.
*
* This file must be C++, not C, in order to handle C++ exceptions correctly
* from within the body of the cilk_for loop
*/
#include "internal/abi.h"
#include "metacall_impl.h"
#include "global_state.h"
// Icky macros to determine if we're compiled with optimization. Based on
// the declaration of __CILKRTS_ASSERT in common.h
#if defined(_WIN32)
# if defined (_DEBUG)
# define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od
# else
# define CILKRTS_OPTIMIZED 1
# endif // defined(_DEBUG)
#else
# if defined(__OPTIMIZE__)
# define CILKRTS_OPTIMIZED 1
# else
# define CILKRTS_OPTIMIZED 0
# endif
#endif
template <typename count_t>
static inline int grainsize(int req, count_t count)
{
// A positive requested grain size comes from the user. A very high grain
// size risks losing parallelism, but the user told us what they want for
// grainsize. Who are we to argue?
if (req > 0)
return req;
// At present, a negative requested grain size is treated the same way as
// a zero grain size, i.e., the runtime computes the actual grainsize
// using a hueristic. In the future, the compiler may give us additional
// information about the size of the cilk_for body by passing a negative
// grain size.
// Avoid generating a zero grainsize, even for empty loops.
if (count < 1)
return 1;
global_state_t* g = cilkg_get_global_state();
if (g->under_ptool)
{
// Grainsize = 1, when running under PIN, and when the grainsize has
// not explicitly been set by the user.
return 1;
}
else
{
// Divide loop count by 8 times the worker count and round up.
const int Px8 = g->P * 8;
count_t n = (count + Px8 - 1) / Px8;
// 2K should be enough to amortize the cost of the cilk_for. Any
// larger grainsize risks losing parallelism.
if (n > 2048)
return 2048;
return (int) n; // n <= 2048, so no loss of precision on cast to int
}
}
/*
* call_cilk_for_loop_body
*
* Centralizes the code to call the loop body. The compiler should be
* inlining this code
*
* low - Low loop index we're considering in this portion of the algorithm
* high - High loop index we're considering in this portion of the algorithm
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* w - __cilkrts_worker we're currently executing on
* loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
* the cilk_for loop to flatten out the internal nodes
*/
template <typename count_t, typename F>
inline static
void call_cilk_for_loop_body(count_t low, count_t high,
F body, void *data,
__cilkrts_worker *w,
__cilkrts_pedigree *loop_root_pedigree)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// The worker is only valid until the first spawn. Fetch the
// __cilkrts_stack_frame out of the worker, since it will be stable across
// steals. The sf pointer actually points to the *parent's*
// __cilkrts_stack_frame, since this function is a non-spawning function
// and therefore has no cilk stack frame of its own.
__cilkrts_stack_frame *sf = w->current_stack_frame;
// Save the pedigree node pointed to by the worker. We'll need to restore
// that when we exit since the spawn helpers in the cilk_for call tree
// will assume that it's valid
const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
// Add the leaf pedigree node to the chain. The parent is the root node
// to flatten the tree regardless of the DAG branches in the cilk_for
// divide-and-conquer recursion.
//
// The rank is initialized to the low index. The user is
// expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
// loop body.
__cilkrts_pedigree loop_leaf_pedigree;
loop_leaf_pedigree.rank = (uint64_t)low;
loop_leaf_pedigree.parent = loop_root_pedigree;
// The worker's pedigree always starts with a rank of 0
w->pedigree.rank = 0;
w->pedigree.parent = &loop_leaf_pedigree;
// Call the compiler generated cilk_for loop body lambda function
body(data, low, high);
// The loop body may have included spawns, so we must refetch the worker
// from the __cilkrts_stack_frame, which is stable regardless of which
// worker we're executing on.
w = sf->worker;
// Restore the pedigree chain. It must be valid because the spawn helpers
// generated by the cilk_for implementation will access it.
w->pedigree.parent = saved_next_pedigree_node;
}
/* capture_spawn_arg_stack_frame
*
* Efficiently get the address of the caller's __cilkrts_stack_frame. The
* preconditons are that 'w' is the worker at the time of the call and
* 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
* spawn helper. This function should be called only within the argument list
* of a function that is being spawned because that is the only situation in
* which these preconditions hold. This function returns the worker
* (unchanged) after storing the captured stack frame pointer is stored in the
* sf argument.
*
* The purpose of this function is to get the caller's stack frame in a
* context where the caller's worker is known but its stack frame is not
* necessarily initialized. The "shrink wrap" optimization delays
* initializing the contents of a spawning function's '__cilkrts_stack_frame'
* as well as the 'current_stack_frame' pointer within the worker. By calling
* this function within a spawning function's argument list, we can ensure
* that these initializations have occured but that a detach (which would
* invalidate the worker pointer in the caller) has not yet occured. Once the
* '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
* remainder of the caller's execution, and becomes an efficient way to get
* the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
* even after a spawn or sync.
*/
inline __cilkrts_worker*
capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
{
// Get current stack frame
sf = w->current_stack_frame;
#ifdef __INTEL_COMPILER
# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
// In older compilers 'w->current_stack_frame' points to the
// spawn-helper's stack frame. In newer compiler's however, it points
// directly to the pointer's stack frame. (This change was made to avoid
// having the spawn helper in the frame list when evaluating function
// arguments, thus avoiding corruption when those arguments themselves
// contain cilk_spawns.)
// w->current_stack_frame is the spawn helper's stack frame.
// w->current_stack_frame->call_parent is the caller's stack frame.
sf = sf->call_parent;
# endif
#endif
return w;
}
/*
* cilk_for_recursive
*
* Templatized function to implement the recursive divide-and-conquer
* algorithm that's how we implement a cilk_for.
*
* low - Low loop index we're considering in this portion of the algorithm
* high - High loop index we're considering in this portion of the algorithm
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* grain - grain size (0 if it should be computed)
* w - __cilkrts_worker we're currently executing on
* loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
* the cilk_for loop to flatten out the internal nodes
*/
template <typename count_t, typename F>
static
void cilk_for_recursive(count_t low, count_t high,
F body, void *data, int grain,
__cilkrts_worker *w,
__cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
// Cilkscreen should not report this call in a stack trace
// This needs to be done everytime the worker resumes
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
count_t count = high - low;
// Invariant: count > 0, grain >= 1
if (count > grain)
{
// Invariant: count >= 2
count_t mid = low + count / 2;
// The worker is valid only until the first spawn and is expensive to
// retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
// '__cilkrts_stack_frame' is more stable, but isn't initialized until
// the first spawn. Thus, we want to grab the address of the
// '__cilkrts_stack_frame' after it is initialized but before the
// spawn detaches. The only place we can do that is within the
// argument list of the spawned function, hence the call to
// capture_spawn_arg_stack_frame().
__cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
// The current version of gcc initializes the sf structure eagerly.
// We can take advantage of this fact to avoid calling
// `capture_spawn_arg_stack_frame` when compiling with gcc.
// Remove this if the "shrink-wrap" optimization is implemented.
sf = w->current_stack_frame;
_Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
loop_root_pedigree);
#else
_Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
capture_spawn_arg_stack_frame(sf, w),
loop_root_pedigree);
#endif
w = sf->worker;
low = mid;
goto tail_recurse;
}
// Call the cilk_for loop body lambda function passed in by the compiler to
// execute one grain
call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}
static void noop() { }
/*
* cilk_for_root
*
* Templatized function to implement the top level of a cilk_for loop.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
template <typename count_t, typename F>
static void cilk_for_root(F body, void *data, count_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Pedigree computation:
//
// If the last pedigree node on entry to the _Cilk_for has value X,
// then at the start of each iteration of the loop body, the value of
// the last pedigree node should be 0, the value of the second-to-last
// node should equal the loop counter, and the value of the
// third-to-last node should be X. On return from the _Cilk_for, the
// value of the last pedigree should be incremented to X+2. The
// pedigree within the loop is thus flattened, such that the depth of
// recursion does not affect the results either inside or outside of
// the loop. Note that the pedigree after the loop exists is the same
// as if a single spawn and sync were executed within this function.
// TBD: Since the shrink-wrap optimization was turned on in the compiler,
// it is not possible to get the current stack frame without actually
// forcing a call to bind-thread. This spurious spawn is a temporary
// stopgap until the correct intrinsics are added to give us total control
// over frame initialization.
_Cilk_spawn noop();
// Fetch the current worker. From that we can get the current stack frame
// which will be constant even if we're stolen
__cilkrts_worker *w = __cilkrts_get_tls_worker();
__cilkrts_stack_frame *sf = w->current_stack_frame;
// Decrement the rank by one to undo the pedigree change from the
// _Cilk_spawn
--w->pedigree.rank;
// Save the current worker pedigree into loop_root_pedigree, which will be
// the root node for our flattened pedigree.
__cilkrts_pedigree loop_root_pedigree = w->pedigree;
// Don't splice the loop_root node in yet. It will be done when we
// call the loop body lambda function
// w->pedigree.rank = 0;
// w->pedigree.next = &loop_root_pedigree;
/* Spawn is necessary at top-level to force runtime to start up.
* Runtime must be started in order to call the grainsize() function.
*/
int gs = grainsize(grain, count);
cilk_for_recursive((count_t) 0, count, body, data, gs, w,
&loop_root_pedigree);
// Need to refetch the worker after calling a spawning function.
w = sf->worker;
// Restore the pedigree in the worker.
w->pedigree = loop_root_pedigree;
// Bump the worker pedigree.
++w->pedigree.rank;
// Implicit sync will increment the pedigree leaf rank again, for a total
// of two increments. If the noop spawn above is removed, then we'll need
// to re-enable the following code:
// // If this is an optimized build, then the compiler will have optimized
// // out the increment of the worker's pedigree in the implied sync. We
// // need to add one to make the pedigree_loop test work correctly.
// #if CILKRTS_OPTIMIZED
// ++sf->worker->pedigree.rank;
// #endif
}
// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
// __cilkrts_cilk_for_64.
extern "C" {
/*
* __cilkrts_cilk_for_32
*
* Implementation of cilk_for for 32-bit trip counts (regardless of processor
* word size). Assumes that the range is 0 - count.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
cilk32_t count, int grain)
{
// Cilkscreen should not report this call in a stack trace
NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
// Check for an empty range here as an optimization - don't need to do any
// __cilkrts_stack_frame initialization
if (count > 0)
cilk_for_root(body, data, count, grain);
}
/*
* __cilkrts_cilk_for_64
*
* Implementation of cilk_for for 64-bit trip counts (regardless of processor
* word size). Assumes that the range is 0 - count.
*
* body - lambda function for the cilk_for loop body
* data - data used by the lambda function
* count - trip count for loop
* grain - grain size (0 if it should be computed)
*/
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
cilk64_t count, int grain)
{
// Check for an empty range here as an optimization - don't need to do any
// __cilkrts_stack_frame initialization
if (count > 0)
cilk_for_root(body, data, count, grain);
}
} // end extern "C"
/* End cilk-abi-cilk-for.cpp */
|