1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
#include "parallel_for.h"
namespace embree
{
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value sequential_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
return func(range<Index>(first,last));
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value sequential_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
return func(range<Index>(first,last));
}
template<typename Index, typename Value, typename Func, typename Reduction>
__noinline Value parallel_reduce_internal( Index taskCount, const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
const Index maxTasks = 512;
const Index threadCount = (Index) TaskScheduler::threadCount();
taskCount = min(taskCount,threadCount,maxTasks);
/* parallel invocation of all tasks */
dynamic_large_stack_array(Value,values,taskCount,8192); // consumes at most 8192 bytes on the stack
parallel_for(taskCount, [&](const Index taskIndex) {
const Index k0 = first+(taskIndex+0)*(last-first)/taskCount;
const Index k1 = first+(taskIndex+1)*(last-first)/taskCount;
values[taskIndex] = func(range<Index>(k0,k1));
});
/* perform reduction over all tasks */
Value v = identity;
for (Index i=0; i<taskCount; i++) v = reduction(v,values[i]);
return v;
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Value& identity, const Func& func, const Reduction& reduction )
{
#if defined(TASKING_INTERNAL) && !defined(TASKING_TBB)
/* fast path for small number of iterations */
Index taskCount = (last-first+minStepSize-1)/minStepSize;
if (likely(taskCount == 1)) {
return func(range<Index>(first,last));
}
return parallel_reduce_internal(taskCount,first,last,minStepSize,identity,func,reduction);
#elif defined(TASKING_TBB)
#if TBB_INTERFACE_VERSION >= 12002
tbb::task_group_context context;
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction,context);
if (context.is_group_execution_cancelled())
throw std::runtime_error("task cancelled");
return v;
#else
const Value v = tbb::parallel_reduce(tbb::blocked_range<Index>(first,last,minStepSize),identity,
[&](const tbb::blocked_range<Index>& r, const Value& start) { return reduction(start,func(range<Index>(r.begin(),r.end()))); },
reduction);
if (tbb::task::self().is_cancelled())
throw std::runtime_error("task cancelled");
return v;
#endif
#else // TASKING_PPL
struct AlignedValue
{
char storage[__alignof(Value)+sizeof(Value)];
static uintptr_t alignUp(uintptr_t p, size_t a) { return p + (~(p - 1) % a); };
Value* getValuePtr() { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
const Value* getValuePtr() const { return reinterpret_cast<Value*>(alignUp(uintptr_t(storage), __alignof(Value))); }
AlignedValue(const Value& v) { new(getValuePtr()) Value(v); }
AlignedValue(const AlignedValue& v) { new(getValuePtr()) Value(*v.getValuePtr()); }
AlignedValue(const AlignedValue&& v) { new(getValuePtr()) Value(*v.getValuePtr()); };
AlignedValue& operator = (const AlignedValue& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
AlignedValue& operator = (const AlignedValue&& v) { *getValuePtr() = *v.getValuePtr(); return *this; };
operator Value() const { return *getValuePtr(); }
};
struct Iterator_Index
{
Index v;
typedef std::forward_iterator_tag iterator_category;
typedef AlignedValue value_type;
typedef Index difference_type;
typedef Index distance_type;
typedef AlignedValue* pointer;
typedef AlignedValue& reference;
__forceinline Iterator_Index() {}
__forceinline Iterator_Index(Index v) : v(v) {}
__forceinline bool operator== (Iterator_Index other) { return v == other.v; }
__forceinline bool operator!= (Iterator_Index other) { return v != other.v; }
__forceinline Iterator_Index operator++() { return Iterator_Index(++v); }
__forceinline Iterator_Index operator++(int) { return Iterator_Index(v++); }
};
auto range_reduction = [&](Iterator_Index begin, Iterator_Index end, const AlignedValue& start) {
assert(begin.v < end.v);
return reduction(start, func(range<Index>(begin.v, end.v)));
};
const Value v = concurrency::parallel_reduce(Iterator_Index(first), Iterator_Index(last), AlignedValue(identity), range_reduction, reduction);
return v;
#endif
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
if (likely(last-first < parallel_threshold)) {
return func(range<Index>(first,last));
} else {
return parallel_reduce(first,last,minStepSize,identity,func,reduction);
}
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const range<Index> range, const Index minStepSize, const Index parallel_threshold, const Value& identity, const Func& func, const Reduction& reduction )
{
return parallel_reduce(range.begin(),range.end(),minStepSize,parallel_threshold,identity,func,reduction);
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const Index first, const Index last, const Value& identity, const Func& func, const Reduction& reduction )
{
auto funcr = [&] ( const range<Index> r ) {
Value v = identity;
for (Index i=r.begin(); i<r.end(); i++)
v = reduction(v,func(i));
return v;
};
return parallel_reduce(first,last,Index(1),identity,funcr,reduction);
}
template<typename Index, typename Value, typename Func, typename Reduction>
__forceinline Value parallel_reduce( const range<Index> range, const Value& identity, const Func& func, const Reduction& reduction )
{
return parallel_reduce(range.begin(),range.end(),Index(1),identity,func,reduction);
}
}
|