1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
// -*- C++ -*-
// -*-===----------------------------------------------------------------------===//
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
//
//===----------------------------------------------------------------------===//
#ifndef _PSTL_INTERNAL_OMP_UTIL_H
#define _PSTL_INTERNAL_OMP_UTIL_H
#include <algorithm>
#include <atomic>
#include <iterator>
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>
#include <omp.h>
#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
#include "../utils.h"
// Portability "#pragma" definition
#ifdef _MSC_VER
# define _PSTL_PRAGMA(x) __pragma(x)
#else
# define _PSTL_PRAGMA(x) _Pragma(# x)
#endif
namespace __pstl
{
namespace __omp_backend
{
//------------------------------------------------------------------------
// use to cancel execution
//------------------------------------------------------------------------
inline void
__cancel_execution()
{
// TODO: Figure out how to make cancelation work.
}
//------------------------------------------------------------------------
// raw buffer
//------------------------------------------------------------------------
template <typename _Tp>
class __buffer
{
std::allocator<_Tp> __allocator_;
_Tp* __ptr_;
const std::size_t __buf_size_;
__buffer(const __buffer&) = delete;
void
operator=(const __buffer&) = delete;
public:
__buffer(std::size_t __n) : __allocator_(), __ptr_(__allocator_.allocate(__n)), __buf_size_(__n) {}
operator bool() const { return __ptr_ != nullptr; }
_Tp*
get() const
{
return __ptr_;
}
~__buffer() { __allocator_.deallocate(__ptr_, __buf_size_); }
};
// Preliminary size of each chunk: requires further discussion
inline constexpr std::size_t __default_chunk_size = 2048;
// Convenience function to determine when we should run serial.
template <typename _Iterator, std::enable_if_t<!std::is_integral<_Iterator>::value, bool> = true>
constexpr auto
__should_run_serial(_Iterator __first, _Iterator __last) -> bool
{
using _difference_type = typename std::iterator_traits<_Iterator>::difference_type;
auto __size = std::distance(__first, __last);
return __size <= static_cast<_difference_type>(__default_chunk_size);
}
template <typename _Index, std::enable_if_t<std::is_integral<_Index>::value, bool> = true>
constexpr auto
__should_run_serial(_Index __first, _Index __last) -> bool
{
using _difference_type = _Index;
auto __size = __last - __first;
return __size <= static_cast<_difference_type>(__default_chunk_size);
}
struct __chunk_metrics
{
std::size_t __n_chunks;
std::size_t __chunk_size;
std::size_t __first_chunk_size;
};
// The iteration space partitioner according to __requested_chunk_size
template <class _RandomAccessIterator, class _Size = std::size_t>
auto
__chunk_partitioner(_RandomAccessIterator __first, _RandomAccessIterator __last,
_Size __requested_chunk_size = __default_chunk_size) -> __chunk_metrics
{
/*
* This algorithm improves distribution of elements in chunks by avoiding
* small tail chunks. The leftover elements that do not fit neatly into
* the chunk size are redistributed to early chunks. This improves
* utilization of the processor's prefetch and reduces the number of
* tasks needed by 1.
*/
const _Size __n = __last - __first;
_Size __n_chunks = 0;
_Size __chunk_size = 0;
_Size __first_chunk_size = 0;
if (__n < __requested_chunk_size)
{
__chunk_size = __n;
__first_chunk_size = __n;
__n_chunks = 1;
return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size};
}
__n_chunks = (__n / __requested_chunk_size) + 1;
__chunk_size = __n / __n_chunks;
__first_chunk_size = __chunk_size;
const _Size __n_leftover_items = __n - (__n_chunks * __chunk_size);
if (__n_leftover_items == __chunk_size)
{
__n_chunks += 1;
return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size};
}
else if (__n_leftover_items == 0)
{
__first_chunk_size = __chunk_size;
return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size};
}
const _Size __n_extra_items_per_chunk = __n_leftover_items / __n_chunks;
const _Size __n_final_leftover_items = __n_leftover_items - (__n_extra_items_per_chunk * __n_chunks);
__chunk_size += __n_extra_items_per_chunk;
__first_chunk_size = __chunk_size + __n_final_leftover_items;
return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size};
}
template <typename _Iterator, typename _Index, typename _Func>
void
__process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chunk_index, _Func __f)
{
auto __this_chunk_size = __chunk_index == 0 ? __metrics.__first_chunk_size : __metrics.__chunk_size;
auto __index = __chunk_index == 0 ? 0
: (__chunk_index * __metrics.__chunk_size) +
(__metrics.__first_chunk_size - __metrics.__chunk_size);
auto __first = __base + __index;
auto __last = __first + __this_chunk_size;
__f(__first, __last);
}
} // namespace __omp_backend
} // namespace __pstl
#endif // _PSTL_INTERNAL_OMP_UTIL_H
|