1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
|
#ifndef RITSUKO_CHOOSE_MISSING_PLACEHOLDER_HPP
#define RITSUKO_CHOOSE_MISSING_PLACEHOLDER_HPP
#include <algorithm>
#include <cmath>
#include <limits>
#include <set>
#include <type_traits>
#include <utility>
/**
* @file choose_missing_placeholder.hpp
* @brief Choose a placeholder for missing values.
*/
namespace ritsuko {
/**
* @cond
*/
// Reports whether `candidate` compares equal to any unmasked value in [start, end).
// If `Mask` is exactly `bool`, there is no mask and every value is inspected.
// Otherwise `mask` is advanced in lockstep with `start`, and values whose mask entry is truthy are ignored.
template<class Iterator, class Mask, class Type>
bool found(Iterator start, Iterator end, Mask mask, Type candidate) {
    if constexpr(!std::is_same<Mask, bool>::value) {
        while (start != end) {
            // The mask entry is checked first so that masked values are never compared.
            if (!*mask && candidate == *start) {
                return true;
            }
            ++start;
            ++mask;
        }
        return false;
    } else {
        return std::any_of(start, end, [&candidate](const auto& value) { return value == candidate; });
    }
}
// Gathers the distinct unmasked values of [start, end) into an ordered set.
// If `Mask` is exactly `bool`, there is no mask and every value is kept.
// Otherwise `mask` is walked alongside the values, and only values whose mask entry is falsy are kept.
template<class Iterator, class Mask, class Type = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::set<Type> create_unique_set(Iterator start, Iterator end, Mask mask) {
    std::set<Type> collected;
    if constexpr(std::is_same<Mask, bool>::value) {
        collected.insert(start, end);
    } else {
        while (start != end) {
            if (!*mask) {
                collected.insert(*start);
            }
            ++start;
            ++mask;
        }
    }
    return collected;
}
// Reports whether any unmasked value in [start, end) is NaN.
// If `Mask` is exactly `bool`, there is no mask and every value is tested.
// Otherwise `mask` is walked alongside the values, and values whose mask entry is truthy are skipped.
template<class Iterator, class Mask, class Type = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
bool check_for_nan(Iterator start, Iterator end, Mask mask) {
    if constexpr(std::is_same<Mask, bool>::value) {
        return std::any_of(start, end, [](const auto& value) { return std::isnan(value); });
    } else {
        for (; start != end; ++start, ++mask) {
            if (*mask) {
                continue;
            }
            if (std::isnan(*start)) {
                return true;
            }
        }
        return false;
    }
}
/**
* @endcond
*/
/**
 * Pick a placeholder to represent missing values in an integer dataset, considering only the unmasked values.
 * Candidates are tried in a fixed order: the type's minimum (signed types only), the type's maximum, and zero.
 * If all of these are already in use, the distinct unmasked values are sorted and the first unused integer between two neighbours is chosen.
 *
 * @tparam Iterator Forward iterator for integer values.
 * @tparam Mask Iterator for mask values, or `bool` if no values are masked.
 * @tparam Type_ Integer type pointed to by `Iterator`.
 *
 * @param start Start of the dataset.
 * @param end End of the dataset.
 * @param mask Start of the mask vector.
 * This should have the same length as `end - start`; each entry is true if the corresponding value of the integer dataset is masked, and false otherwise.
 *
 * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
 */
template<class Iterator, class Mask, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterator end, Mask mask) {
    static_assert(std::numeric_limits<Type_>::is_integer);
    using Limits = std::numeric_limits<Type_>;

    // Boundary values only need a linear scan each, so they are tried before any sorting.
    // For unsigned types the minimum is zero, which is covered by the zero check further down.
    if constexpr(Limits::is_signed) {
        const Type_ lowest = Limits::min();
        if (!found(start, end, mask, lowest)) {
            return std::pair<bool, Type_>(true, lowest);
        }
    }

    const Type_ highest = Limits::max();
    if (!found(start, end, mask, highest)) {
        return std::pair<bool, Type_>(true, highest);
    }

    if (!found(start, end, mask, 0)) {
        return std::pair<bool, Type_>(true, 0);
    }

    // Fall back to walking the distinct values in ascending order and taking the first gap.
    // The maximum is known to be present here, so it is the final element and
    // `previous + 1` is never evaluated with `previous` equal to the maximum.
    const auto distinct = create_unique_set(start, end, mask);
    Type_ previous = Limits::min();
    for (auto it = distinct.begin(); it != distinct.end(); ++it) {
        const auto successor = previous + 1;
        if (successor < *it) {
            return std::make_pair(true, successor);
        }
        previous = *it;
    }

    // Every representable value is already in use.
    return std::pair<bool, Type_>(false, 0);
}
/**
 * Convenience overload of `choose_missing_integer_placeholder()` for datasets in which no values are masked.
 *
 * @tparam Iterator Forward iterator for integer values.
 * @tparam Type_ Integer type pointed to by `Iterator`.
 *
 * @param start Start of the dataset.
 * @param end End of the dataset.
 *
 * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
 */
template<class Iterator, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_integer_placeholder(Iterator start, Iterator end) {
    // A plain `bool` passed in the mask position selects the unmasked code path.
    constexpr bool no_mask = false;
    return choose_missing_integer_placeholder(start, end, no_mask);
}
/**
 * Choose an appropriate placeholder for missing values in a floating-point dataset, after ignoring all masked values.
 * For IEEE 754 types, this first tries NaN (unless `skip_nan` is true), then positive and negative infinity.
 * It then tries the type's lowest value, its maximum value, and zero,
 * before sorting the distinct finite values and searching for an unused value halfway between two neighbours.
 *
 * @tparam Iterator Forward iterator for floating-point values.
 * @tparam Mask Iterator for mask values, or `bool` if no values are masked.
 * @tparam Type_ Float type pointed to by `Iterator`.
 *
 * @param start Start of the dataset.
 * @param end End of the dataset.
 * @param mask Start of the mask vector.
 * This should have the same length as `end - start`; each entry is true if the corresponding value of the dataset is masked, and false otherwise.
 * @param skip_nan Whether to skip NaN as a potential placeholder.
 * Useful in frameworks like R that need special consideration of NaN payloads.
 * This argument has no default in this overload and must always be supplied.
 *
 * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
 */
template<class Iterator, class Mask, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator end, Mask mask, bool skip_nan) {
    if constexpr(std::numeric_limits<Type_>::is_iec559) {
        if (!skip_nan) {
            if (!check_for_nan(start, end, mask)) {
                return std::make_pair(true, std::numeric_limits<Type_>::quiet_NaN());
            }
        }

        // Trying positive and negative Infs.
        auto inf = std::numeric_limits<Type_>::infinity();
        if (!found(start, end, mask, inf)) {
            return std::make_pair(true, inf);
        }

        auto ninf = -inf;
        if (!found(start, end, mask, ninf)) {
            return std::make_pair(true, ninf);
        }
    }

    // Trying important points first; lowest and maximum values, and 0.
    {
        auto candidate = std::numeric_limits<Type_>::lowest();
        if (!found(start, end, mask, candidate)) {
            return std::make_pair(true, candidate);
        }
    }

    {
        auto candidate = std::numeric_limits<Type_>::max();
        if (!found(start, end, mask, candidate)) {
            return std::make_pair(true, candidate);
        }
    }

    if (!found(start, end, mask, 0)) {
        return std::make_pair(true, 0);
    }

    // Well... going through it in order.
    // Only finite values are collected. NaN must never reach the set because it
    // breaks the strict weak ordering that std::set relies on: a NaN inserted first
    // makes every later value look like a duplicate, so usable gaps would be missed.
    // Infinities cannot serve as midpoint bounds either, so they are dropped here too.
    std::set<Type_> finite_values;
    auto keep_if_finite = [&finite_values](Type_ value) {
        if (std::isfinite(value)) {
            finite_values.insert(value);
        }
    };

    if constexpr(std::is_same<Mask, bool>::value) {
        for (auto it = start; it != end; ++it) {
            keep_if_finite(*it);
        }
    } else {
        auto mIt = mask;
        for (auto it = start; it != end; ++it, ++mIt) {
            if (!*mIt) {
                keep_if_finite(*it);
            }
        }
    }

    // Reaching this point means the lowest value, the maximum and zero are all present.
    // Neighbouring values in the sorted set therefore never straddle zero, which keeps
    // `x - last` within the representable range.
    Type_ last = std::numeric_limits<Type_>::lowest();
    for (auto x : finite_values) {
        Type_ candidate = last + (x - last) / 2;
        // The midpoint may round onto either neighbour when they are adjacent floats.
        if (candidate != last && candidate != x) {
            return std::make_pair(true, candidate);
        }
        last = x;
    }

    return std::make_pair(false, 0);
}
/**
 * Convenience overload of `choose_missing_float_placeholder()` for datasets in which no values are masked.
 *
 * @tparam Iterator Forward iterator for floating-point values.
 * @tparam Type_ Float type pointed to by `Iterator`.
 *
 * @param start Start of the dataset.
 * @param end End of the dataset.
 * @param skip_nan Whether to skip NaN as a potential placeholder.
 *
 * @return Pair containing (i) a boolean indicating whether a placeholder was successfully found, and (ii) the chosen placeholder if the previous boolean is true.
 */
template<class Iterator, class Type_ = typename std::remove_cv<typename std::remove_reference<decltype(*(std::declval<Iterator>()))>::type>::type>
std::pair<bool, Type_> choose_missing_float_placeholder(Iterator start, Iterator end, bool skip_nan = false) {
    // A plain `bool` passed in the mask position selects the unmasked code path.
    constexpr bool no_mask = false;
    return choose_missing_float_placeholder(start, end, no_mask, skip_nan);
}
}
#endif
|