File: fast_div.hpp

package info (click to toggle)
primecount 7.16%2Bds-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 1,724 kB
  • sloc: cpp: 18,769; ansic: 102; makefile: 89; sh: 86
file content (146 lines) | stat: -rw-r--r-- 4,459 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
///
/// @file  fast_div.hpp
/// @brief Integer division of small types is much faster than integer
///        division of large types on most CPUs. The fast_div(x, y)
///        function tries to take advantage of this by casting x and y
///        to smaller types (if possible) before doing the division.
///
///        If ENABLE_DIV32 is defined we check at runtime if the
///        dividend and divisor are < 2^32 and if so we use 32-bit
///        integer division instead of 64-bit integer division. On
///        most CPUs before 2020 this significantly improves
///        performance.
///
///        On some new CPUs (such as Intel Cannonlake) 64-bit integer
///        division has been improved significantly and runs as fast
///        as 32-bit integer division. For such CPUs it is best to
///        disable ENABLE_DIV32 (using cmake -DWITH_DIV32=OFF) as this
///        avoids runtime checks for (64-bit / 32-bit) divisions.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef FAST_DIV_HPP
#define FAST_DIV_HPP

#include <macros.hpp>
#include <int128_t.hpp>

#include <stdint.h>

namespace {

/// Used for (64-bit / 32-bit) = 64-bit.
template <typename X, typename Y>
ALWAYS_INLINE typename std::enable_if<(sizeof(X) == sizeof(uint64_t) &&
                                       sizeof(Y) <= sizeof(uint32_t)), X>::type
fast_div(X x, Y y)
{
  ASSERT(x >= 0);
  ASSERT(y > 0);

#if defined(ENABLE_DIV32)

  using UX = typename pstd::make_unsigned<X>::type;
  using UY = typename pstd::make_unsigned<Y>::type;

  if (x <= pstd::numeric_limits<uint32_t>::max())
    return uint32_t(x) / UY(y);
  else
    return UX(x) / UY(y);
#else
  // Unsigned integer division is usually
  // faster than signed integer division.
  using UX = typename pstd::make_unsigned<X>::type;
  using UY = typename pstd::make_unsigned<Y>::type;
  return UX(x) / UY(y);
#endif
}

/// Used for  (64-bit /  64-bit) =  64-bit.
/// Used for (128-bit / 128-bit) = 128-bit.
template <typename X, typename Y>
ALWAYS_INLINE typename std::enable_if<(sizeof(X) >= sizeof(uint64_t) &&
                                       sizeof(Y) == sizeof(X)), X>::type
fast_div(X x, Y y)
{
  ASSERT(x >= 0);
  ASSERT(y > 0);

  // Unsigned integer division is usually
  // faster than signed integer division.
  using UX = typename pstd::make_unsigned<X>::type;
  using UY = typename pstd::make_unsigned<Y>::type;
  return UX(x) / UY(y);
}

/// Used for (128-bit / 32-bit) = 128-bit.
/// Used for (128-bit / 64-bit) = 128-bit.
template <typename X, typename Y>
ALWAYS_INLINE typename std::enable_if<(sizeof(X) > sizeof(uint64_t) &&
                                       sizeof(Y) <= sizeof(uint64_t)), X>::type
fast_div(X x, Y y)
{
  ASSERT(x >= 0);
  ASSERT(y > 0);

  // Unsigned integer division is usually
  // faster than signed integer division.
  using UX = typename pstd::make_unsigned<X>::type;
  using UY = typename pstd::make_unsigned<Y>::type;

  if (x <= pstd::numeric_limits<uint64_t>::max())
    return uint64_t(x) / UY(y);
  else
    return UX(x) / UY(y);
}

/// Used for (128-bit / 32-bit) = 64-bit.
/// Used for (128-bit / 64-bit) = 64-bit.
/// Use this function only when you know for sure
/// that the result is < 2^64.
///
template <typename X, typename Y>
ALWAYS_INLINE typename std::enable_if<(sizeof(X) > sizeof(uint64_t) &&
                                       sizeof(Y) <= sizeof(uint64_t)), uint64_t>::type
fast_div64(X x, Y y)
{
  ASSERT(x >= 0);
  ASSERT(y > 0);

#if defined(__x86_64__) && \
   (defined(__GNUC__) || defined(__clang__))

  uint64_t x0 = (uint64_t) x;
  uint64_t x1 = ((uint64_t*) &x)[1];
  uint64_t d = y;

  // (128-bit / 64-bit) = 64-bit.
  // When we know the result fits into 64-bit (even
  // though the numerator is 128-bit) we can use the divq
  // instruction instead of doing a full 128-bit division.
  __asm__("div %[divider]"
          : "+a"(x0), "+d"(x1) : [divider] "r"(d));

  return x0;
#else
  return (uint64_t) fast_div(x, y);
#endif
}

/// Used for (64-bit / 32-bit) = 64-bit.
/// Used for (64-bit / 64-bit) = 64-bit.
template <typename X, typename Y>
ALWAYS_INLINE typename std::enable_if<(sizeof(X) <= sizeof(uint64_t) &&
                                       sizeof(Y) <= sizeof(X)), uint64_t>::type
fast_div64(X x, Y y)
{
  return (uint64_t) fast_div(x, y);
}

} // namespace

#endif