File: gcd_bench.cpp

package info (click to toggle)
boost1.90 1.90.0-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 593,156 kB
  • sloc: cpp: 4,190,642; xml: 196,648; python: 34,618; ansic: 23,145; asm: 5,468; sh: 3,776; makefile: 1,161; perl: 1,020; sql: 728; ruby: 676; yacc: 478; java: 77; lisp: 24; csh: 6
file content (218 lines) | stat: -rw-r--r-- 7,054 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
//  Copyright 2020 John Maddock. Distributed under the Boost
//  Software License, Version 1.0. (See accompanying file
//  LICENSE_1_0.txt or copy at https://www.boost.org/LICENSE_1_0.txt)

#include <iostream>
#include <benchmark/benchmark.h>
#include <boost/multiprecision/cpp_int.hpp>
#include <boost/multiprecision/gmp.hpp>
#include <boost/random.hpp>
#include <cmath>

#include <immintrin.h>

using namespace boost::multiprecision;
using namespace boost::random;

namespace boost {
   namespace multiprecision {
      namespace backends {

      //
      // Greatest common divisor of two non-trivial cpp_int_backend values, written
      // to `result`.  Presumably a copy of an earlier library eval_gcd, kept under
      // a separate name so the benchmarks in this file can time it against the
      // current eval_gcd - confirm against the library history.
      //
      // Outline:
      //   1. If either operand is a single limb, delegate to the eval_gcd overload
      //      that takes a limb as its third argument.
      //   2. Work on non-negative copies u and v; if either is zero the other one
      //      is the answer.
      //   3. Remove the trailing zero bits from both, remembering the smaller
      //      count in `shift`.
      //   4. Loop: order the pair so that u <= v, take a modulus step while the
      //      sizes are badly mismatched, finish with a built-in integer gcd once
      //      v fits in two limbs, otherwise do one subtract-and-shift step.
      //   5. Shift the surviving value left by `shift` to restore the common
      //      power of two.
      //
      // Parameters:
      //   result - receives the gcd.
      //   a, b   - the operands; they are copied, never modified.
      //
      template <unsigned MinBits1, unsigned MaxBits1, cpp_integer_type SignType1, cpp_int_check_type Checked1, class Allocator1>
      inline BOOST_MP_CXX14_CONSTEXPR typename std::enable_if<!is_trivial_cpp_int<cpp_int_backend<MinBits1, MaxBits1, SignType1, Checked1, Allocator1> >::value>::type
      eval_gcd_old(
          cpp_int_backend<MinBits1, MaxBits1, SignType1, Checked1, Allocator1>&       result,
          const cpp_int_backend<MinBits1, MaxBits1, SignType1, Checked1, Allocator1>& a,
          const cpp_int_backend<MinBits1, MaxBits1, SignType1, Checked1, Allocator1>& b)
      {
         using default_ops::eval_get_sign;
         using default_ops::eval_is_zero;
         using default_ops::eval_lsb;

         // Single-limb operands are handed to the (backend, limb) overload.
         if (a.size() == 1)
         {
            eval_gcd(result, b, *a.limbs());
            return;
         }
         if (b.size() == 1)
         {
            eval_gcd(result, a, *b.limbs());
            return;
         }

         // Mutable working copies: everything below modifies u and v in place.
         cpp_int_backend<MinBits1, MaxBits1, SignType1, Checked1, Allocator1> u(a), v(b);

         int s = eval_get_sign(u);

         /* GCD(0,x) := x */
         if (s < 0)
         {
            u.negate();
         }
         else if (s == 0)
         {
            result = v;
            return;
         }
         s = eval_get_sign(v);
         if (s < 0)
         {
            v.negate();
         }
         else if (s == 0)
         {
            result = u;
            return;
         }

         /* Let shift := lg K, where K is the greatest power of 2
            dividing both u and v. */

         unsigned us    = eval_lsb(u);
         unsigned vs    = eval_lsb(v);
         int      shift = (std::min)(us, vs);
         // Drop all trailing zero bits so that both values are odd.
         eval_right_shift(u, us);
         eval_right_shift(v, vs);

         do
         {
            /* Now u and v are both odd, so diff(u, v) is even.
               Let u = min(u, v), v = diff(u, v)/2. */
            s = u.compare(v);
            if (s > 0)
               u.swap(v);
            // Equal values: u already holds the (odd part of the) gcd.
            if (s == 0)
               break;

            while (((u.size() + 2 < v.size()) && (v.size() * 100 / u.size() > 105)) || ((u.size() <= 2) && (v.size() > 4)))
            {
               //
               // Special case: if u and v differ considerably in size, then a Euclid step
               // is more efficient as we reduce v by several limbs in one go.
               // Unfortunately it requires an expensive long division:
               //
               // NOTE(review): the remainder that becomes the new u after the swap is
               // not visibly guaranteed to be odd or non-zero - confirm that the loop
               // invariants still hold on this path.
               //
               eval_modulus(v, v, u);
               u.swap(v);
            }
            if (v.size() <= 2)
            {
               //
               // Special case: if v has no more than 2 limbs
               // then we can reduce u and v to a pair of integers and perform
               // direct integer gcd:
               //
               if (v.size() == 1)
                  u = eval_gcd(*v.limbs(), *u.limbs());
               else
               {
                  // Pack the low two limbs of each value into one double-width integer.
                  double_limb_type i = v.limbs()[0] | (static_cast<double_limb_type>(v.limbs()[1]) << sizeof(limb_type) * CHAR_BIT);
                  double_limb_type j = (u.size() == 1) ? *u.limbs() : u.limbs()[0] | (static_cast<double_limb_type>(u.limbs()[1]) << sizeof(limb_type) * CHAR_BIT);
                  u                  = eval_gcd(i, j);
               }
               break;
            }
            //
            // Regular binary gcd case:
            //
            // v -= u gives an even value; shifting out its trailing zeros makes it odd again.
            eval_subtract(v, u);
            vs = eval_lsb(v);
            eval_right_shift(v, vs);
         } while (true);

         // Re-apply the power of two that was common to both inputs.
         result = u;
         eval_left_shift(result, shift);
      }

      }
   }
}

//
// Returns the cached benchmark data set for operands of `bits` bits,
// generating it the first time a given (T, bits) combination is requested.
//
// The tuple holds three parallel vectors of equal length:
//   <0>  a - the larger operand of each pair,
//   <1>  b - the smaller operand of each pair (the pair is swapped if needed),
//   <2>  c - zero-initialised slots for the benchmarks to store results in.
//
// The data lives in a function-local static map keyed on `bits`, so the
// returned reference stays valid for the rest of the program and every
// benchmark instantiated on the same T sees identical input.  The generator
// is default-constructed, i.e. it is not seeded from the clock.
//
// `bits` must be at least 1: `bits - 1` is used as a shift count.
//
template <class T>
std::tuple<std::vector<T>, std::vector<T>, std::vector<T> >& get_test_vector(unsigned bits)
{
   using data_set = std::tuple<std::vector<T>, std::vector<T>, std::vector<T> >;

   static std::map<unsigned, data_set> data;

   constexpr unsigned sample_count = 1000;

   data_set& result = data[bits];

   std::vector<T>& a = std::get<0>(result);
   std::vector<T>& b = std::get<1>(result);
   std::vector<T>& c = std::get<2>(result);

   if (a.empty())
   {
      mt19937 mt;

      // uniform_int_distribution samples the closed interval [min, max], so the
      // upper bound must be 2^bits - 1: using 2^bits itself would admit a value
      // that is one bit wider than requested.
      const T lowest(T(1) << (bits - 1));
      const T highest((T(1) << bits) - 1);

      uniform_int_distribution<T> ui(lowest, highest);

      // The final size is known up front; avoid repeated reallocation.
      a.reserve(sample_count);
      b.reserve(sample_count);
      c.reserve(sample_count);

      for (unsigned i = 0; i < sample_count; ++i)
      {
         a.push_back(ui(mt));
         b.push_back(ui(mt));
         // Keep the pair ordered so that a[i] >= b[i].
         if (b.back() > a.back())
            b.back().swap(a.back());
         c.push_back(0);
      }
   }
   return result;
}

// Shortcut for element 0 of the get_test_vector<T>(bits) tuple (the "a" operands).
template <class T>
std::vector<T>& get_test_vector_a(unsigned bits)
{
   auto& data_set = get_test_vector<T>(bits);
   return std::get<0>(data_set);
}
// Shortcut for element 1 of the get_test_vector<T>(bits) tuple (the "b" operands).
template <class T>
std::vector<T>& get_test_vector_b(unsigned bits)
{
   auto& data_set = get_test_vector<T>(bits);
   return std::get<1>(data_set);
}
// Shortcut for element 2 of the get_test_vector<T>(bits) tuple (the result slots).
template <class T>
std::vector<T>& get_test_vector_c(unsigned bits)
{
   auto& data_set = get_test_vector<T>(bits);
   return std::get<2>(data_set);
}


//
// Benchmark body for eval_gcd_old.
//
// state.range(0) is the operand width in bits.  Each timed iteration runs
// eval_gcd_old over every operand pair of the cached data set for that width,
// storing each result in the matching slot of the third vector.  The bit
// count is also reported as the complexity parameter N.
//
template <typename T>
static void BM_gcd_old(benchmark::State& state)
{
   const int bits = state.range(0);

   std::vector<T>& lhs = get_test_vector_a<T>(bits);
   std::vector<T>& rhs = get_test_vector_b<T>(bits);
   std::vector<T>& out = get_test_vector_c<T>(bits);

   typedef typename std::vector<T>::size_type index_type;
   const index_type pair_count = lhs.size();

   for (auto _ : state)
   {
      for (index_type k = 0; k != pair_count; ++k)
      {
         eval_gcd_old(out[k].backend(), lhs[k].backend(), rhs[k].backend());
      }
   }
   state.SetComplexityN(bits);
}

//
// Benchmark body for the library's eval_gcd.
//
// state.range(0) is the operand width in bits.  Each timed iteration runs
// eval_gcd over every operand pair of the cached data set for that width,
// storing each result in the matching slot of the third vector.  The bit
// count is also reported as the complexity parameter N.
//
template <typename T>
static void BM_gcd_current(benchmark::State& state)
{
   const int bits = state.range(0);

   std::vector<T>& lhs = get_test_vector_a<T>(bits);
   std::vector<T>& rhs = get_test_vector_b<T>(bits);
   std::vector<T>& out = get_test_vector_c<T>(bits);

   typedef typename std::vector<T>::size_type index_type;
   const index_type pair_count = lhs.size();

   for (auto _ : state)
   {
      for (index_type k = 0; k != pair_count; ++k)
      {
         eval_gcd(out[k].backend(), lhs[k].backend(), rhs[k].backend());
      }
   }
   state.SetComplexityN(bits);
}

// Operand widths, in bits, swept by every benchmark registered below.  With
// RangeMultiplier(2) the sizes are 512, 1024, ..., 32768.
constexpr unsigned lower_range = 512;
constexpr unsigned upper_range = 1 << 15;

// Each registration reports its timings in milliseconds and requests a
// complexity estimate (the N value is supplied via SetComplexityN in the
// benchmark bodies).
//
// NOTE(review): BM_gcd_old<cpp_int> and BM_gcd_current<mpz_int> are each
// registered twice.  This may be deliberate (e.g. to see run-to-run
// variation) or a copy/paste leftover - confirm before removing either.
BENCHMARK_TEMPLATE(BM_gcd_old, cpp_int)->RangeMultiplier(2)->Range(lower_range, upper_range)->Unit(benchmark::kMillisecond)->Complexity();
BENCHMARK_TEMPLATE(BM_gcd_current, cpp_int)->RangeMultiplier(2)->Range(lower_range, upper_range)->Unit(benchmark::kMillisecond)->Complexity();
BENCHMARK_TEMPLATE(BM_gcd_old, cpp_int)->RangeMultiplier(2)->Range(lower_range, upper_range)->Unit(benchmark::kMillisecond)->Complexity();
BENCHMARK_TEMPLATE(BM_gcd_current, mpz_int)->RangeMultiplier(2)->Range(lower_range, upper_range)->Unit(benchmark::kMillisecond)->Complexity();
BENCHMARK_TEMPLATE(BM_gcd_current, mpz_int)->RangeMultiplier(2)->Range(lower_range, upper_range)->Unit(benchmark::kMillisecond)->Complexity();

// Supplies the program's main(), which runs the registered benchmarks.
BENCHMARK_MAIN();