File: exercise_threading_log_agm.cpp

package info (click to toggle)
boost1.90 1.90.0-1
links: PTS, VCS
area: main
in suites:
size: 593,120 kB
sloc: cpp: 4,190,908; xml: 196,648; python: 34,618; ansic: 23,145; asm: 5,468; sh: 3,774; makefile: 1,161; perl: 1,020; sql: 728; ruby: 676; yacc: 478; java: 77; lisp: 24; csh: 6
file content (499 lines) | stat: -rw-r--r-- 16,078 bytes
parent folder | download | duplicates (3)
///////////////////////////////////////////////////////////////////////////////
//      Copyright Christopher Kormanyos 2020 - 2025.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)
//

// This example exercises Boost.Multiprecision in concurrent
// multi-threaded environments. To do so, a loop involving
// non-trivial calculations of numerous function values
// has been set up within both concurrent as well as
// sequential running environments. In particular,
// this example uses an AGM method to do a "from the ground up"
// calculation of logarithms. The logarithm function values
// are compared with the values from Boost.Multiprecision's
// specific log functions for the relevant backends.
// The log AGM here is not optimized or intended for
// high-performance work, but can be taken as an
// interesting example of an AGM iteration if helpful.

// This example has been initially motivated in part
// by discussions in:
// https://github.com/boostorg/multiprecision/pull/211

// We find the following performance data here:
// https://github.com/boostorg/multiprecision/pull/213
//
// New numbers 2025-07-20 at 301 decimal digits
// --------------------------------------------
//
// cpp_dec_float:
// result_is_ok_concurrent: true, calculation_time_concurrent: 2.1s
// result_is_ok_sequential: true, calculation_time_sequential: 14.7s
//
// cpp_bin_float:
// result_is_ok_concurrent: true, calculation_time_concurrent: 0.28s
// result_is_ok_sequential: true, calculation_time_sequential: 1.88s
//
// gmp_float:
// result_is_ok_concurrent: true, calculation_time_concurrent: 0.11s
// result_is_ok_sequential: true, calculation_time_sequential: 0.73s
//
// mpfr_float:
// result_is_ok_concurrent: true, calculation_time_concurrent: 0.05s
// result_is_ok_sequential: true, calculation_time_sequential: 0.24s

#include <array>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <thread>
#include <vector>

#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/prime.hpp>

// Numeric identifiers of the multiprecision backends this example can use.
// BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE is compared against them below.
#define BOOST_MP_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT       101
#define BOOST_MP_EXERCISE_THREADING_BACKEND_GMP_FLOAT           102
#define BOOST_MP_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT       103
#define BOOST_MP_EXERCISE_THREADING_BACKEND_MPFR_FLOAT          104

// Select the backend. The selection can be made from the outside by
// predefining BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE (e.g., on the
// compiler command line). Otherwise cpp_dec_float is used; the other
// choices are kept as commented-out alternatives.
#if !defined(BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE)
#define BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE BOOST_MP_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT
//#define BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE BOOST_MP_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT
//#define BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE BOOST_MP_EXERCISE_THREADING_BACKEND_GMP_FLOAT
//#define BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE BOOST_MP_EXERCISE_THREADING_BACKEND_MPFR_FLOAT
#endif

// Digit count passed as the template argument to whichever backend is selected.
constexpr unsigned local_mp_digits { 301U };

// Include only the header of the selected backend and define big_float_type
// as the corresponding number type with expression templates switched off.
#if  (BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MP_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT)
#include <boost/multiprecision/cpp_dec_float.hpp>

using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_dec_float<local_mp_digits>,
                                                     boost::multiprecision::et_off>;

#elif (BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MP_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT)
#include <boost/multiprecision/cpp_bin_float.hpp>

using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_bin_float<local_mp_digits>,
                                                     boost::multiprecision::et_off>;

#elif  (BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MP_EXERCISE_THREADING_BACKEND_GMP_FLOAT)
#include <boost/multiprecision/gmp.hpp>

using big_float_type = boost::multiprecision::number<boost::multiprecision::gmp_float<local_mp_digits>,
                                                     boost::multiprecision::et_off>;

#elif  (BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MP_EXERCISE_THREADING_BACKEND_MPFR_FLOAT)
#include <boost/multiprecision/mpfr.hpp>

using big_float_type = boost::multiprecision::number<boost::multiprecision::mpfr_float_backend<local_mp_digits>,
                                                     boost::multiprecision::et_off>;

#else
// Reached when BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE has a value
// that matches none of the four backend identifiers above.
#error BOOST_MP_EXERCISE_THREADING_BACKEND_TYPE is undefined.
#endif

namespace boost { namespace multiprecision { namespace exercise_threading {

namespace detail {

namespace my_concurrency {
// Call parallel_function(i) exactly once for every index i in the
// half-open range [start, end), distributing the indices over a
// group of threads. The call returns after all threads have been joined.
//
// Parameters:
//   start             - first index to process (inclusive).
//   end               - one past the last index to process (exclusive).
//                       When end is not greater than start, nothing is
//                       launched and parallel_function is never called.
//   parallel_function - callable invoked as parallel_function(i).
//                       It is called from several threads, so any state
//                       shared between calls must be synchronized by
//                       the caller.
template<typename index_type,
         typename callable_function_type>
void parallel_for(index_type             start,
                  index_type             end,
                  callable_function_type parallel_function)
{
  // Estimate the number of threads available.
  static const unsigned int number_of_threads_hint =
    std::thread::hardware_concurrency();

  // A hint of zero means that no estimate is available. Assume 4 in this case.
  static const unsigned int number_of_threads_total =
    ((number_of_threads_hint == 0U) ? 4U : number_of_threads_hint);

  // Use 7/8 of the available cores (leaving a core or two free on modern systems).
  // This is always at least 1 because number_of_threads_total is at least 1.
  static const unsigned int number_of_threads = number_of_threads_total - ((number_of_threads_total + 4U) / 8U);

  std::cout << "Executing with " << number_of_threads << " threads" << std::endl;

  // An empty range needs no threads at all.
  if(!(start < end))
  {
    return;
  }

  // The range is half-open, so it holds exactly (end - start) indices.
  const std::uintmax_t index_count = static_cast<std::uintmax_t>(end - start);

  // Never launch more threads than there are indices.
  const std::uintmax_t thread_count =
    ((index_count < static_cast<std::uintmax_t>(number_of_threads))
      ? index_count
      : static_cast<std::uintmax_t>(number_of_threads));

  // Partition with exact integer arithmetic: every thread receives
  // slice_base indices and the first slice_extra threads receive one more.
  // The slice sizes consequently differ by no more than one.
  const std::uintmax_t slice_base  = index_count / thread_count;
  const std::uintmax_t slice_extra = index_count % thread_count;

  // Inner loop.
  auto launch_range =
    [&parallel_function](index_type index_lo, index_type index_hi)
    {
      for(index_type i = index_lo; i < index_hi; ++i)
      {
        parallel_function(i);
      }
    };

  // Create the threads and launch the jobs.
  std::vector<std::thread> pool;

  pool.reserve(static_cast<std::size_t>(thread_count));

  index_type index_lo = start;

  for(std::uintmax_t t = 0U; t < thread_count; ++t)
  {
    const std::uintmax_t slice = slice_base + ((t < slice_extra) ? 1U : 0U);

    const index_type index_hi = static_cast<index_type>(index_lo + static_cast<index_type>(slice));

    pool.emplace_back(launch_range, index_lo, index_hi);

    index_lo = index_hi;
  }

  // Wait for the jobs to finish.
  for(std::thread& thread_in_pool : pool)
  {
    if(thread_in_pool.joinable())
    {
      thread_in_pool.join();
    }
  }
}
} // namespace my_concurrency

// Raise the base b to the unsigned integral power p and return (b ^ p).
// The exponents 0, 1 and 2 are handled directly. All other exponents
// are handled by repeated squaring driven by the bits of p.
template<typename FloatingPointType,
         typename UnsignedIntegralType>
FloatingPointType pown(const FloatingPointType& b, const UnsignedIntegralType& p)
{
  using value_type    = FloatingPointType;
  using exponent_type = UnsignedIntegralType;

  // Anything to the power of zero is one.
  if(p == exponent_type(0U))
  {
    return value_type(1U);
  }

  // The first power is the base itself.
  if(p == exponent_type(1U))
  {
    return b;
  }

  // The second power needs a single multiplication.
  if(p == exponent_type(2U))
  {
    value_type square(b);

    square *= b;

    return square;
  }

  // General case: walk through the bits of the exponent from the
  // least significant bit upward. The running square holds
  // b^(2^k) when bit k is examined.
  value_type product(1U);
  value_type running_square(b);

  exponent_type bits_remaining(p);

  while(bits_remaining != exponent_type(0U))
  {
    const bool low_bit_is_set = ((static_cast<unsigned>(bits_remaining) & 1U) != 0U);

    if(low_bit_is_set)
    {
      product *= running_square;
    }

    running_square *= running_square;

    bits_remaining >>= 1U;
  }

  return product;
}

// Return a reference to a table holding the values of
// boost::math::prime(i) for the indices i = 0 ... 9,999.
//
// The table is built once, in the initializer of a function-local
// static object. The language guarantees that such an initialization
// is carried out exactly once, even if the first calls of this function
// arrive from several threads at the same time. Afterward the table
// is never modified, so it can be read concurrently.
const std::vector<std::uint32_t>& primes()
{
  static const std::vector<std::uint32_t> my_primes =
    []() -> std::vector<std::uint32_t>
    {
      // Get exactly 10,000 primes.
      std::vector<std::uint32_t> prime_table(10000U);

      for(std::size_t i = 0U; i < prime_table.size(); ++i)
      {
        prime_table[i] = boost::math::prime(static_cast<unsigned int>(i));
      }

      return prime_table;
    }();

  return my_primes;
}

} // namespace detail

// Compute the logarithm of x with an arithmetic-geometric mean (AGM)
// iteration. The result is formed with the constants pi and ln_two
// from boost::math::constants.
//
// Template parameter:
//   FloatingPointType - the floating-point type of the argument, of all
//                       intermediate values and of the result.
//
// Parameter:
//   x - the argument of the logarithm. This function contains no
//       special handling for arguments that are zero or negative.
//
// Returns:
//   pi / (2 * AGM(1, 4 / (xx * 2^m))) - (m * ln2), where xx is x itself
//   for x >= 1 and (1 / x) for x < 1. In the latter case the result is
//   negated before it is returned.
template<typename FloatingPointType>
FloatingPointType log(const FloatingPointType& x)
{
  // Use an AGM method to compute the logarithm of x.

  // For values less than 1 invert the argument and
  // remember (in this case) to negate the result below.
  const bool b_negate = (x < 1);

  const FloatingPointType xx = ((b_negate == false) ? x : 1 / x);

  // Set a0 = 1
  // Set b0 = 4 / (xx * 2^m) = 1 / (xx * 2^(m - 2))

  FloatingPointType ak(1U);

  // Select the power m from the decimal digit count of the type and from
  // the base-2 logarithm of the argument, both estimated with built-in float.
  // NOTE(review): the factor 1.67 is presumably about half of the number
  // of bits per decimal digit (log2(10) / 2 is roughly 1.66) - confirm.
  // NOTE(review): the conversion of xx to float assumes that xx lies
  // within the range of float - confirm for very large arguments.
  const float n_times_factor = static_cast<float>(static_cast<float>(std::numeric_limits<FloatingPointType>::digits10) * 1.67F);
  const float lgx_over_lg2   = std::log(static_cast<float>(xx)) / std::log(2.0F);

  std::int32_t m = static_cast<std::int32_t>(n_times_factor - lgx_over_lg2);

  // Ensure that the resulting power is non-negative.
  // Also enforce that m >= 8.
  m = (std::max)(m, static_cast<std::int32_t>(8));

  // Form b0 in three steps: 2^m, then (xx * 2^m), then 4 / (xx * 2^m).
  FloatingPointType bk = detail::pown(FloatingPointType(2), static_cast<std::uint32_t>(m));

  bk *= xx;
  bk  = 4 / bk;

  // Holds the previous value of ak, which is needed for the geometric mean.
  FloatingPointType ak_tmp(0U);

  using std::sqrt;

  // Set the relative tolerance for the convergence check to the square root
  // of epsilon divided by 100. This corresponds to ak and bk agreeing
  // to somewhat more than half of the digits of the type.
  const FloatingPointType target_tolerance = sqrt(std::numeric_limits<FloatingPointType>::epsilon()) / 100;

  // Perform no more than 64 AGM iterations.
  for(std::int32_t k = static_cast<std::int32_t>(0); k < static_cast<std::int32_t>(64); ++k)
  {
    using std::fabs;

    // Check for the number of significant digits to be
    // at least half of the requested digits. If at least
    // half of the requested digits have been achieved,
    // then break after the upcoming iteration.
    // The check is made with the values from before this iteration's
    // update and it is never satisfied for the first five iterations (k <= 4).
    const bool break_after_this_iteration = (   (k > static_cast<std::int32_t>(4))
                                             && (fabs(1 - fabs(ak / bk)) < target_tolerance));

    // Arithmetic mean: ak = (ak + bk) / 2. Keep the old ak in ak_tmp.
    ak_tmp  = ak;
    ak     += bk;
    ak     /= 2;

    if(break_after_this_iteration)
    {
      // The final geometric mean is skipped since only ak is used below.
      break;
    }

    // Geometric mean: bk = sqrt(ak_old * bk).
    bk *= ak_tmp;
    bk  = sqrt(bk);
  }

  // We are now finished with the AGM iteration for log(x).

  // Compute log(xx) = {pi / [2 * AGM(1, 4 / (xx * 2^m))]} - (m * ln2)
  // Note at this time that ak holds the value of AGM(...),
  // and that ak and bk agree to within the working precision.

  // Retrieve the value of pi, divide by (2 * a) and subtract (m * ln2).
  const FloatingPointType result =
       boost::math::constants::pi<FloatingPointType>() / (ak * 2)
    - (boost::math::constants::ln_two<FloatingPointType>() * m);

  // Undo the inversion of the argument: log(x) = -log(1 / x).
  return ((b_negate == true) ? -result : result);
}

} } } // namespace boost::multiprecision::exercise_threading

template<typename FloatingPointType>
bool log_agm_concurrent(float& calculation_time)
{
  const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size();

  std::vector<FloatingPointType> log_results(count);
  std::vector<FloatingPointType> log_control(count);

  std::atomic_flag log_agm_lock = ATOMIC_FLAG_INIT;

  std::size_t concurrent_log_agm_count = 0U;

  const auto start = std::chrono::high_resolution_clock::now();

  boost::multiprecision::exercise_threading::detail::my_concurrency::parallel_for
  (
    std::size_t(0U),
    log_results.size(),
    [&log_results, &log_control, &concurrent_log_agm_count, &log_agm_lock](std::size_t i)
    {
      while(log_agm_lock.test_and_set()) { ; }
      const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i]));
      log_agm_lock.clear();

      const FloatingPointType  x = boost::math::constants::catalan<FloatingPointType>() + dx;

      const FloatingPointType lr = boost::multiprecision::exercise_threading::log(x);
      const FloatingPointType lc = boost::multiprecision::log(x);

      while(log_agm_lock.test_and_set()) { ; }

      log_results[i] = lr;
      log_control[i] = lc;

      ++concurrent_log_agm_count;

      if((concurrent_log_agm_count % 100U) == 0U)
      {
        std::cout << "log agm concurrent at index "
                  << concurrent_log_agm_count
                  << " of "
                  << log_results.size()
                  << ". Total processed so far: "
                  << std::fixed
                  << std::setprecision(2)
                  << (100.0F * float(concurrent_log_agm_count)) / float(log_results.size())
                  << "%."
                  << "\r";
      }

      log_agm_lock.clear();
    }
  );

  const auto stop = std::chrono::high_resolution_clock::now();

  calculation_time = static_cast<float>(std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()) / 1000.0F;

  std::cout << std::endl;

  std::cout << "Checking results concurrent: ";

  bool result_is_ok = true;

  const FloatingPointType tol = std::numeric_limits<FloatingPointType>::epsilon() * 1000000U;

  for(std::size_t i = 0U; i < log_results.size(); ++i)
  {
    using std::fabs;

    const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i]));

    result_is_ok &= (close_fraction < tol);
  }

  std::stringstream strm { };
  strm << std::boolalpha << result_is_ok;

  std::cout << strm.str() << std::endl;

  return result_is_ok;
}

template<typename FloatingPointType>
bool log_agm_sequential(float& calculation_time)
{
  const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size();

  std::vector<FloatingPointType> log_results(count);
  std::vector<FloatingPointType> log_control(count);

  const auto start = std::chrono::high_resolution_clock::now();

  for(std::size_t i = 0U; i < log_results.size(); ++i)
  {
    const std::size_t sequential_log_agm_count = i + 1U;

    const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i]));
    const FloatingPointType  x = boost::math::constants::catalan<FloatingPointType>() + dx;

    log_results[i] = boost::multiprecision::exercise_threading::log(x);
    log_control[i] = boost::multiprecision::log(x);

    if((sequential_log_agm_count % 100U) == 0U)
    {
      std::cout << "log agm sequential at index "
                << sequential_log_agm_count
                << " of "
                << log_results.size()
                << ". Total processed so far: "
                << std::fixed
                << std::setprecision(2)
                << (100.0F * float(sequential_log_agm_count)) / float(log_results.size())
                << "%."
                << "\r";
    }
  }

  const auto stop = std::chrono::high_resolution_clock::now();

  calculation_time = static_cast<float>(std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()) / 1000.0F;

  std::cout << std::endl;

  std::cout << "Checking results sequential: ";

  bool result_is_ok = true;

  const FloatingPointType tol = std::numeric_limits<FloatingPointType>::epsilon() * 1000000U;

  for(std::size_t i = 0U; i < log_results.size(); ++i)
  {
    using std::fabs;

    const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i]));

    result_is_ok &= (close_fraction < tol);
  }

  std::stringstream strm { };
  strm << std::boolalpha << result_is_ok;

  std::cout << strm.str() << std::endl;

  return result_is_ok;
}

// Run the logarithm calculations concurrently and then sequentially
// with big_float_type and print a summary line for each of the two runs.
//
// Returns 0 if the results of both runs have been verified successfully
// and 1 otherwise, so that a failed verification is also visible
// in the exit status of the program.
int main()
{
  std::cout << "Calculating "
            << boost::multiprecision::exercise_threading::detail::primes().size()
            << " primes"
            << std::endl;

  float calculation_time_concurrent { };
  float calculation_time_sequential { };

  const bool result_is_ok_concurrent = log_agm_concurrent<big_float_type>(calculation_time_concurrent);
  const bool result_is_ok_sequential = log_agm_sequential<big_float_type>(calculation_time_sequential);

  std::cout << std::endl;

  // Print the summary line of one run. The line is assembled in a
  // string stream so that the formatting flags of std::cout stay untouched.
  const auto print_summary =
    [](const char* run_name, const bool result_is_ok, const float calculation_time)
    {
      std::stringstream strm { };

      strm << "result_is_ok_"
           << run_name
           << ": "
           << std::boolalpha
           << result_is_ok
           << ", calculation_time_"
           << run_name
           << ": "
           << std::fixed
           << std::setprecision(2)
           << calculation_time
           << "s";

      std::cout << strm.str() << std::endl;
    };

  print_summary("concurrent", result_is_ok_concurrent, calculation_time_concurrent);
  print_summary("sequential", result_is_ok_sequential, calculation_time_sequential);

  // Report a failed verification through the exit status.
  return ((result_is_ok_concurrent && result_is_ok_sequential) ? 0 : 1);
}